In [None]:
#Observations
#1. For Capomulin treated mice, higher weight had a positive correlation with higher tumor volume
#2. For Capomulin treated mice, there was a strong correlation between time passed and lower tumor volume
#3. Placebo treatment resulted in very similar tumor volume to several other treatments.

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

# Study data files (relative paths)
mouse_metadata_path = "../data/Mouse_metadata.csv"
study_results_path = "../data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# The two tables are joined on the single key 'Mouse ID'; how='left' keeps every
# row of mouse_metadata and attaches the matching study_results rows to it.
dataset = pd.merge(mouse_metadata, study_results, how='left', on='Mouse ID')

# Display the data table for preview
dataset.head()

In [None]:
# Report how many rows the merged dataset has and how many distinct Mouse IDs
# appear in it.
mice_count = dataset.shape[0]
unique_mice = dataset['Mouse ID'].nunique()
print(f'{mice_count} total records with {unique_mice} unique Mouse IDs')

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair has already appeared earlier in
# the dataset, then keep only the flagged rows for inspection.
is_repeated_pair = dataset.duplicated(subset=['Mouse ID', 'Timepoint'])
dataset_dupes = dataset.loc[is_repeated_pair]
dataset_dupes

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse with more than one record for the same timepoint has conflicting
# measurements, so every record of that mouse is removed rather than keeping
# an arbitrary first row.
# keep=False marks *all* rows of a repeated (Mouse ID, Timepoint) pair.
conflicting_rows = dataset.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
duplicate_mouse_ids = dataset.loc[conflicting_rows, 'Mouse ID'].unique()
clean_data = dataset.loc[~dataset['Mouse ID'].isin(duplicate_mouse_ids)]
clean_data.head()

In [None]:
# Re-check the record count and the distinct Mouse ID count on the cleaned data.
clean_mouse_ids = clean_data['Mouse ID']
print(f"{len(clean_mouse_ids)} total records with {clean_mouse_ids.nunique()} unique Mouse IDs")

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.
# One groupby feeds all five aggregations; the lowercase aggregation names are
# then renamed to the display labels. The 'std' column is the standard deviation
# (the standard error of the mean is the separate 'SEM' column).
summary_statistics = (
    clean_data.groupby('Drug Regimen')['Tumor Volume (mm3)']
    .agg(['mean', 'median', 'var', 'std', 'sem'])
    .rename(columns={
        'mean': 'Mean',
        'median': 'Median',
        'var': 'Variance',
        'std': 'Standard Deviation',
        'sem': 'SEM',
    })
)
summary_statistics

## Bar and Pie Charts

In [51]:
# Final tumor volume of each mouse for four treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
treatment_options = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
in_selected_regimens = clean_data['Drug Regimen'].isin(treatment_options)
tumor_data = clean_data[in_selected_regimens].reset_index()

# Greatest timepoint recorded for each mouse in the selected regimens
last_timepoint = tumor_data.groupby('Mouse ID')['Timepoint'].max().reset_index()

# Left-join the (Mouse ID, last Timepoint) pairs back onto the cleaned data so each
# mouse keeps only the row holding its tumor volume at that last timepoint
merge_mouse = last_timepoint.merge(clean_data, how='left', on=['Mouse ID', 'Timepoint'])
tumor_by_treatment = merge_mouse[['Mouse ID', 'Tumor Volume (mm3)']]
merge_mouse.head(10)


Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
0,a203,45,Infubinol,Female,20,23,67.973419,2
1,a251,45,Infubinol,Female,21,25,65.525743,1
2,a275,45,Ceftamin,Female,20,28,62.999356,3
3,a411,45,Ramicane,Male,3,22,38.407618,1
4,a444,45,Ramicane,Female,10,25,43.047543,0
5,a520,45,Ramicane,Male,13,21,38.810366,1
6,a577,30,Infubinol,Female,6,25,57.031862,2
7,a644,45,Ramicane,Female,7,17,32.978522,1
8,a685,45,Infubinol,Male,8,30,66.083066,3
9,b128,45,Capomulin,Female,9,22,38.982878,2


In [None]:
# Bar chart (drawn with pyplot) of the number of 'Mouse ID' entries recorded for
# each drug regimen over the course of the study.
count_by_treatment = clean_data.groupby('Drug Regimen')['Mouse ID'].count().reset_index()
treatments = count_by_treatment['Drug Regimen']
treatment_counts = count_by_treatment['Mouse ID']

treatment_summary = plt.bar(treatments, treatment_counts, align='center', alpha=0.5, color='r')
plt.xticks(rotation='vertical')
plt.title('Counts by Treatment')
plt.xlabel('Drug Regimen')
plt.ylabel('Count')
# The y-axis is zoomed to 10 below the smallest count and 10 above the largest.
plt.ylim(treatment_counts.min() - 10, treatment_counts.max() + 10)

In [None]:
# Pie chart (drawn with pyplot) of the share of distinct mice by sex.
genders = clean_data.groupby('Sex')['Mouse ID'].nunique().reset_index()
colors = ['pink', 'blue']
plt.pie(
    genders['Mouse ID'],
    labels=genders['Sex'],
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)

## Quartiles, Outliers and Boxplots

In [None]:
# Collect the final tumor volumes per regimen as plain lists.
# Boolean selection on the 'Drug Regimen' column replaces a row-by-row loop, so the
# result does not depend on merge_mouse having a 0..n-1 index.
regimens_of_interest = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
final_volumes = {
    regimen: merge_mouse.loc[
        merge_mouse['Drug Regimen'] == regimen, 'Tumor Volume (mm3)'
    ].tolist()
    for regimen in regimens_of_interest
}

capomulin = final_volumes['Capomulin']
ramicane = final_volumes['Ramicane']
infubinol = final_volumes['Infubinol']
ceftamin = final_volumes['Ceftamin']

# Every final volume belonging to one of the four regimens, in merge_mouse row order
all_drugs = merge_mouse.loc[
    merge_mouse['Drug Regimen'].isin(regimens_of_interest), 'Tumor Volume (mm3)'
].tolist()

# Order matters: downstream cells index this list positionally
list_all = [capomulin, ramicane, infubinol, ceftamin]

In [None]:
# Calculate the IQR and quantitatively determine if there are any potential outliers.
# list_all holds the final tumor volumes in the same order as treatment_options
# (Capomulin, Ramicane, Infubinol, Ceftamin), so the two are walked together.
# A value is flagged when it lies more than 1.5 * IQR outside the quartiles.
for regimen, volumes in zip(treatment_options, list_all):
    # Explicit dtype keeps an empty list from producing an object-typed Series
    tumor_vol_data = pd.Series(volumes, dtype='float64')
    quartiles = tumor_vol_data.quantile([0.25, 0.5, 0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    lower_bound = lowerq - 1.5 * iqr
    upper_bound = upperq + 1.5 * iqr
    outliers = tumor_vol_data[(tumor_vol_data < lower_bound) | (tumor_vol_data > upper_bound)]
    print(
        f"{regimen}: IQR = {iqr:.2f}, "
        f"bounds = [{lower_bound:.2f}, {upper_bound:.2f}], "
        f"potential outliers = {outliers.tolist()}"
    )

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest.
# Boxes are drawn in list_all order: Capomulin, Ramicane, Infubinol, Ceftamin.
fig1, ax1 = plt.subplots()
ax1.boxplot(list_all)
# Replace the default 1..4 tick labels with the regimen names
ax1.set_xticklabels(['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin'])
ax1.set_title('Final Tumor Volume by Drug Regimen')
ax1.set_xlabel('Drug Regimen')
ax1.set_ylabel('Final Tumor Volume (mm3)')
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus mean tumor volume across all mice
# treated with Capomulin.
# The cleaned data is used so duplicate records cannot skew the per-timepoint means.
# NOTE: `capomulin` is bound to a DataFrame here and is reused by the weight scatter cell.
capomulin = clean_data[clean_data['Drug Regimen'] == 'Capomulin']

# Group by the column name and reset the index so 'Timepoint' is an ordinary
# column rather than being both the index and a column.
capomulin_time_df = (
    capomulin.groupby('Timepoint')['Tumor Volume (mm3)'].mean().reset_index()
)
capomulin_time = capomulin_time_df['Timepoint']
capomulin_tumor = capomulin_time_df['Tumor Volume (mm3)']

plt.plot(capomulin_time, capomulin_tumor)
plt.title('Capomulin Tumor Volume Over Time')
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Scatter plot of weight versus mean tumor volume for the Capomulin regimen.
# NOTE(review): the mean is taken over all records sharing a weight value, not per
# mouse - confirm this is the intended unit of analysis.
weight_and_volume = capomulin[['Weight (g)', 'Tumor Volume (mm3)']]
capomulin_weight_df = weight_and_volume.groupby(capomulin['Weight (g)']).mean()
capomulin_tumor = capomulin_weight_df['Tumor Volume (mm3)']
capomulin_weight = capomulin_weight_df['Weight (g)']

plt.scatter(capomulin_weight, capomulin_tumor)
plt.xlabel('Weight (g)')
plt.title('Capomulin Tumor Volume by Weight')
plt.ylabel('Tumor Volume (mm3)')

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen

# linregress returns the fitted slope/intercept together with the Pearson
# correlation coefficient (rvalue) of the two inputs.
slope, intercept, rvalue, pvalue, stderr = st.linregress(capomulin_weight, capomulin_tumor)
print(f"Correlation coefficient (r) between weight and average tumor volume: {rvalue:.2f}")

# Fitted tumor volume at each observed weight, used to draw the regression line
regress_values = capomulin_weight * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(capomulin_weight, capomulin_tumor)
plt.plot(capomulin_weight, regress_values, "r-")
# The equation is placed at data coordinates (weight=20, volume=40)
plt.annotate(line_eq, (20, 40), fontsize=15, color="red")
plt.title('Capomulin Tumor Volume by Weight')
plt.xlabel('Weight (g)')
plt.ylabel('Tumor Volume (mm3)')
plt.show()