## Observations and Insights 

Although the analysis below ultimately focuses on Capomulin, Ramicane may be slightly better, based on its overall tumor-size results and quartile data; it also has more data points.

For Capomulin, the line plot shows that, at least for some mice, the final tumor size is not necessarily the smallest one recorded. This suggests that a longer trial is needed to confirm that the treatment keeps the tumor on an overall downward trend.

This study showed that younger/smaller mice had smaller tumors, which leads me to think there may be an age bias in the data: the summary statistics for tumor size will be weighted towards the older/larger mice. I would suggest using only mice of the same age/size in any given study so that the results are not confounded.


In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Relative locations of the two CSV inputs
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame (metadata first, then study results)
mouse_metadata, study_results = map(
    pd.read_csv, (mouse_metadata_path, study_results_path)
)

In [None]:
# Preview the first rows of the mouse metadata table
mouse_metadata.head()

In [None]:
# Preview the first rows of the study results table
study_results.head()

In [None]:
# Join the per-mouse metadata with the per-timepoint results on the shared
# "Mouse ID" key; the outer join keeps mice that appear in only one table.
combined_mouse_data_df = mouse_metadata.merge(
    study_results,
    how="outer",
    on="Mouse ID",
)

# Preview the combined table
combined_mouse_data_df.head()

In [None]:
# Number of distinct mouse IDs in the combined data
# (dropna=False keeps the count identical to len(unique()))
combined_mouse_data_df['Mouse ID'].nunique(dropna=False)

In [None]:
# Count the rows for every (Mouse ID, Timepoint) pair; any pair with a count
# above 1 marks a mouse that has duplicated measurements.
combined_mouse_data_df.value_counts(subset=["Mouse ID", "Timepoint"])

Here I can see that mouse g989 has a count of 2 for individual Mouse ID/Timepoint pairs, so I know that its data contains duplicates.

In [None]:
# Build the clean DataFrame by excluding every row of the duplicated mouse,
# then re-run the (Mouse ID, Timepoint) count to confirm nothing repeats.
is_duplicate_mouse = combined_mouse_data_df['Mouse ID'] == 'g989'
cleaned_merged_data_df = combined_mouse_data_df.loc[~is_duplicate_mouse]
cleaned_merged_data_df.value_counts(subset=["Mouse ID", "Timepoint"])

In [None]:
# Preview the first rows of the cleaned table
cleaned_merged_data_df.head()

In [None]:
# Re-count the distinct mouse IDs to confirm the duplicated mouse was removed
# (dropna=False keeps the count identical to len(unique()))
cleaned_merged_data_df['Mouse ID'].nunique(dropna=False)

In [None]:
# Optional: collect every row recorded for the duplicated mouse ID.
belongs_to_g989 = combined_mouse_data_df['Mouse ID'].eq('g989')
dupe_mouse_data_df = combined_mouse_data_df[belongs_to_g989]
dupe_mouse_data_df

## Summary Statistics

In [None]:
# Keep only the two columns the summary statistics need
summary_columns = ['Drug Regimen', 'Tumor Volume (mm3)']
tumor_volume_bydrug_df = cleaned_merged_data_df.loc[:, summary_columns]

In [None]:
# Group the tumor volumes by drug regimen. The mean, median, variance, standard
# deviation and SEM summaries are all computed from this one grouping.
# as_index=False keeps 'Drug Regimen' as an ordinary column in each result.
drug_performance_gdf = tumor_volume_bydrug_df.groupby(
    by='Drug Regimen',
    as_index=False,
)

In [None]:
# Per-regimen mean, median, standard deviation, variance and SEM of tumor volume.
def _tag_volume_column(stat_df, stat_name):
    """Rename the tumor-volume column of *stat_df* so it ends with *stat_name*."""
    return stat_df.rename(
        columns={'Tumor Volume (mm3)': 'Tumor Volume (mm3) ' + stat_name}
    )


# Each statistic gets its own uniquely named value column.
mean_drug_performance_gdf = _tag_volume_column(drug_performance_gdf.mean(), 'mean')
median_drug_performance_gdf = _tag_volume_column(drug_performance_gdf.median(), 'median')
std_drug_performance_gdf = _tag_volume_column(drug_performance_gdf.std(), 'std')
var_drug_performance_gdf = _tag_volume_column(drug_performance_gdf.var(), 'var')
sem_drug_performance_gdf = _tag_volume_column(drug_performance_gdf.sem(), 'sem')

In [None]:
# Join the five per-statistic tables on 'Drug Regimen' into one summary table,
# then use the regimen name as the row index.
merged_drug_summaries_gdf = (
    mean_drug_performance_gdf
    .merge(median_drug_performance_gdf, how='outer', on='Drug Regimen')
    .merge(std_drug_performance_gdf, how='outer', on='Drug Regimen')
    .merge(var_drug_performance_gdf, how='outer', on='Drug Regimen')
    .merge(sem_drug_performance_gdf, how='outer', on='Drug Regimen')
    .set_index('Drug Regimen')
)
merged_drug_summaries_gdf

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Using the aggregation method, produce the same summary statistics in a single line.
# The statistics are requested by name instead of as numpy/scipy callables:
# recent pandas versions warn when numpy functions such as np.mean or np.std are
# passed to agg(), and the names select pandas' own group-wise implementations
# (sample std/var/sem), so the resulting column labels stay the same.
aggs = cleaned_merged_data_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].agg(
    ['mean', 'median', 'std', 'var', 'sem']
)
aggs

## Bar and Pie Charts

In [None]:
# Bar-plot preparation: total number of timepoints recorded for each drug regimen.
# Work on a narrower frame that holds only the two columns of interest.
timepoint_columns = ['Drug Regimen', 'Timepoint']
drug_vs_timepoints_df = cleaned_merged_data_df.loc[:, timepoint_columns]

In [None]:
# Number of recorded rows (timepoints) per drug regimen
regimen_column = drug_vs_timepoints_df['Drug Regimen']
total_timepoints = regimen_column.value_counts()

In [None]:
# Bar chart (pandas plotting) of the number of recorded timepoints per drug regimen.
figure1 = total_timepoints.plot(
    kind="bar",
    facecolor="red",
    figsize=(8, 6),
    title="Total timepoints for all mice tested for each drug regimen",
    xlabel="Drug Regimen",
    ylabel="Total timepoints",
)

# Take the tick labels from the plotted Series itself. value_counts() orders the
# regimens by descending count, whereas unique() returns them in order of first
# appearance, so labelling the bars from unique() can put names on the wrong bars.
treatments = total_timepoints.index.to_numpy()

# Configure x-tick axis
figure1.set_xticklabels(treatments, rotation=45, rotation_mode="anchor", ha="right")

# Show plot
plt.show()

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
# One bar per entry of total_timepoints; with align="edge" and the default bar
# width of 0.8, an offset of 0.4 puts each tick under the centre of its bar.
x_axis = np.arange(len(total_timepoints))
tick_locations = [value+0.4 for value in x_axis]

plt.figure(figsize=(8,6))
plt.bar(x_axis, total_timepoints, color='r', alpha=0.5, align="edge")
# Label the bars from total_timepoints.index so every name matches the height drawn
# above it: value_counts() sorts by descending count, which is not the
# first-appearance order that unique() returns.
plt.xticks(tick_locations, total_timepoints.index, rotation=45)
plt.xlabel('Drug Regimen')
plt.ylabel('Total Number of Timepoints')
plt.title("Total timepoints for all mice tested for each drug regimen")
plt.tight_layout()
plt.show()

In [None]:
# Pie-plot preparation: distribution of female versus male mice (pandas version).
# Note: these counts are the number of measurements recorded per sex,
# not the number of individual mice of each sex.
gender_dist = cleaned_merged_data_df.loc[:, 'Sex']
gender_dist_count = gender_dist.value_counts()
gender_dist_count

In [None]:
# Pie chart (pandas plotting) of the measurement counts per sex.
# The former y='gender_dist_count' argument was removed: `y` selects a column and
# is only used when plotting a DataFrame, and gender_dist_count is a Series, so
# the argument had no effect.
figure2 = gender_dist_count.plot(
    kind="pie",
    title="Gender Distribution of Mice",
    autopct="%1.1f%%",
)
plt.axis("equal")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Read the wedge labels from the counts' own index so each label always matches its
# wedge; a hard-coded ['Male', 'Female'] list is only right when 'Male' happens to
# have the larger count, because value_counts() sorts by descending count.
gender = list(gender_dist_count.index)
# Pull out the first (largest) wedge only; sized to the number of wedges so the
# call does not fail if the data holds a different number of categories.
explode = tuple(0.1 if position == 0 else 0 for position in range(len(gender)))
plt.pie(gender_dist_count, explode=explode, labels=gender, autopct="%1.1f%%", shadow=True)
plt.axis("equal")
plt.ylabel("Sex")
plt.title("Gender Distribution of Mice")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse.
# max() returns the greatest timepoint regardless of row order, whereas last()
# returns whichever row happens to come last within each group and is therefore
# only correct when the rows are already sorted by timepoint.
last_time_cleaned_merged_data_gdf = cleaned_merged_data_df.groupby('Mouse ID')["Timepoint"].max()
last_time_cleaned_merged_data_gdf

In [None]:
# Look up the full measurement row for each mouse's final timepoint. The right
# join on (Mouse ID, Timepoint) keeps one entry per pair in the last-timepoint Series.
final_tumor_data_gdf = cleaned_merged_data_df.merge(
    last_time_cleaned_merged_data_gdf,
    how="right",
    on=["Mouse ID", "Timepoint"],
)
final_tumor_data_gdf

In [None]:
# The four regimens of interest; reused for filtering, looping and plot labels
candidate_drugs = [
    'Capomulin',
    'Ceftamin',
    'Infubinol',
    'Ramicane',
]

In [None]:
# Get the df down to just the candidates.
# isin() tests membership against the whole candidate list, so the filter keeps
# working if candidate_drugs gains or loses entries (the previous version indexed
# exactly four positions by hand).
is_candidate = final_tumor_data_gdf['Drug Regimen'].isin(candidate_drugs)
candidate_treatments_results_gdf = final_tumor_data_gdf.loc[is_candidate]

# Order by regimen, then mouse, and renumber the rows from zero
candidate_treatments_results_gdf = candidate_treatments_results_gdf.sort_values(["Drug Regimen", "Mouse ID"]).reset_index(drop=True)
candidate_treatments_results_gdf

The filtered table is now down to only 100 rows (the final-timepoint rows for mice on the four candidate regimens), which indicates that the filtering worked.

In [None]:
# Start with an empty container; the per-drug tumor volume data
# collected for plotting is added to it afterwards.
tumor_volumes = list()

In [None]:
# Locate the rows which contain mice on each drug and get the tumor volumes.
# The list is rebuilt from scratch (one Series per candidate drug, in
# candidate_drugs order) instead of being appended to, so re-running this cell
# cannot leave duplicated entries in tumor_volumes.
tumor_volumes = [
    candidate_treatments_results_gdf.loc[
        candidate_treatments_results_gdf['Drug Regimen'] == drug,
        'Tumor Volume (mm3)',
    ]
    for drug in candidate_drugs
]


In [None]:
# Calculate the IQR and quantitatively determine if there are any potential outliers.
# For each candidate drug: print the quartiles, IQR and median of the final tumor
# volumes, then report every value outside the 1.5 * IQR fences.
for drug, volumes in zip(candidate_drugs, tumor_volumes):
    quartiles = volumes.quantile([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq-lowerq

    print(f"The lower quartile of {drug} is: {lowerq}")
    print(f"The upper quartile of {drug} is: {upperq}")
    print(f"The interquartile range of {drug} is: {iqr}")
    print(f"The median of {drug} is: {quartiles[0.5]} ")

    # Determine outliers using upper and lower bounds
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)

    # Check both fences for every value. The earlier if/elif looked only at the max
    # and min, and skipped the low-side check whenever the max was already flagged.
    outliers = volumes[(volumes < lower_bound) | (volumes > upper_bound)]
    if outliers.empty:
        print(f"The treatment {drug} does not have any outliers.")
    else:
        for value in outliers.sort_values():
            print(f"The value of {value} in {drug} could be an outlier.")

    print("")

In [None]:
# Box plot comparing the final tumor volume of each mouse across the four
# regimens of interest, one box per candidate drug.
fig1, ax1 = plt.subplots()
ax1.boxplot(tumor_volumes, labels=candidate_drugs)
ax1.set_title("Tumor Volumes at the End of Each Treatment")
ax1.set_xlabel("Candidate Treatments")
ax1.set_ylabel("Tumor Size at End of Treatment")
plt.show()

In the figure above as well as in the calculated values, we can see that Infubinol has a potential outlier.

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
# First reduce the data to one mouse (b742) from the Capomulin results
is_target_mouse = cleaned_merged_data_df['Mouse ID'] == 'b742'
is_capomulin = cleaned_merged_data_df['Drug Regimen'] == candidate_drugs[0]
single_mouse_in_capo_study = cleaned_merged_data_df.loc[is_target_mouse & is_capomulin]

# Select the plotted columns by name rather than by position (previously
# iloc[:, 5] and iloc[:, 6]), so the plot does not silently change if the column
# order of the merged table changes.
x_axis = single_mouse_in_capo_study['Timepoint']
tumor_vol = single_mouse_in_capo_study['Tumor Volume (mm3)']

plt.plot(x_axis, tumor_vol, marker='o', color='blue', label='b742')
plt.title("Mouse b742 Under Treatment by Capomulin")
plt.xlabel('Time (days)')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Scatter-plot preparation (average tumor volume vs. mouse weight):
# keep only the rows recorded under the Capomulin regimen.
on_capomulin = cleaned_merged_data_df['Drug Regimen'].eq(candidate_drugs[0])
capomulin_study = cleaned_merged_data_df[on_capomulin]

In [None]:
# Per-mouse averages of weight and tumor volume within the Capomulin data
capo_by_mouse = capomulin_study.groupby('Mouse ID')
capo_weight = capo_by_mouse['Weight (g)'].mean()
capo_tumor = capo_by_mouse['Tumor Volume (mm3)'].mean()

In [None]:
# Scatter plot of each Capomulin mouse's average tumor volume against its weight
plt.scatter(capo_weight, capo_tumor)
plt.xticks(capo_weight)
# capo_tumor holds per-mouse means, so the chart is labelled "Average", not "Total"
plt.title("Average Tumor Volume vs. Mouse Weight in Treatment Capomulin")
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()

## Correlation and Regression

In [None]:
# Fit a least-squares line of average tumor volume against mouse weight for the
# Capomulin regimen; rvalue is the correlation coefficient of the fit.
regression = st.linregress(capo_weight, capo_tumor)
slope, intercept, rvalue, pvalue, stderr = regression

# Predicted tumor volume at each observed weight, and the equation as display text
regress_values = slope * capo_weight + intercept
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

In [None]:
# Scatter of per-mouse average tumor volume vs. weight with the fitted line on top
plt.scatter(capo_weight, capo_tumor)
plt.plot(capo_weight,regress_values,"r-")
plt.annotate(line_eq,(19,25),fontsize=15,color="red")
plt.xticks(capo_weight)
# capo_tumor holds per-mouse means, so the chart is labelled "Average", not "Total"
plt.title("Average Tumor Volume vs. Mouse Weight in Treatment Capomulin")
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
# Report the correlation coefficient itself as well as its square
print(f"The correlation coefficient is: {rvalue}")
print(f"The r-squared is: {rvalue**2}")
plt.show()