## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files (relative to the notebook).
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata and the per-measurement study results.
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the metadata onto the results so every measurement row is kept
# and carries the attributes of its mouse.
complete_data = study_results.merge(mouse_metadata, how='left', on='Mouse ID')

# Preview the merged table.
complete_data

In [None]:
# Count the distinct mouse IDs in the merged dataset.
# dropna=False keeps the count equal to len(unique()), which also counts NaN.
number_of_mice = complete_data['Mouse ID'].nunique(dropna=False)
number_of_mice

In [None]:
# Find the mouse whose (Mouse ID, Timepoint) pair is recorded more than once.
# A mouse should contribute at most one measurement per timepoint, so a repeated
# pair marks a duplicated record. Checking the pair directly (instead of
# comparing row counts with the number of timepoints) also catches duplicates
# for mice with short histories and does not depend on how value_counts()
# names its columns.
duplicate_mice_row = complete_data.loc[
    complete_data.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
]
if duplicate_mice_row.empty:
    raise ValueError("No duplicated (Mouse ID, Timepoint) rows were found.")

# Later cells compare 'Mouse ID' against a single value, so keep one ID:
# the first mouse (in row order) that has a duplicated record.
duplicate_mice_ID = duplicate_mice_row['Mouse ID'].iloc[0]
duplicate_mice_ID

In [None]:
# Optional: collect every record that belongs to the duplicated mouse ID.
all_data_for_duplicate_mice = complete_data[complete_data['Mouse ID'].eq(duplicate_mice_ID)]
all_data_for_duplicate_mice

In [None]:
# Build the clean DataFrame: keep only rows whose Mouse ID differs from the
# duplicated mouse, i.e. drop that mouse entirely.
complete_data_clean = complete_data[complete_data['Mouse ID'].ne(duplicate_mice_ID)]
complete_data_clean

In [None]:
# Count the distinct mouse IDs remaining in the clean DataFrame.
# dropna=False keeps the count equal to len(unique()), which also counts NaN.
complete_data_clean["Mouse ID"].nunique(dropna=False)

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, standard deviation,
# variance, and SEM of the tumor volume for each drug regimen at each timepoint.
#
# Every statistic is computed from the cleaned data so the duplicated mouse is
# excluded consistently (the mean must not be taken from the uncleaned table).
summary_keys = ['Drug Regimen', 'Timepoint']
tumor_volume_by_group = complete_data_clean.groupby(summary_keys)['Tumor Volume (mm3)']

# One tidy frame per statistic: the group keys plus a single named value column.
tumor_response_mean_df = tumor_volume_by_group.mean().reset_index(name="Mean Tumor Volume (mm3)")
tumor_response_median_df = tumor_volume_by_group.median().reset_index(name="Median Tumor Volume (mm3)")
tumor_response_std_df = tumor_volume_by_group.std().reset_index(name="Tumor Volume-STDEV")
tumor_response_var_df = tumor_volume_by_group.var().reset_index(name="Tumor Volume-VAR")
tumor_response_sem_df = tumor_volume_by_group.sem().reset_index(name="Tumor Volume-SEM")

# Assemble the per-statistic frames into a single summary dataframe by joining
# them on the shared (Drug Regimen, Timepoint) keys, in the order listed.
tumor_response_summary_df = tumor_response_mean_df
for stat_df in (
    tumor_response_median_df,
    tumor_response_std_df,
    tumor_response_var_df,
    tumor_response_sem_df,
):
    tumor_response_summary_df = pd.merge(tumor_response_summary_df, stat_df, on=summary_keys)
tumor_response_summary_df.head(20)

In [None]:
# Generate a summary statistics table of mean, median, variance, standard
# deviation, and SEM of the tumor volume for each regimen.
#
# A single named aggregation produces every column in one groupby pass:
#   Trials - number of rows with a Mouse ID recorded for the regimen
#   the rest - statistics of 'Tumor Volume (mm3)' for the regimen
# The dict is unpacked because "Standard Deviation" is not a valid keyword name.
summarystats = complete_data_clean.groupby("Drug Regimen").agg(
    **{
        "Trials": ("Mouse ID", "count"),
        "Mean": ("Tumor Volume (mm3)", "mean"),
        "Median": ("Tumor Volume (mm3)", "median"),
        "Standard Deviation": ("Tumor Volume (mm3)", "std"),
        "Variance": ("Tumor Volume (mm3)", "var"),
        "SEM": ("Tumor Volume (mm3)", "sem"),
    }
)
summarystats.head()

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested
# for each drug regimen using Pandas.
#
# "Trials" is a per-regimen row count (one row per mouse per timepoint), so the
# chart is labelled as timepoints rather than as a number of mice. The column is
# selected by name so a change in column order cannot silently plot other data.
number_of_mice_per_treatment = summarystats["Trials"]
timepoints_ax = number_of_mice_per_treatment.plot(
    kind='bar',
    title='Number of Timepoints per Drug Regimen',
    color='g',
    alpha=.5,
)
timepoints_ax.set_xlabel('Drug Regimen')
timepoints_ax.set_ylabel('Number of Timepoints')
plt.show()


In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested
# for each drug regimen using pyplot.

import numpy as np

# Regimen names are read from the summary table's index, so the tick labels
# always line up with the bars instead of relying on a hand-typed list.
Drug_Regimen = summarystats.index.to_list()
x_axis = np.arange(len(Drug_Regimen))

# "Trials" is the per-regimen row count (one row per mouse per timepoint).
plt.bar(x_axis, summarystats["Trials"], color='g', alpha=.5)
plt.xticks(x_axis, Drug_Regimen, rotation='vertical')
plt.title('Number of Timepoints per Drug Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Timepoints')
plt.show()

In [None]:
# Pandas pie chart of how the rows of the clean data split between the values
# of the "Sex" column.
pie_panda = complete_data_clean["Sex"].value_counts().plot(kind="pie")


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# value_counts() orders its result by frequency, so the wedge labels are taken
# from its index rather than hard-coded; a fixed ["Male", "Female"] list would
# swap the labels whenever "Female" is the more frequent value.
sex_counts = complete_data_clean["Sex"].value_counts()

plt.title("Sex")
plt.pie(sex_counts, labels=sex_counts.index, autopct="%1.1f%%", shadow=True, startangle=140)
plt.axis("equal")

# Save before show(): once the figure has been shown and closed, savefig()
# would write an empty canvas.
plt.savefig('Demographics-Pie')
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

In [None]:
# Put the treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds

    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
regimens_of_interest = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# The per-regimen final volumes are computed here because no other cell in the
# notebook defines them. "Final" means the measurement at each mouse's last
# (greatest) timepoint: find that timepoint per mouse, then join back to the
# clean data to pick up the matching tumor volume and regimen.
last_timepoints = complete_data_clean.groupby('Mouse ID')['Timepoint'].max().reset_index()
final_volumes = pd.merge(
    last_timepoints, complete_data_clean, on=['Mouse ID', 'Timepoint'], how='left'
)

# One series of final tumor volumes per regimen, in the order listed above.
data_to_plot = [
    final_volumes.loc[final_volumes['Drug Regimen'] == regimen, 'Tumor Volume (mm3)']
    for regimen in regimens_of_interest
]
cap_tum_vol, ram_tum_vol, inf_tum_vol, cef_tum_vol = data_to_plot

fig1, ax1 = plt.subplots()
ax1.set_title('Tumors')
ax1.set_ylabel('Final Tumor Volume (mm3)')
ax1.set_xlabel('Drug Regimen')

# Tick labels are set after drawing instead of through boxplot's `labels`
# keyword, which newer matplotlib releases deprecate.
ax1.boxplot(data_to_plot)
ax1.set_xticklabels(regimens_of_interest)

plt.savefig('Final Tumor Volume by Drugs')
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

# cap_df is built here because no other cell in the notebook defines it:
# all clean-data rows recorded under the Capomulin regimen.
cap_df = complete_data_clean.loc[complete_data_clean['Drug Regimen'] == 'Capomulin']

# Measurements for one Capomulin mouse, ordered by time so the line is drawn
# left to right regardless of row order in the source data.
plot_df = cap_df.loc[cap_df["Mouse ID"] == "s185", :].sort_values("Timepoint")

plt.title('Tumor Volume with Capomulin over time')
plt.plot(plot_df["Timepoint"], plot_df["Tumor Volume (mm3)"], linewidth=2)
plt.xlabel('Time (Days)')
plt.ylabel('Tumor Volume (mm3)')
plt.savefig('Tumor Volume Progression-s185')
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# cap_df is built here because no other cell in the notebook defines it:
# all clean-data rows recorded under the Capomulin regimen.
cap_df = complete_data_clean.loc[complete_data_clean['Drug Regimen'] == 'Capomulin']

# Per-mouse averages. numeric_only=True skips the text columns (regimen, sex),
# which recent pandas versions refuse to average; not every averaged field is
# meaningful, only weight and tumor volume are used below.
cap_avg_df = cap_df.groupby(['Mouse ID']).mean(numeric_only=True)

plt.scatter(cap_avg_df['Weight (g)'], cap_avg_df['Tumor Volume (mm3)'])
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

plt.savefig('Avg Tumor Vol vs Weight-Capomulin')
plt.show()

## Correlation and Regression

In [None]:
# Pearson correlation between each Capomulin mouse's weight and its average
# tumor volume, rounded to two decimals for reporting.
capomulin_weights = cap_avg_df['Weight (g)']
capomulin_volumes = cap_avg_df['Tumor Volume (mm3)']
corr = round(st.pearsonr(capomulin_weights, capomulin_volumes)[0], 2)
print(f"The correlation between mouse weight and average tumor volume is {corr}")

In [None]:
# Linear regression of average tumor volume on mouse weight for the Capomulin
# regimen, drawn over the scatter plot.

# The regression inputs are taken from the per-mouse averages; no other cell in
# the notebook defines x_values / y_values.
x_values = cap_avg_df['Weight (g)']
y_values = cap_avg_df['Tumor Volume (mm3)']

# scipy.stats is already imported as `st` at the top of the notebook.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates so it is always inside the
# plot area, whatever the range of the data.
plt.annotate(line_eq, (0.05, 0.9), xycoords='axes fraction', fontsize=15, color="red")
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

# Print out the r-squared value along with the plot.
# linregress returns the correlation coefficient r, so it is squared here.
print(f"The r-squared is: {rvalue**2}")
plt.savefig('Avg Tumor Vol vs Weight-Capomulin w regression')
plt.show()