## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
import numpy as np
from pprint import pprint
from scipy.stats import linregress
from sklearn import datasets

In [None]:
# Locations of the two raw study files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata first, then the study results
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on their shared "Mouse ID" column
combined_data = mouse_metadata.merge(study_results, on="Mouse ID")

# Save a copy of the merged table to disk, then preview its first rows
combined_data.to_csv("data/combined_data.csv")
combined_data.head()

In [None]:
# Checking the number of mice.
# "Mouse ID" repeats on every measurement row of the merged table, so count
# the distinct IDs; .count() would report the number of non-null rows instead.
print(combined_data["Mouse ID"].nunique())

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

# Optional: Get all the data for the duplicate mouse ID. 

In [None]:
# Build the cleaned DataFrame: whenever the same mouse has several rows for
# the same timepoint, only the first of those rows is kept (the default
# behaviour of drop_duplicates).
# NOTE(review): this removes the repeated rows only; the affected mouse itself
# remains in the data - confirm that is the intended cleaning rule.
duplicate_keys = ["Mouse ID", "Timepoint"]
clean_df = combined_data.drop_duplicates(subset=duplicate_keys)

# Save the cleaned table to disk and preview its first ten rows
clean_df.to_csv("data/clean_df.csv")
clean_df.head(10)

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct IDs: .count() would return the number of measurement rows,
# because each mouse appears once per recorded timepoint.
print(clean_df["Mouse ID"].nunique())

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation and SEM, computed with groupby
# and assembled into a single summary DataFrame.
tumor_by_regimen = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

# Start from describe() and append the statistics it does not provide
tumor_volume_statics = tumor_by_regimen.describe()
tumor_volume_statics["Median"] = tumor_by_regimen.median()
tumor_volume_statics["Variance"] = tumor_by_regimen.var()
tumor_volume_statics["Standard Deviation"] = tumor_by_regimen.std()
tumor_volume_statics["SEM"] = tumor_by_regimen.sem()

# Keep only the five requested statistics (picked by name) and capitalise
# describe()'s "mean" label so it matches the other column names
requested_stats = ["mean", "Median", "Variance", "Standard Deviation", "SEM"]
tvs_df1 = tumor_volume_statics[requested_stats].rename(columns={"mean": "Mean"})
tvs_df1.head()


In [None]:
# Same summary statistics (mean, median, variance, standard deviation, SEM of
# the tumor volume per regimen), this time produced with a single agg() call.
# Pattern: df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
stat_labels = {
    'mean': "Mean",
    "median": "Median",
    "var": "Variance",
    "std": "Standard Deviation",
    'sem': "SEM",
}
tvs_df2 = (
    clean_df.groupby("Drug Regimen")
    .agg({"Tumor Volume (mm3)": ['mean', 'median', 'var', 'std', 'sem']})
    .rename(columns=stat_labels)  # relabel the aggregated statistic columns
)
tvs_df2.head()

## Bar and Pie Charts

In [None]:
# Bar plot, via the pandas plotting API, of the tumor-volume summary
# statistics (the columns of tvs_df1) for each drug regimen.
# NOTE(review): the exercise text asks for the total number of measurements
# per regimen; this chart shows the summary statistics instead - confirm
# which one is wanted.
ax = tvs_df1.plot.bar(rot=45, figsize=(20, 5))
ax.set_title('Tumor Measurement Statistics Taken on Each Drug Regimen')
plt.tight_layout()
plt.show()

In [None]:
# Grouped bar plot of the tumor-volume summary statistics per drug regimen, using pyplot.
def multibarplot(x,y1,y2,y3,y4,y5,ticks):
    """Draw five side-by-side bar series for every x position and show the figure.

    Parameters
    ----------
    x : array-like of numbers
        Base position of each group of bars; also where the tick labels go.
    y1, y2, y3, y4, y5 : array-like
        Bar heights of the five series, each the same length as ``x``. They
        are labelled, in order, Mean, Median, Variance, Standard Deviation
        and SEM in the legend.
    ticks : sequence
        Tick labels, one per entry of ``x``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # NOTE: this changes the global default figure size, so figures created
    # afterwards inherit 25x5 as well; kept because later plots may rely on it.
    plt.rcParams["figure.figsize"] = [25,5]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    width = 0.1
    # Use the positions passed in rather than the module-level `x_axis`, so the
    # function works for any caller; asarray allows plain lists too.
    positions = np.asarray(x)
    bars = []
    for series_index, heights in enumerate([y1, y2, y3, y4, y5]):
        # The first series starts 0.2 left of the position; each following
        # series is shifted right by one bar width.
        bar = ax.bar(positions - 0.2 + width * series_index, heights, width)
        # One rectangle per series is enough to act as its legend handle
        bars.append(bar[0])
    plt.xticks(positions, ticks, rotation=45)
    ax.legend(bars,["Mean", "Median", "Variance","Standard Deviation","SEM"],loc=2)
    plt.xlabel("Drug Regimen")
    plt.title('Tumor Measurement Statistics Taken on Each Drug Regimen')
    plt.show()
# Bar-group positions 1..N, one per row of the summary table
x_axis = np.arange(1, len(tvs_df1["Mean"]) + 1)

# Pull the five statistic columns out of the summary table
mean_tv, median_tv, var_tv, std_tv, sem_tv = (
    tvs_df1[stat_name]
    for stat_name in ("Mean", "Median", "Variance", "Standard Deviation", "SEM")
)

# Regimen names (the summary table's index) used as tick labels
drug_names = tvs_df1.reset_index()["Drug Regimen"]

multibarplot(x_axis, mean_tv, median_tv, var_tv, std_tv, sem_tv, drug_names)


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Count rows per sex from the de-duplicated table (clean_df) so repeated
# Mouse ID / Timepoint rows do not inflate either slice.
sex_distribution = clean_df["Sex"].value_counts()
# `sex_nums` receives the matplotlib Axes returned by .plot(), not the counts.
# The first slice (the most frequent sex, per value_counts ordering) is pulled out.
sex_nums = sex_distribution.plot(
    kind="pie",
    explode=(0.1, 0),
    shadow=True,
    startangle=160,
    autopct="%1.1f%%",
    figsize=(5, 5),
)
plt.title("Sex")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Count the rows of each sex directly from a boolean mask. This replaces
# `.count()[2]`, which relied on positional indexing of a label-indexed Series
# (deprecated in recent pandas) and on whichever column happened to be third.
# The de-duplicated table is used so repeated rows are not counted twice.
male_count = (clean_df["Sex"] == "Male").sum()
female_count = (clean_df["Sex"] == "Female").sum()
sex_counts = [male_count, female_count]
sex_type = ["Male", "Female"]
# Pull the first slice ("Male") slightly out of the pie
explode = (0.1, 0)
plt.pie(sex_counts, explode=explode, labels=sex_type, autopct="%1.1f%%", shadow=True, startangle=160)
plt.title("Sex", loc="center")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse.
# The de-duplicated table is used so each (Mouse ID, Timepoint) pair occurs
# once and every mouse ends up with exactly one "final" row below.
last_timepoint_df = clean_df.groupby("Mouse ID")["Timepoint"].max().reset_index()

# Merge this per-mouse maximum back onto the measurements. Both frames have a
# "Timepoint" column, so pandas suffixes them: _x is the measurement's
# timepoint, _y is the mouse's final timepoint.
final_tv_df = pd.merge(clean_df, last_timepoint_df, on="Mouse ID")

# Keep only the measurement taken at each mouse's final timepoint
at_final_timepoint = final_tv_df['Timepoint_x'] == final_tv_df['Timepoint_y']
final_timepoint_tv = final_tv_df.loc[at_final_timepoint]
# reset_index() keeps the old row labels as an "index" column
final_timepoint_tv = final_timepoint_tv.reset_index()
final_timepoint_tv = final_timepoint_tv.rename(
    columns={'Timepoint_x': "Timepoint", "Timepoint_y": "Final Timepoint"}
)
final_timepoint_tv.head(10)

In [None]:
# Treatments of interest, in the order used by the loop and later plot labels
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Per-treatment accumulators, index-aligned with `treatments`
tumor_vol_data = []      # final tumor volumes as plain Python lists
tumor_vol_data_s = []    # the same volumes as pandas Series (used for plotting)
lowerq, upperq, iqr = [], [], []
# Outlier bounds; created empty here and filled in by a later cell
lower_bound, upper_bound = [], []

# Collect the final tumor volumes of each treatment and their quartiles/IQR
for drug in treatments:
    on_this_drug = final_timepoint_tv["Drug Regimen"] == drug
    final_volumes = final_timepoint_tv.loc[on_this_drug, "Tumor Volume (mm3)"]
    tumor_vol_data_s.append(final_volumes)
    tumor_vol_data.append(final_volumes.tolist())

    # First and third quartile of this treatment's final volumes
    first_quartile, third_quartile = final_volumes.quantile([.25, .75])
    lowerq.append(first_quartile)
    upperq.append(third_quartile)
    iqr.append(third_quartile - first_quartile)

In [None]:
# Determine outliers using upper and lower bounds: 1.5 * IQR below the lower
# quartile and above the upper quartile, one pair per treatment.
# The lists are rebuilt rather than appended to, so re-running this cell
# does not pile up duplicate entries.
lower_bound = [q1 - (1.5 * spread) for q1, spread in zip(lowerq, iqr)]
upper_bound = [q3 + (1.5 * spread) for q3, spread in zip(upperq, iqr)]

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
# One subplot per treatment; each is annotated with its quartiles, IQR,
# outlier bounds and any values falling outside those bounds.
outliers = []          # per treatment: volumes outside [lower_bound, upper_bound]
outlier_x_points = []  # per treatment: matching x coordinates (always 1)
fig, axs = plt.subplots(2, 2)

# Loop invariants: subplot (row, col) for each treatment in `treatments` order,
# and the labels of the five annotated reference values.
grid_positions = [(0, 0), (1, 1), (0, 1), (1, 0)]
q_labels = ["Lower Quartiles", "Upper Quartile", "Interquartile Range", "Lower Bound", "Upper Bound"]

for j, volumes in enumerate(tumor_vol_data_s):
    row, col = grid_positions[j]
    ax1 = axs[row, col]
    ax1.set_title(f"Final Tumor Volume of Each Mouse Treated with {treatments[j]}")
    ax1.set_ylabel("Tumor Volume (mm3)")
    # Start at 0 so the IQR marker (plotted at y = IQR) stays visible
    ax1.set_ylim(0, upper_bound[j] + 10)
    ax1.boxplot(volumes, notch=True, flierprops=dict(markerfacecolor='r', marker='D'))

    # Reference values, all drawn on the box's x position (x = 1)
    y_points = [lowerq[j], upperq[j], iqr[j], lower_bound[j], upper_bound[j]]
    x_points = [1] * len(y_points)

    # Values outside the 1.5 * IQR bounds are potential outliers
    sub_outliers = [v for v in volumes if v > upper_bound[j] or v < lower_bound[j]]
    outliers.append(sub_outliers)
    outlier_x_points.append([1] * len(sub_outliers))

    # Markers: quartiles (circles), IQR (hexagon), bounds (stars)
    ax1.plot(x_points[:2], y_points[:2], ls='', marker='o', mec="blue", mfc="green", markersize=7)
    ax1.plot(x_points[2], y_points[2], ls='', marker='h', mec="blue", mfc="yellow", markersize=10)
    ax1.plot(x_points[3:], y_points[3:], ls='', marker='*', mec="blue", mfc="orange", markersize=13)

    # Text label to the right of each reference marker
    for x_pos, y_pos, label in zip(x_points, y_points, q_labels):
        ax1.text(x_pos + .08, y_pos, "<-------" + label + ": " + f"{round(y_pos,4)}", horizontalalignment="left")

    # Highlight and label every outlier (previously only the first was marked)
    for outlier in sub_outliers:
        ax1.plot(1, outlier, ls='', marker='_', mec="red", mfc="red", markersize=70)
        ax1.text(1 - .2, outlier - 2, f"Outlier {round(outlier, 4)}---->", horizontalalignment="left")

# Lay the figure out once, after all four subplots have been populated
plt.tight_layout()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
# Index into `treatments` of the regimen to plot; it is now actually used to
# pick the drug (it was previously defined but ignored in favour of a literal 0).
treatment_slect = 0
drug = treatments[treatment_slect]
# All de-duplicated measurements for the chosen regimen
capomulin_weight_df = clean_df.loc[clean_df["Drug Regimen"] == drug]
# Sorted list of the distinct mice treated with this regimen
mouse = sorted(capomulin_weight_df["Mouse ID"].unique())
# Index into `mouse` of the single mouse to plot
mouse_select = 0
capomulin_mouse_df = clean_df.loc[clean_df["Mouse ID"] == mouse[mouse_select]]
x_time = capomulin_mouse_df["Timepoint"]
y_tv = capomulin_mouse_df["Tumor Volume (mm3)"]
plt.plot(x_time, y_tv, marker='o', color='blue')
plt.xlabel("Time")
plt.ylabel("Tumor Volume (mm3)")
plt.title(f"Tumor Volume vs. Time Point for {mouse[mouse_select]} treated with {drug}")
plt.show()

In [None]:
# Preview the rows selected for the chosen regimen; these feed the scatter plot
# of average tumor volume vs. mouse weight drawn in the next cell
capomulin_weight_df.head()

In [None]:
# Per-mouse averages over all measurements of the selected regimen
measurements_by_mouse = capomulin_weight_df.groupby("Mouse ID")
avg_tv = measurements_by_mouse["Tumor Volume (mm3)"].mean()
avg_weight = measurements_by_mouse["Weight (g)"].mean()

# Scatter with average tumor volume on the x-axis and weight on the y-axis
plt.scatter(avg_tv, avg_weight)
plt.xlabel("Tumor Volume (mm3)")
plt.ylabel("Weight (g)")
plt.title(f"Average Tumor Volume vs Mouse Weight for the {drug} Regimen.")
plt.show()


## Correlation and Regression

In [None]:
# Correlation coefficient and linear regression between the per-mouse average
# tumor volume (x) and mouse weight (y) for the selected regimen
c = sts.pearsonr(avg_tv, avg_weight)[0]

# Fit weight as a linear function of average tumor volume
regression = linregress(avg_tv, avg_weight)
slope1, intercept1, rvalue1, pvalue1, stderr1 = regression
regress_values1 = avg_tv * slope1 + intercept1
line_eq1 = f"y = {round(slope1, 3)}x + {round(intercept1, 3)}"

# Values reported underneath the x-axis label
r_squared_text = round((rvalue1**2), 3)
correlation_text = round(c, 3)

# Scatter of the data with the fitted line and its equation drawn in red
plt.scatter(avg_tv, avg_weight)
plt.plot(avg_tv, regress_values1, "r-")
plt.annotate(line_eq1, (40, 21.5), fontsize=15, color="red")
plt.title(f"Average Tumor Volume vs Mouse Weight for the {drug} Regimen.")
plt.xlabel(f"Tumor Volume (mm3) \n The r-squared is: {r_squared_text} \n The correlation between mouse weight and average tumor volume is {correlation_text}")
plt.ylabel("Weight (g)")
plt.show()