## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import os

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each file into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the mouse metadata to the study results on the shared "Mouse ID" column
# (default join type, same as the module-level pd.merge)
combine_ms_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the first rows of the combined table
combine_ms_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [2]:
# Count the distinct Mouse ID values in the combined data
# (a single mouse appears on several rows, so rows are not mice)
mouse_id_column = combine_ms_df["Mouse ID"]
mice_count_df = mouse_id_column.nunique()
mice_count_df

249

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# duplicated() on its own only yields a True/False flag per row; use that flag to
# pull out the distinct Mouse IDs that have a repeated (Mouse ID, Timepoint) pair.
duplicate_rows = combine_ms_df.duplicated(subset=["Mouse ID", "Timepoint"])
combine_ms_df.loc[duplicate_rows, "Mouse ID"].unique()


In [None]:
# Optional: show every row that takes part in a repeated (Mouse ID, Timepoint) pair.
# keep=False flags all occurrences of a repeated pair, not only the later ones.
is_repeated_pair = combine_ms_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
dup_combine_df = combine_ms_df.loc[is_repeated_pair]
dup_combine_df

In [None]:
# Build the clean DataFrame: remove every row recorded for mouse g989.
is_g989 = combine_ms_df["Mouse ID"] == "g989"
index_names = combine_ms_df.index[is_g989]

# drop() returns a new frame; combine_ms_df itself is left untouched
new_combine_ms_df = combine_ms_df.drop(index=index_names)

new_combine_ms_df.head()

In [None]:
# Count the distinct Mouse ID values that remain in the cleaned DataFrame
remaining_mouse_ids = new_combine_ms_df["Mouse ID"]
new_mouse_df = remaining_mouse_ids.nunique()
new_mouse_df

## Summary Statistics

In [None]:
# Summary statistics table of tumor volume per drug regimen:
# mean, median, variance, standard deviation and SEM, assembled into one DataFrame.

# Group the cleaned data once; later cells reuse drug_df
drug_df = new_combine_ms_df.groupby("Drug Regimen")
tumor_vol_by_drug = drug_df["Tumor Volume (mm3)"]

# One Series per statistic, each indexed by drug regimen
drug_mean = tumor_vol_by_drug.mean()
drug_median = tumor_vol_by_drug.median()
drug_var = tumor_vol_by_drug.var()
drug_stddev = tumor_vol_by_drug.std()
drug_sem = tumor_vol_by_drug.sem()

# Collect the statistics side by side
drug_calc_df = pd.DataFrame({
    "Tumor Vol Mean": drug_mean,
    "Tumor Vol Median": drug_median,
    "Tumor Vol Variance": drug_var,
    "Tumor Vol Standard Dev": drug_stddev,
    "Tumor Vol SEM": drug_sem,
})

# Display format per column: two decimals everywhere, four for the SEM.
# After this step the columns hold formatted strings, not numbers.
column_formats = {
    "Tumor Vol Mean": "{:,.2f}",
    "Tumor Vol Median": "{:,.2f}",
    "Tumor Vol Variance": "{:,.2f}",
    "Tumor Vol Standard Dev": "{:,.2f}",
    "Tumor Vol SEM": "{:,.4f}",
}
for column_name, number_format in column_formats.items():
    drug_calc_df[column_name] = drug_calc_df[column_name].map(number_format.format)

# Show the finished table
drug_calc_df

In [None]:
# Same five tumor-volume statistics per drug regimen, produced by a single
# aggregation call on the existing groupby instead of five separate ones.
summary_stats = ["mean", "median", "var", "std", "sem"]
drug_df["Tumor Volume (mm3)"].agg(summary_stats)

## Bar and Pie Charts

In [None]:
# Bar plot (pandas) of the number of tumor-volume measurements recorded for
# each drug regimen. count() tallies the non-null values in each group.
measurement_counts = drug_df["Tumor Volume (mm3)"].count()
measurement_counts.plot(kind="bar", title="Measurements per Regimen", width=.75)

# Title/label spelling matches the pyplot version of this chart
plt.ylabel("Number of Measurements")
plt.tight_layout()
plt.show()


In [None]:
# Bar plot (pyplot) of the number of tumor-volume measurements recorded for
# each drug regimen.

# Turn the per-regimen counts into a two-column frame: regimen name, count
bar_plot_df = drug_df["Tumor Volume (mm3)"].count().reset_index()

drugs_bar_plot = bar_plot_df["Drug Regimen"]
measurements = bar_plot_df["Tumor Volume (mm3)"]

# Draw the bars first so the ticks and limits below act on the real axis
plt.bar(drugs_bar_plot, measurements, width=.75, align="center")

plt.title("Measurements per Regimen", loc="center")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Measurements")

# Leave a small margin on both sides of the bars and above the tallest one
plt.xlim(-.75, len(drugs_bar_plot) - .25)
plt.ylim(0, measurements.max() + 10)
# Regimen names are long; rotate them so they do not overlap
plt.xticks(rotation=90)

plt.tight_layout()
plt.show()


In [None]:
# Pie chart (pandas) of how the rows of the cleaned data split by the "Sex" column

sex_counts = new_combine_ms_df["Sex"].value_counts()
sex_counts.plot(kind="pie", title="Mice By Gender", autopct="%1.1f%%")

plt.tight_layout()
plt.show()

In [None]:
# Pie chart (pyplot) of how the rows of the cleaned data split by the "Sex" column
values = new_combine_ms_df["Sex"].value_counts()

# Take the labels from the counts themselves. value_counts() orders by frequency
# while unique() orders by first appearance, so pairing the two can attach the
# wrong label to a wedge.
categories = values.index.tolist()
labels = categories

fig1, ax1 = plt.subplots()
ax1.pie(values, labels=labels, autopct='%1.1f%%')
ax1.set_title('Mice by Gender')
ax1.set_ylabel('Sex')

# tight_layout must be called; the bare attribute reference does nothing
plt.tight_layout()
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse for four treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
regimens_of_interest = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Last (greatest) timepoint per mouse; as_index=False keeps the group keys as columns
last_timepoints = new_combine_ms_df.groupby(
    ["Mouse ID", "Drug Regimen"], as_index=False
)["Timepoint"].max()
mouse_id_df = pd.DataFrame(last_timepoints)

# Merge back on the shared columns so only each mouse's last-timepoint row
# (and with it the tumor volume at that timepoint) is kept
mouse_final_df = mouse_id_df.merge(new_combine_ms_df)

# Rows belonging to any regimen outside the four of interest
is_other_regimen = ~mouse_final_df["Drug Regimen"].isin(regimens_of_interest)
index_names = mouse_final_df.index[is_other_regimen]

# Remove those rows in place
mouse_final_df.drop(index_names, inplace=True)

# Preview the result
mouse_final_df.head()


In [None]:
# Put treatments into a list (used below and later for plot labels)
treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Regimen / final tumor volume pairs, as a frame and as a plain list of [drug, volume]
d_volume_df = mouse_final_df.loc[:, ["Drug Regimen", "Tumor Volume (mm3)"]]
d_volume_list = d_volume_df.to_numpy().tolist()

# Final tumor volumes for each treatment (for plotting), in row order
tumor_vol_by_drug = {
    drug: d_volume_df.loc[d_volume_df["Drug Regimen"] == drug, "Tumor Volume (mm3)"].tolist()
    for drug in treatment_list
}
Capomulin_tumor_vol_data = tumor_vol_by_drug["Capomulin"]
Ramicane_tumor_vol_data = tumor_vol_by_drug["Ramicane"]
Infubinol_tumor_vol_data = tumor_vol_by_drug["Infubinol"]
Ceftamin_tumor_vol_data = tumor_vol_by_drug["Ceftamin"]

# Calculate the IQR and quantitatively determine if there are any potential outliers.
# NOTE(review): the quartiles are taken over all rows of mouse_final_df together,
# not separately per regimen - confirm that pooled bounds are what is wanted.
final_volumes = mouse_final_df["Tumor Volume (mm3)"]
q3 = final_volumes.quantile(q=.75)
q1 = final_volumes.quantile(q=.25)
iqr = q3 - q1

print(f"The lower quartile of tumor volume is: {q1}")
print(f"The upper quartile of tumor volume is: {q3}")
print(f"The interquartile range of tumor volume is: {iqr}")

# Outlier fences sit 1.5 * IQR outside the quartiles: above Q3 and *below* Q1
upper_bound = q3 + (iqr * 1.5)
lower_bound = q1 - (iqr * 1.5)

print(f"The upper bound is: {upper_bound}")
print(f"The lower bound is: {lower_bound}")

# Rows whose final tumor volume falls outside the fences
outlier_df = mouse_final_df.loc[(final_volumes < lower_bound)
                                | (final_volumes > upper_bound)]
outlier_df.head()

In [None]:
# Box plot of the final tumor volume of each mouse across the four regimens of interest
box_labels = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
box_data = [Capomulin_tumor_vol_data, Ramicane_tumor_vol_data,
            Infubinol_tumor_vol_data, Ceftamin_tumor_vol_data]

# Outlier (flier) markers drawn as red diamonds
red = dict(markerfacecolor='r', marker='D')

fig1, ax1 = plt.subplots()
ax1.set_title('Final Tumor Volume')
ax1.boxplot(box_data, flierprops=red)

# Label the ticks only after boxplot() has placed one tick per box, so the four
# names attach to the four box positions rather than to the axes' default ticks
ax1.set_xticklabels(box_labels, fontsize=10)
plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume vs. time point for a single mouse (ID b128).
# NOTE(review): the title assumes b128 was treated with Capomulin - the cell
# does not check the regimen; verify against the data.
is_b128 = new_combine_ms_df["Mouse ID"] == "b128"
tumor_volume_df = new_combine_ms_df[is_b128]
tumor_volume_df = tumor_volume_df.set_index("Timepoint")

# Draw the line first, then label the axes object the plot actually lives on
line_ax = tumor_volume_df["Tumor Volume (mm3)"].plot.line(title="Tumor Test for Capomulin Mouse")
line_ax.set_ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
is_capomulin = new_combine_ms_df["Drug Regimen"] == "Capomulin"
capomulin_rows = new_combine_ms_df[is_capomulin]

# Average every numeric column per mouse. numeric_only=True is required because
# the frame still carries text columns (Drug Regimen, Sex), which newer pandas
# refuses to average instead of silently dropping them.
avg_tumor_vol = capomulin_rows.groupby("Mouse ID").mean(numeric_only=True)

# Name the averaged column for what it now holds
avg_tumor_vol = avg_tumor_vol.rename(columns={"Tumor Volume (mm3)": "Avg Tumor Vol"})

# One point per mouse
avg_tumor_vol.plot.scatter(x="Weight (g)", y="Avg Tumor Vol", title="Avg Tumor Vol by Weight")
plt.show()


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
