## Observations and Insights 

In [1]:
%matplotlib notebook

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st


# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each file into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the study results onto the mouse metadata using the shared "Mouse ID" key
mergeData_df = mouse_metadata.merge(study_results, how="left", on="Mouse ID")

# Preview the first rows of the merged table
mergeData_df.head()

In [None]:
# Count the distinct "Mouse ID" values in the merged data (before any duplicate mouse is removed).
mergeData_df["Mouse ID"].nunique()

In [None]:
# Collect the IDs of mice that have more than one row for the same (Mouse ID, Timepoint) pair.
# `duplicated` flags the second and later occurrences; `unique` collapses them to one entry per mouse.
find_duplicate = (
    mergeData_df[mergeData_df.duplicated(subset=["Mouse ID", "Timepoint"])]["Mouse ID"]
    .unique()
)
find_duplicate

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Use the IDs detected in `find_duplicate` rather than a hard-coded ID, so this cell
# stays correct if the input data (and therefore the set of duplicated mice) changes.
iso_dup_mouseId = mergeData_df.loc[mergeData_df["Mouse ID"].isin(find_duplicate)]
iso_dup_mouseId

In [None]:
# Build the clean DataFrame: keep only rows whose Mouse ID is NOT one of the duplicated IDs.
cleanmouse_df = mergeData_df[~mergeData_df["Mouse ID"].isin(find_duplicate)]
cleanmouse_df.head()

In [None]:
# Count the distinct "Mouse ID" values remaining after the duplicated mouse ID(s) were dropped.
cleanmouse_df["Mouse ID"].nunique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.
#
# Select the tumor volume column BEFORE aggregating: aggregating the whole frame also
# tries to reduce the string columns (e.g. "Mouse ID", "Sex"), which raises a TypeError
# on pandas >= 2.0 and is wasted work on older versions.
tumor_by_regimen = cleanmouse_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

meandata = tumor_by_regimen.mean()
mediandata = tumor_by_regimen.median()
vardata = tumor_by_regimen.var()
stddata = tumor_by_regimen.std()
semdata = tumor_by_regimen.sem()

# This method is the most straightforward: create one Series per statistic and
# assemble them into a single DataFrame at the end (all share the "Drug Regimen" index).
TumVolSum_df = pd.DataFrame({"Tumor Volume Mean": meandata,
                             "Tumor Volume Median": mediandata,
                             "Tumor Volume Variance": vardata,
                             "Tumor Volume STDEV": stddata,
                             "Tumor Volume SEM": semdata})
TumVolSum_df

In [None]:
# Same summary table (mean, median, variance, standard deviation, SEM of tumor volume
# per regimen), produced with a single groupby/agg call.
# Selecting the column as a one-column frame keeps the two-level (column, statistic) header.
summary1_df = cleanmouse_df.groupby("Drug Regimen")[["Tumor Volume (mm3)"]].agg(
    ["mean", "median", "var", "std", "sem"]
)
summary1_df



## Bar and Pie Charts

In [None]:
# Prepare the data for the bar plots: number of rows (data points) recorded per drug regimen.
miceOndrugs = cleanmouse_df.groupby("Drug Regimen")

# Count the "Mouse ID" entries in each group, name the resulting column "Mice",
# and order the regimens from most to fewest data points.
drugscount_df = (
    miceOndrugs["Mouse ID"]
    .count()
    .to_frame(name="Mice")
    .sort_values("Mice", ascending=False)
)

In [None]:
# Generate a bar plot showing the number of data points for each treatment
# throughout the course of the study using pandas.
drugsbarplot = drugscount_df.plot(kind="bar", title="Mice on Drug Regimen/Treatment",
                                  color="salmon", figsize=(10, 5))

# Set xlabel/ylabel
drugsbarplot.set_xlabel("Treatment")
drugsbarplot.set_ylabel("Count")

# Layout: tighten BEFORE showing. On backends where show() finalizes the figure,
# calling tight_layout() afterwards would act on a new, empty figure instead.
plt.tight_layout()
plt.show()

In [None]:
# Move "Drug Regimen" out of the index into a regular column so pyplot can use it for tick labels.
forpyplot = drugscount_df.reset_index()
forpyplot

In [None]:
# Generate a bar plot showing the number of data points for each treatment
# throughout the course of the study using pyplot.

# Bar heights: keep the counts in their own variable (passing the DataFrame column
# directly wasn't working, unlike in the avg_state_rain activity).
mice_counts = forpyplot["Mice"]

# One bar position per regimen; the ticks sit on the same positions
x_axis = np.arange(len(forpyplot))
tick_locations = list(x_axis)

# Figure size, bars, and rotated regimen names on the x ticks
plt.figure(figsize=(10, 5))
plt.bar(x_axis, mice_counts, color="salmon", alpha=0.5, align="center")
plt.xticks(tick_locations, forpyplot["Drug Regimen"], rotation=45)

# Axis limits: a little padding on both sides of the bars and above the tallest one
plt.xlim(-0.75, len(x_axis) - 0.30)
plt.ylim(0, mice_counts.max() + 10)

# Legend
plt.legend(["Mice"])

# Title and axis labels
plt.title("Mice on Drug Regimen/Treatment")
plt.xlabel("Treatment")
plt.ylabel("Mice Count")



In [None]:
# Show the distribution of female versus male mice using pandas.
# Reference: https://kontext.tech/column/code-snippets/402/pandas-dataframe-plot-pie-chart
Fem_Male_groupby = cleanmouse_df.groupby("Sex")

# Number of rows per sex
Fem_vs_Male = Fem_Male_groupby["Sex"].count()

# Pull the first wedge slightly away from the centre
explode = (0.075, 0)

fMchart = Fem_vs_Male.plot.pie(
    y='Sex',
    title="Female Vs. Male Mice",
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
    explode=explode,
    fontsize=12,
    colors=["coral", "teal"],
    legend=False,
)

# Equal aspect ratio so the pie is drawn as a circle
plt.axis("equal")
plt.show()



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Explode: pull the first wedge slightly away from the centre
explode = (0.075, 0)

# Labels (this order also fixes which colour each sex gets)
labels = ["Female", "Male"]

# Wedge sizes: row counts per sex taken from the cleaned data instead of hard-coded
# percentages, so the chart stays correct if the data changes. `reindex` pins the
# order to `labels`; pyplot normalises the counts to percentages for `autopct`.
sizes = cleanmouse_df["Sex"].value_counts().reindex(labels, fill_value=0).tolist()

# Colors
colors = ['coral', 'teal']

# Start a fresh figure: with an interactive backend the previous figure is still
# current, and plt.pie would otherwise draw on top of it.
plt.figure()

# plt.pie values
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True, startangle=90)

# Title
plt.title("Female Vs. Male Mice")

# Make axis equal so the pie is drawn as a circle
plt.axis("equal")
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Display the cleaned dataset (rows for the duplicated mouse ID(s) removed) for reference.
cleanmouse_df

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Start by getting the last (greatest) timepoint for each mouse.
#
# Only the "Timepoint" column is reduced. Taking .max() over every column would compute
# an independent maximum per column, producing rows that mix values from different
# timepoints (e.g. the largest tumor volume next to the last timepoint), which is
# misleading when displayed. The later merge only needs "Mouse ID" and "Timepoint".
max_timepoint = (
    cleanmouse_df.groupby("Mouse ID")["Timepoint"]
    .max()
    .reset_index()
)
max_timepoint

In [None]:
# Look up the full study row for each mouse at its last timepoint:
# left-join the (Mouse ID, last Timepoint) pairs against the cleaned data on both keys.
combined_maxTP_df = pd.merge(
    max_timepoint[["Mouse ID", "Timepoint"]],
    cleanmouse_df,
    how='left',
    on=["Mouse ID", "Timepoint"],
)
combined_maxTP_df

In [None]:
# Final tumor volume of each mouse for the four regimens of interest:
# Capomulin, Ramicane, Infubinol, and Ceftamin.
# Each selection filters rows by regimen and picks the tumor volume column in one .loc call.
Drug1 = combined_maxTP_df.loc[combined_maxTP_df["Drug Regimen"] == "Capomulin", "Tumor Volume (mm3)"]
Drug2 = combined_maxTP_df.loc[combined_maxTP_df["Drug Regimen"] == "Ramicane", "Tumor Volume (mm3)"]
Drug3 = combined_maxTP_df.loc[combined_maxTP_df["Drug Regimen"] == "Infubinol", "Tumor Volume (mm3)"]
Drug4 = combined_maxTP_df.loc[combined_maxTP_df["Drug Regimen"] == "Ceftamin", "Tumor Volume (mm3)"]



In [None]:
# Regimens to analyse (also reused later as plot labels)
List1 = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Final tumor volumes per regimen, collected in the same order as List1 (for plotting)
tumor_list = []

for x in List1:
    # Final tumor volumes of the mice on this regimen
    drugData = combined_maxTP_df.loc[combined_maxTP_df["Drug Regimen"] == x, "Tumor Volume (mm3)"]
    tumor_list.append(drugData)

    # Quartiles of the final tumor volume
    quartiles = drugData.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]

    # Interquartile range and the 1.5*IQR fences used to flag potential outliers
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Values outside the fences are reported as potential outliers
    outliers = drugData.loc[(drugData < lower_bound) | (drugData > upper_bound)]
    print(f"{x} potential outliers: {outliers}")

    
    
  
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest.
# A dedicated figure/axes is created so the boxes are not drawn onto whichever figure
# happens to be current (with an interactive backend that would be the previous chart).
fig, ax = plt.subplots()
ax.boxplot(tumor_list)

# Label the boxes via the tick labels rather than boxplot's `labels=` keyword,
# which is deprecated in recent matplotlib releases.
ax.set_xticklabels(List1)

# A figure should stand on its own: add a title and axis labels
ax.set_title("Final Tumor Volume by Drug Regimen")
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Final Tumor Volume (mm3)")
plt.show()

## Line and Scatter Plots

In [None]:
# Final-timepoint rows for the mice treated with Capomulin
mouse_C_drug = combined_maxTP_df[combined_maxTP_df["Drug Regimen"] == "Capomulin"]
mouse_C_drug

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin.
# NOTE(review): mouse "j119" is assumed to be on the Capomulin regimen -- confirm against the data.

# Read from the cleaned data, like the rest of the analysis, rather than the raw merge.
one_mouse = cleanmouse_df[cleanmouse_df["Mouse ID"].isin(["j119"])]

# Keep only the columns needed for the plot, with a fresh 0..n-1 index
one_mouse_cols = one_mouse[["Mouse ID", "Timepoint", "Tumor Volume (mm3)"]]
one_mouse_cols = one_mouse_cols.reset_index()
one_mouse_cols_reset = one_mouse_cols[["Mouse ID", "Timepoint", "Tumor Volume (mm3)"]]

# Plot tumor volume AGAINST timepoint. Without explicit x/y, pandas draws every numeric
# column (including "Timepoint" itself) as a separate line over the row index.
Mouseplot = one_mouse_cols_reset.plot.line(
    x="Timepoint",
    y="Tumor Volume (mm3)",
    title="Tumor Volume over Time for Mouse j119",
    legend=False,
)
Mouseplot.set_xlabel("Timepoint")
Mouseplot.set_ylabel("Tumor Volume (mm3)")


In [None]:
# TODO (not yet implemented): Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# TODO (not yet implemented): Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
