## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata_df = pd.read_csv(mouse_metadata_path).set_index("Mouse ID", verify_integrity=True)
study_results_df = pd.read_csv(study_results_path)


In [2]:
mouse_metadata_df.head(1)

Unnamed: 0_level_0,Drug Regimen,Sex,Age_months,Weight (g)
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
k403,Ramicane,Male,21,16


In [3]:
study_results_df.head(1)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0


In [4]:
# Checking the number of mice.
len(mouse_metadata_df)

249

In [5]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicates = study_results_df.duplicated(["Mouse ID", "Timepoint"], keep = False)

# Create a clean DataFrame by dropping the duplicate mouse by its ID.
trimmed_df = study_results_df.drop_duplicates(subset = ["Mouse ID", "Timepoint"])

# Checking the number of mice in the clean DataFrame.
print("{} after trimming, {} before trimming".format(len(trimmed_df), len(study_results_df)))

1888 after trimming, 1893 before trimming


In [6]:
# Optional: Get all the data for the duplicate mouse ID. 
study_results_df[duplicates].set_index(["Mouse ID", "Timepoint"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume (mm3),Metastatic Sites
Mouse ID,Timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1
g989,0,45.0,0
g989,0,45.0,0
g989,5,48.786801,0
g989,5,47.570392,0
g989,10,51.745156,0
g989,10,49.880528,0
g989,15,51.325852,1
g989,15,53.44202,0
g989,20,55.326122,1
g989,20,54.65765,1


## Summary Statistics

In [38]:

combined_df = mouse_metadata_df.merge(trimmed_df, on = "Mouse ID")

# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

volume_summary_df = combined_df.loc[:,["Drug Regimen", "Tumor Volume (mm3)"]].groupby("Drug Regimen").agg({"Tumor Volume (mm3)" : ['mean', 'median', "var", 'std', 'sem']})
volume_summary_df.head(1)


Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.



In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
