## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
merged_df = pd.merge(mouse_metadata, study_results, on='Mouse ID')

# Display the data table for preview
merged_df.head(15)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [2]:
# Checking the number of mice.
mice_count = merged_df.groupby(['Mouse ID'])
len(mice_count)

mice_count_2 = merged_df['Mouse ID'].value_counts()
len(mice_count_2)

249

In [3]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
dups = merged_df[merged_df.duplicated(['Mouse ID','Timepoint'], keep=False)]
dups

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [4]:
# Optional: Get all the data for the duplicate mouse ID. 


In [51]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = merged_df.drop_duplicates(subset=['Mouse ID', 'Timepoint'], keep='first').rename(columns={'Drug Regimen':'Drug_Regimen'})
clean_df

Unnamed: 0,Mouse ID,Drug_Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [6]:
# Checking the number of mice in the clean DataFrame.
mice_count_clean = clean_df.groupby('Mouse ID')
len(mice_count_clean)

249

## Summary Statistics

In [74]:
# Create a unique list of elements
drug_series = clean_df['Drug_Regimen'] 
drugs = list(drug_series.unique())

#Alternative to create unique list of elements
drug_list = list(clean_df['Drug_Regimen'].drop_duplicates(keep='first')) # save 1 line of code but less clean


#dff = clean_df.groupby(['Drug Regimen'])

ramicane = []



#df = clean_df.loc[clean_df['Drug_Regimen'] == 'Ramicane']['Tumor Volume (mm3)']
mean = ramicane.append(df.mean())
median = ramicane.append(df.median())
std = ramicane.append(df.std())
var = ramicane.append(df.var())
sem = ramicane.append(df.sem())

ramicane_df = pd.DataFrame({"Ramicane": ramicane})
drugs

['Ramicane',
 'Capomulin',
 'Infubinol',
 'Placebo',
 'Ceftamin',
 'Stelasyn',
 'Zoniferol',
 'Ketapril',
 'Propriva',
 'Naftisol']

In [107]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Generate empty lists
tumors_mean = []
tumors_median = []
tumors_std = []
tumors_var = []
tumors_sme = []

# Loop through drug regimens and append summary stats to lists
for drug in drugs:
    df = clean_df[clean_df['Drug_Regimen'] == drug]['Tumor Volume (mm3)']
    tumors_mean.append(round(df.mean(),2))
    tumors_median.append(round(df.median(),2))
    tumors_std.append(round(df.std(),2))
    tumors_var.append(round(df.var(),2))
    tumors_sme.append(round(df.sem(),2))

# Create summary data frame
stat_df = pd.DataFrame({'Mean': tumors_mean, 'Median': tumors_median, 'Std': tumors_std, 'Var':tumors_var, 'SME':tumors_sme},
                      index=drugs)
sum_stat_df = stat_df.T
sum_stat_df

Unnamed: 0,Ramicane,Capomulin,Infubinol,Placebo,Ceftamin,Stelasyn,Zoniferol,Ketapril,Propriva,Naftisol
Mean,40.22,40.68,52.88,54.03,52.59,54.23,53.24,55.24,52.39,54.33
Median,40.67,41.56,51.82,52.29,51.78,52.43,51.82,53.7,50.91,52.51
Std,4.85,4.99,6.57,7.82,6.27,7.71,6.97,8.28,6.57,8.13
Var,23.49,24.95,43.13,61.17,39.29,59.45,48.53,68.55,43.14,66.17
SME,0.32,0.33,0.49,0.58,0.47,0.57,0.52,0.6,0.53,0.6


In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
