## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as sem

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

In [2]:
# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)
mouse_metadata

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16
...,...,...,...,...,...
244,z314,Stelasyn,Female,21,28
245,z435,Propriva,Female,12,26
246,z581,Infubinol,Female,24,25
247,z795,Naftisol,Female,13,29


In [3]:
study_results


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.000000,0
1,f932,0,45.000000,0
2,g107,0,45.000000,0
3,a457,0,45.000000,0
4,c819,0,45.000000,0
...,...,...,...,...
1888,r944,45,41.581521,2
1889,u364,45,31.023923,3
1890,p438,45,61.433892,1
1891,x773,45,58.634971,4


In [4]:
# Combine the both data into a single dataset
# merge on mouse ID
comb_mousestudy_df = pd.merge(mouse_metadata, study_results, how='outer', on='Mouse ID')

In [5]:
# Display the data table for preview
comb_mousestudy_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [6]:
# Checking the number of mice.
mice_no_study = comb_mousestudy_df['Mouse ID'].count()
mice_no_study

1893

In [7]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

In [8]:
# Optional: Get all the data for the duplicate mouse ID. 



In [9]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
newmousedata = comb_mousestudy_df.drop_duplicates(subset=['Mouse ID', 'Timepoint'], keep="first")
newmousedata

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [10]:
# Checking the number of mice in the clean DataFrame.
newmousedata["Mouse ID"].count()

In [10]:
print(newmousedata['Tumor Volume (mm3)'].mean())
print(newmousedata['Tumor Volume (mm3)'].median())
print(newmousedata['Tumor Volume (mm3)'].var())   
print(newmousedata['Tumor Volume (mm3)'].std())
print(newmousedata['Tumor Volume (mm3)'].sem())

1888

## Summary Statistics

In [11]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each drug regimen Drug Regim

In [45]:
#Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
#mean, median, variance, standard deviation and SEM.
# Groupby on drug regimen and get aggregates
newmousedata_drugs = newmousedata.groupby(['Drug Regimen'])
newmousedata_drugs.agg(['mean','median','var', 'std', 'sem'])['Tumor Volume (mm3)']


Unnamed: 0_level_0,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


50.449276454131365
48.951421075
79.29127745388136
8.904564978362579
0.2049329125626731


Unnamed: 0_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,13.456522,19.965217,21.565217,40.675741,0.713043
Ceftamin,13.247191,27.398876,19.747191,52.591172,1.179775
Infubinol,16.230337,27.196629,18.174157,52.884795,0.960674
Ketapril,15.659574,27.861702,19.707447,55.235638,1.297872
Naftisol,12.0,27.166667,19.623656,54.331565,1.182796
Placebo,10.734807,27.928177,18.674033,54.033581,1.441989
Propriva,10.570513,27.076923,17.083333,52.393463,1.0
Ramicane,10.684211,19.679825,21.425439,40.216745,0.548246
Stelasyn,12.78453,27.856354,19.226519,54.233149,0.872928
Zoniferol,12.598901,27.692308,19.368132,53.236507,1.230769


Unnamed: 0_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,16.5,20.5,20.0,41.557809,0.0
Ceftamin,12.0,28.0,20.0,51.776157,1.0
Infubinol,20.0,27.0,15.0,51.820584,1.0
Ketapril,18.0,28.0,20.0,53.698743,1.0
Naftisol,9.0,27.0,20.0,52.509285,1.0
Placebo,10.0,28.0,15.0,52.288934,1.0
Propriva,8.0,26.0,15.0,50.909965,1.0
Ramicane,9.0,19.0,20.0,40.673236,0.0
Stelasyn,14.0,28.0,20.0,52.431737,1.0
Zoniferol,12.5,28.0,15.0,51.818479,1.0


Unnamed: 0_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,59.620372,7.466034,206.928043,24.947764,0.72079
Ceftamin,65.147591,2.501016,204.031772,39.290177,1.402527
Infubinol,56.404272,4.769028,181.53447,43.128684,1.054942
Ketapril,36.236432,3.392536,196.839089,68.553577,1.942883
Naftisol,45.102703,2.247748,201.208951,66.173479,1.479919
Placebo,40.384837,3.378146,192.954266,61.168083,1.792449
Propriva,51.678867,2.845658,184.180108,43.138803,1.187097
Ramicane,35.362393,10.465318,203.796178,23.486704,0.477838
Stelasyn,63.036648,2.701473,191.620626,59.450562,0.944874
Zoniferol,33.479115,2.0153,206.918979,48.533355,1.559711


Unnamed: 0_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,7.721423,2.732404,14.384994,4.994774,0.848993
Ceftamin,8.071406,1.58146,14.283969,6.268188,1.184283
Infubinol,7.510278,2.18381,13.473473,6.567243,1.027104
Ketapril,6.01967,1.841884,14.029935,8.279709,1.393873
Naftisol,6.715855,1.499249,14.184814,8.134708,1.216519
Placebo,6.354907,1.837973,13.890798,7.821003,1.338824
Propriva,7.188801,1.686908,13.571297,6.568014,1.08954
Ramicane,5.946629,3.235014,14.27572,4.846308,0.691259
Stelasyn,7.939562,1.643616,13.84271,7.710419,0.972046
Zoniferol,5.786114,1.419612,14.384679,6.966589,1.248884


Unnamed: 0_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,0.509136,0.180169,0.948518,0.329346,0.055981
Ceftamin,0.604977,0.118535,1.070629,0.469821,0.088766
Infubinol,0.562919,0.163684,1.009879,0.492236,0.076985
Ketapril,0.43903,0.134333,1.023238,0.60386,0.101659
Naftisol,0.49243,0.10993,1.040081,0.596466,0.0892
Placebo,0.472356,0.136615,1.032495,0.581331,0.099514
Propriva,0.575565,0.135061,1.086573,0.525862,0.087233
Ramicane,0.393825,0.214244,0.945433,0.320955,0.04578
Stelasyn,0.590143,0.122169,1.028921,0.573111,0.072252
Zoniferol,0.428895,0.105229,1.066263,0.516398,0.092573


Unnamed: 0_level_0,sum,mean,std
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Capomulin,9355.420462,40.675741,4.994774
Ceftamin,9361.228582,52.591172,6.268188
Infubinol,9413.493529,52.884795,6.567243
Ketapril,10384.299876,55.235638,8.279709
Naftisol,10105.671026,54.331565,8.134708
Placebo,9780.078122,54.033581,7.821003
Propriva,8173.380288,52.393463,6.568014
Ramicane,9169.417875,40.216745,4.846308
Stelasyn,9816.199991,54.233149,7.710419
Zoniferol,9689.044192,53.236507,6.966589


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


Unnamed: 0_level_0,Unnamed: 1_level_0,Age_months,Age_months,Age_months,Weight (g),Weight (g),Weight (g),Timepoint,Timepoint,Timepoint,Metastatic Sites,Metastatic Sites,Metastatic Sites
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,std,sum,mean,std,sum,mean,std,sum,mean,std
Drug Regimen,Tumor Volume (mm3),Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Capomulin,23.343598,3,3.0,,17,17.0,,45,45,,1,1,
Capomulin,25.472143,3,3.0,,17,17.0,,40,40,,1,1,
Capomulin,28.167397,16,16.0,,15,15.0,,40,40,,0,0,
Capomulin,28.328531,3,3.0,,17,17.0,,35,35,,1,1,
Capomulin,28.430964,22,22.0,,17,17.0,,45,45,,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoniferol,68.498639,12,12.0,,25,25.0,,40,40,,2,2,
Zoniferol,68.611061,2,2.0,,28,28.0,,45,45,,3,3,
Zoniferol,70.827796,12,12.0,,25,25.0,,45,45,,2,2,
Zoniferol,71.108118,20,20.0,,26,26.0,,40,40,,1,1,


AttributeError: 'DataFrameGroupBy' object has no attribute 'loc'

In [None]:
### sch_overall_math = passing_math.groupby(["school_name"]).count()["student_name"]
### sch_overall_math

In [None]:
### newschool_metrics_m.columns = ["school_name", "reading_score", "math_score","size","budget", "Student Budget", "% Overall Passing", "% Passing Math"]
### newschool_metrics_m

In [None]:
### create table that highlights the top 5 performing schools based on Overall Passing Rate
### Sort with newschool_metrics_all

### top_performing_df = newschool_metrics_summary.sort_values(by=['% Overall Passing'], ascending=False).head(5)
### top_performing_df

In [None]:
### Dataframe of all reading scores by grade

### combine_all_scores_df3 =pd.concat([combine_scores_df, combine_scores_df2], axis=1)
#3# combine_all_scores_df3

In [None]:
### create table that breaks down school performance based on average spending per student
### pick "newschool_metrics_summary" and drop irrelvant columns
### school_metrics.drop(columns=['0_x','0_y',0], inplace=True)

### school_spending_score = newschool_metrics_summary.drop(columns=['school_name', 'Total Students', 'Total School Budget'], inplace=True)

In [None]:
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
