## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import sem
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [2]:
# Checking Indiviudal CSV's
# mouse_metadata

In [3]:
# Checking Indiviudal CSV's
# study_results

In [4]:
# Combine the data into a single dataset
mouse_study_df = pd.merge(mouse_metadata, study_results, how='outer', on='Mouse ID')

# Display the data table for preview
mouse_study_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [5]:
# Count Length Of Data Set 
# mouse_study_df.count()
mouse_study_df['Mouse ID'].count()

1893

In [6]:
# Checking unique number of mice 
# Checking the number of mice.
len(mouse_study_df['Mouse ID'].unique())

249

In [7]:
duplicate_mouse_ids = mouse_study_df.loc[mouse_study_df.duplicated(subset=['Mouse ID', 'Timepoint']),'Mouse ID'].unique()
duplicate_mouse_ids

array(['g989'], dtype=object)

In [8]:
duplicate_mouse = mouse_study_df.loc[mouse_study_df["Mouse ID"] == "g989"]
duplicate_mouse

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [9]:
main_mouse_study_df = mouse_study_df[mouse_study_df['Mouse ID'].isin(duplicate_mouse_ids)==False]
# main_mouse_study_df.head()

In [10]:
len(main_mouse_study_df['Mouse ID'].unique())

248

## Summary Statistics

In [11]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
generate_summary = main_mouse_study_df[['Drug Regimen','Tumor Volume (mm3)']]

In [12]:
len(generate_summary['Drug Regimen'].unique())

10

In [13]:
groupby_regimen = generate_summary.groupby(['Drug Regimen'])
# groupby_regimen.head()

In [14]:
mean_summary = groupby_regimen['Tumor Volume (mm3)'].mean()
median_summary = groupby_regimen['Tumor Volume (mm3)'].median()
variance_summary = groupby_regimen['Tumor Volume (mm3)'].var()
std_summary = groupby_regimen['Tumor Volume (mm3)'].std()
sem_summary = groupby_regimen['Tumor Volume (mm3)'].sem()

In [15]:
summary_statistics_table = pd.DataFrame({"Mean":mean_summary,
                                          "Median":median_summary,
                                          "Variance":variance_summary,
                                          "Standard Deviation":std_summary,
                                          "SEM":std_summary})
summary_statistics_table

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,4.994774
Ceftamin,52.591172,51.776157,39.290177,6.268188,6.268188
Infubinol,52.884795,51.820584,43.128684,6.567243,6.567243
Ketapril,55.235638,53.698743,68.553577,8.279709,8.279709
Naftisol,54.331565,52.509285,66.173479,8.134708,8.134708
Placebo,54.033581,52.288934,61.168083,7.821003,7.821003
Propriva,52.32093,50.446266,43.852013,6.622085,6.622085
Ramicane,40.216745,40.673236,23.486704,4.846308,4.846308
Stelasyn,54.233149,52.431737,59.450562,7.710419,7.710419
Zoniferol,53.236507,51.818479,48.533355,6.966589,6.966589


In [16]:
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
#tumor_volume = groupby_regimen['Tumor Volume (mm3)']
#tumor_volume.head()

In [17]:
#mean_summary = np.mean(tumor_volume)
#median_summary = np.median(tumor_volume)
#variance_summary = np.var(tumor_volume,ddof = 0)
#std_summary = np.std(tumor_volume,ddof = 0)

In [18]:
#sample_num = 100
#samples = [tumor_volume.sample(sample_num) for x in range(0,1)]

In [19]:
#sems = [sem(s) for s in samples]
#sems

In [20]:
#generate_summary["Mean"] = mean_summary
#generate_summary.head(50)

In [21]:
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
#creating_statistics_table = pd.DataFrame({
                                          #"Mean":mean_summary,
                                          #"Median":median_summary,
                                          #"Variance":variance_summary,
                                          #"Standard Deviation:":std_summary, 
                                          #"SEM":sems})
#creating_statistics_table.head()
# Assemble the resulting series into a single summary dataframe.



In [22]:
groupby_regimen_df = generate_summary.groupby(['Drug Regimen'])
# groupby_regimen_df.head()

groupby_regimen_df.head()

Unnamed: 0,Drug Regimen,Tumor Volume (mm3)
0,Ramicane,45.0
1,Ramicane,38.825898
2,Ramicane,35.014271
3,Ramicane,34.223992
4,Ramicane,32.997729
10,Capomulin,45.0
11,Capomulin,43.878496
12,Capomulin,37.614948
13,Capomulin,38.177232
14,Capomulin,36.866876


In [23]:
groupby_regimen_df['Drug Regimen'].value_counts()

Drug Regimen  Drug Regimen
Capomulin     Capomulin       230
Ceftamin      Ceftamin        178
Infubinol     Infubinol       178
Ketapril      Ketapril        188
Naftisol      Naftisol        186
Placebo       Placebo         181
Propriva      Propriva        148
Ramicane      Ramicane        228
Stelasyn      Stelasyn        181
Zoniferol     Zoniferol       182
Name: Drug Regimen, dtype: int64

In [24]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [25]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [26]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [27]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [28]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [29]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [30]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [31]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [32]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [33]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [34]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
