## Observations and Insights 

In [96]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
%matplotlib notebook

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combined_df = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="left")

# Display the data table for preview
combined_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [97]:
# Checking the number of mice.
count_mice = combined_df['Mouse ID'].nunique()
count_mice

249

In [98]:
# check number of data records (includes header)
count_records = combined_df['Mouse ID'].count()
count_records

1893

In [99]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

# Select all duplicate rows based on multiple column names in list
duplicate_entry = combined_df[combined_df.duplicated(['Mouse ID', 'Timepoint'])]
print("Duplicate Rows based on Mouse ID and Timepint are:", duplicate_entry, sep='\n')

Duplicate Rows based on Mouse ID and Timepint are:
    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
909     g989     Propriva  Female          21          26          0   
911     g989     Propriva  Female          21          26          5   
913     g989     Propriva  Female          21          26         10   
915     g989     Propriva  Female          21          26         15   
917     g989     Propriva  Female          21          26         20   

     Tumor Volume (mm3)  Metastatic Sites  
909           45.000000                 0  
911           47.570392                 0  
913           49.880528                 0  
915           53.442020                 0  
917           54.657650                 1  


In [100]:
# Optional: Get all the data for the duplicate mouse ID. 

In [101]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = combined_df.drop_duplicates(subset=['Mouse ID', 'Timepoint'])
clean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [102]:
# reset index and remove old index column
clean_df = clean_df.reset_index(drop = True)
clean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1883,z969,Naftisol,Male,9,30,25,63.145652,2
1884,z969,Naftisol,Male,9,30,30,65.841013,3
1885,z969,Naftisol,Male,9,30,35,69.176246,4
1886,z969,Naftisol,Male,9,30,40,70.314904,4


In [103]:
# Checking the number of mice in the clean DataFrame.
count_mice = clean_df['Mouse ID'].nunique()
count_mice

249

In [104]:
# check number of data records (includes header)
count_records = clean_df['Mouse ID'].count()
count_records

1888

In [105]:
# get number and names of drugs in study
drugs_unique_num = clean_df['Drug Regimen'].nunique()
drugs_unique = clean_df['Drug Regimen'].unique()
print(drugs_unique_num)
print(drugs_unique)

10
['Ramicane' 'Capomulin' 'Infubinol' 'Placebo' 'Ceftamin' 'Stelasyn'
 'Zoniferol' 'Ketapril' 'Propriva' 'Naftisol']


## Summary Statistics

In [106]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.

In [107]:
# create a Series the mean tumor volume by Drug Regimen 
mean_vol = [clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].mean()]

# convert the Series into a DataFrame
mean_vol_df = pd.DataFrame(mean_vol)
mean_vol_df = mean_vol_df.transpose()
mean_vol_df

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,40.675741
Ceftamin,52.591172
Infubinol,52.884795
Ketapril,55.235638
Naftisol,54.331565
Placebo,54.033581
Propriva,52.393463
Ramicane,40.216745
Stelasyn,54.233149
Zoniferol,53.236507


In [108]:
# calculate the median tumor volume by Drug Regimen 
med_vol = [clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].median()]

# convert the Series into a DataFrame
med_vol_df = pd.DataFrame(med_vol)
med_vol_df = med_vol_df.transpose()
med_vol_df

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,41.557809
Ceftamin,51.776157
Infubinol,51.820584
Ketapril,53.698743
Naftisol,52.509285
Placebo,52.288934
Propriva,50.909965
Ramicane,40.673236
Stelasyn,52.431737
Zoniferol,51.818479


In [109]:
# calculate the tumor volume variance by Drug Regimen 
var_vol = [clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].var()]

# convert the Series into a DataFrame
var_vol_df = pd.DataFrame(var_vol)
var_vol_df = var_vol_df.transpose()
var_vol_df

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,24.947764
Ceftamin,39.290177
Infubinol,43.128684
Ketapril,68.553577
Naftisol,66.173479
Placebo,61.168083
Propriva,43.138803
Ramicane,23.486704
Stelasyn,59.450562
Zoniferol,48.533355


In [110]:
# calculate the tumor volume standard deviation by Drug Regimen 
std_dev_vol = [clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].std()]

# convert the Series into a DataFrame
std_dev_vol_df = pd.DataFrame(std_dev_vol)
std_dev_vol_df =std_dev_vol_df.transpose()
std_dev_vol_df

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,4.994774
Ceftamin,6.268188
Infubinol,6.567243
Ketapril,8.279709
Naftisol,8.134708
Placebo,7.821003
Propriva,6.568014
Ramicane,4.846308
Stelasyn,7.710419
Zoniferol,6.966589


In [111]:
# calculate the tumor volume SEM by Drug Regimen 
sem_vol = [clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].sem()]

# convert the Series into a DataFrame
sem_vol_df = pd.DataFrame(sem_vol)
sem_vol_df = sem_vol_df.transpose()
sem_vol_df

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,0.329346
Ceftamin,0.469821
Infubinol,0.492236
Ketapril,0.60386
Naftisol,0.596466
Placebo,0.581331
Propriva,0.525862
Ramicane,0.320955
Stelasyn,0.573111
Zoniferol,0.516398


In [112]:
# place all of the data into a summary DataFrame and display
tumor_vol_summary1_df = pd.merge(pd.merge(pd.merge(pd.merge(mean_vol_df,med_vol_df,on='Drug Regimen'),var_vol_df,on='Drug Regimen'),std_dev_vol_df,on='Drug Regimen'),sem_vol_df,on='Drug Regimen')

tumor_vol_summary1_df

# rename columns to "Mean Volume", "Median Volume", "Volume Variance", "Volume Standard Deviation", "SEM"

Unnamed: 0_level_0,Tumor Volume (mm3)_x,Tumor Volume (mm3)_y,Tumor Volume (mm3)_x,Tumor Volume (mm3)_y,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [86]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function

tumor_vol_summary2_df = clean_df.groupby(['Drug Regimen'])(['Tumor Volume (mm3)'].mean(), ['Tumor Volume (mm3)'].med(), ['Tumor Volume (mm3)'].var(), ['Tumor Volume (mm3)'].std(), ['Tumor Volume (mm3)'].sem())
tumor_vol_summary2_df


# It is also possible to group a DataFrame by multiple columns
# This returns an object with multiple indexes, however, which can be harder to deal with
#grouped_international_data = converted_ufo.groupby(['country', 'state'])

#grouped_international_data.count().head(20)


AttributeError: 'list' object has no attribute 'mean'

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 

In [146]:
# extract data required for bar chart

samples_per_drug = clean_df.groupby('Drug Regimen')['Mouse ID'].count()
samples_per_drug

Drug Regimen
Capomulin    230
Ceftamin     178
Infubinol    178
Ketapril     188
Naftisol     186
Placebo      181
Propriva     156
Ramicane     228
Stelasyn     181
Zoniferol    182
Name: Mouse ID, dtype: int64

In [147]:
# create lists for chart

#clean_df.groupby('Drug Regimen')['Mouse ID'].apply(list)

drugs = samples_per_drug.groupby('Drug Regimen').apply(list)
drugs

Drug Regimen
Capomulin    [230]
Ceftamin     [178]
Infubinol    [178]
Ketapril     [188]
Naftisol     [186]
Placebo      [181]
Propriva     [156]
Ramicane     [228]
Stelasyn     [181]
Zoniferol    [182]
Name: Mouse ID, dtype: object

In [153]:
# make a bar chart

x_axis = np.arange(len(samples_per_drug))

plt.bar(x_axis, drugs, color='green', alpha=0.7, align='center')
                                                           
# tell matplotlib where to place each of the x axis headers
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, drugs)

# give the chart a title, x label, and y label
plt.title("Number of Samples Per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Samples")

# plot chart to screen
plt.show()

<IPython.core.display.Javascript object>

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas


In [None]:
# Labels for the sections of our pie chart
labels = ["Humans", "Smurfs", "Hobbits", "Ninjas"]

# The values of each section of the pie chart
sizes = [220, 95, 80, 100]

# The colors of each section of the pie chart
colors = ["red", "orange", "lightcoral", "lightskyblue"]

# Tells matplotlib to seperate the "Humans" section from the others
explode = (0.1, 0, 0, 0)

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
