## Observations and Insights 

In [25]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import functools
# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
merge_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")
merge_df
# Display the data table for preview
merge_df.dtypes

Mouse ID               object
Drug Regimen           object
Sex                    object
Age_months              int64
Weight (g)              int64
Timepoint               int64
Tumor Volume (mm3)    float64
Metastatic Sites        int64
dtype: object

In [26]:
# Checking the number of mice.
mcount = merge_df['Mouse ID'].nunique()
mcount


249

In [27]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mouse_df = merge_df.loc[merge_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"].unique()
duplicate_mouse_df


array(['g989'], dtype=object)

In [28]:
# Optional: Get all the data for the duplicate mouse ID. 
all = merge_df[merge_df['Mouse ID'] == 'g989']
all


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [29]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = merge_df.drop(labels=[908,909,910,911,912,913,914,915,916,917,918,919,920])

clean_df



Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [30]:
#review data to make sure no null values
clean_df.count()

Mouse ID              1880
Drug Regimen          1880
Sex                   1880
Age_months            1880
Weight (g)            1880
Timepoint             1880
Tumor Volume (mm3)    1880
Metastatic Sites      1880
dtype: int64

In [31]:
# Checking the number of mice in the clean DataFrame.
mcount2 = clean_df['Mouse ID'].nunique()
mcount2

248

In [32]:
unique_drugs = clean_df['Drug Regimen'].unique()
unique_drugs

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [33]:
#extract drug regimen and tumor volume from dataframe
short_df = clean_df[['Drug Regimen','Tumor Volume (mm3)' ]]
short_df

Unnamed: 0,Drug Regimen,Tumor Volume (mm3)
0,Ramicane,45.000000
1,Ramicane,38.825898
2,Ramicane,35.014271
3,Ramicane,34.223992
4,Ramicane,32.997729
...,...,...
1888,Naftisol,63.145652
1889,Naftisol,65.841013
1890,Naftisol,69.176246
1891,Naftisol,70.314904


In [34]:
# box_ramicane_df = clean_df.boxplot(['Tumor Volume (mm3)'])
# box_ramicane_df

In [40]:
drugs_short_df = short_df.groupby(['Drug Regimen'])
round(drugs_short_df.mean(),3)

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,40.676
Ceftamin,52.591
Infubinol,52.885
Ketapril,55.236
Naftisol,54.332
Placebo,54.034
Propriva,52.321
Ramicane,40.217
Stelasyn,54.233
Zoniferol,53.237


In [47]:
tumor_mean = round(drugs_short_df.mean(),3)
tumor_median = round(drugs_short_df.median(),3)
tumor_var = round(drugs_short_df.var(),3)
tumor_std = round(drugs_short_df.std(),3)
tumor_sem = round(drugs_short_df.sem(),3)

In [48]:
tumor_volume = [tumor_mean,tumor_median,tumor_var,tumor_std,tumor_sem]
tumor_volume_stats = functools.reduce(lambda left, right: pd.merge(left, right, on='Drug Regimen'),tumor_volume)
tumor_volume_stats

Unnamed: 0_level_0,Tumor Volume (mm3)_x,Tumor Volume (mm3)_y,Tumor Volume (mm3)_x,Tumor Volume (mm3)_y,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.676,41.558,24.948,4.995,0.329
Ceftamin,52.591,51.776,39.29,6.268,0.47
Infubinol,52.885,51.821,43.129,6.567,0.492
Ketapril,55.236,53.699,68.554,8.28,0.604
Naftisol,54.332,52.509,66.173,8.135,0.596
Placebo,54.034,52.289,61.168,7.821,0.581
Propriva,52.321,50.446,43.852,6.622,0.544
Ramicane,40.217,40.673,23.487,4.846,0.321
Stelasyn,54.233,52.432,59.451,7.71,0.573
Zoniferol,53.237,51.818,48.533,6.967,0.516


In [38]:
tumor_mean = drugs_short_df.mean()
tumor_median = drugs_short_df.median()
tumor_var = drugs_short_df.var()
tumor_std = drugs_short_df.std()
tumor_sem = drugs_short_df.sem()
# # drugs_short_df(tumor_mean,tumor_median,tumor_var,tumor_std,tumor_sem,left_on='key',right_index=True )

# combine_drugs_df = pd.DataFrame(combine=tumor_mean,tumor_median,tumor_var,tumor_std,tumor_sem)
# combine_drugs_df
#tumor_median
merge_df = pd.merge(tumor_median, tumor_mean, tumor_var, tumor_std, tumor_sem, on='Drug Regimen')
merge_df

TypeError: merge() got multiple values for argument 'on'

In [16]:
# raw_df = pd.merge([tumor_mean, tumor_median])
    
# }
# raw_df


In [22]:
#place dataframes side by side
horizontal_stack = pd.concat([drugs_short_df.mean])

TypeError: cannot concatenate object of type '<class 'method'>'; only Series and DataFrame objs are valid

## Summary Statistics

In [56]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.



TypeError: unhashable type: 'list'

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [10]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
