## Observations and Insights 

In [96]:
!pip install -U scikit-learn
# !python -m pip show scikit-learn  # to see which version and where scikit-learn is installed
# !python -m pip freeze  # to see all packages installed in the active virtualenv
# !python -c "import sklearn; sklearn.show_versions()"

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp38-cp38-win_amd64.whl (6.9 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.1
    Uninstalling scikit-learn-0.24.1:
      Successfully uninstalled scikit-learn-0.24.1
Successfully installed scikit-learn-0.24.2


In [122]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import sem

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
# merged_df = pd.merge(mouse_metadata, study_results, on="Mouse ID", suffixes=("_Bitcoin", "_Dash"))
merged_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Display the data table for preview
merged_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [123]:
# Checking the number of mice.
mice_count = merged_df['Mouse ID'].value_counts().to_frame()
mice_count

Unnamed: 0,Mouse ID
g989,13
m269,10
t198,10
i738,10
r604,10
...,...
x226,1
n482,1
h428,1
l872,1


In [124]:
# Checking the number of mice.
mice_count2 = merged_df['Mouse ID'].nunique()
mice_count2

249

In [125]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicates_bool = merged_df.duplicated(subset=['Mouse ID','Timepoint'])
duplicates = merged_df.loc[duplicates_bool==True]
duplicates

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [126]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicate_info = merged_df[merged_df.duplicated()]
duplicate_info

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0


In [127]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = merged_df.rename(columns={'Mouse ID':'Mouse_ID','Tumor Volume (mm3)':'Tumor_Volume_mm3'})
clean_df = clean_df[clean_df.Mouse_ID != 'g989']

In [128]:
# Checking the number of mice in the clean DataFrame.
mice_count3 = clean_df['Mouse_ID'].nunique()
mice_count3

248

## Summary Statistics

In [187]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and 
# SEM of the tumor volume for each regimen
drugreg_df = clean_df.groupby('Drug Regimen')
drugreg_tv_df = drugreg_df['Tumor_Volume_mm3']
drugreg_tv_mean = drugreg_tv_df.mean()
drugreg_tv_median = drugreg_tv_df.median()
drugreg_tv_var = drugreg_tv_df.var()
drugreg_tv_std = drugreg_tv_df.std()
drugreg_tv_sem = drugreg_tv_df.sample()
test = [{'Capomulin':1,'Ceftamin':2,'Infubinol':3,'Ketapril':4,
        'Naftisol':5,'Placebo':6,'Propriva':7,'Ramicane':8,
        'Stelasyn':9,'Zoniferol':10}]
test_frame = pd.DataFrame(test)

# https://www.geeksforgeeks.org/creating-a-dataframe-from-pandas-series/
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.agg.html
# https://stackoverflow.com/questions/49512034/get-means-and-sem-in-one-df-with-pandas-groupby
# https://stackoverflow.com/questions/52619344/index-match-using-pandas
# https://www.geeksforgeeks.org/different-ways-to-create-pandas-dataframe/

# Create a bunch of samples, each with sample size of 20
nsamples = 20
div = 25
samples = [drugreg_df.sample(div) for x in range(0,nsamples)]

# samples = []
# for x in range(0,nsamples):
#     samples.append(drugreg_df.sample(div))

# tumor_samples = pd.concat(samples)
# tumor_samples

# Calculate means
means = [sample.Tumor_Volume_mm3.mean() for sample in samples]
# Calculate standard error on means
sems = [sem(s.Tumor_Volume_mm3) for s in samples]

# Determine which sample's mean is closest to the population mean
print(f"The smallest SEM observed was {min(sems)}")
samp_index = sems.index(min(sems))
print(f"The sample with the smallest SEM is sample {samp_index+1}")

# new_df = [{
#     'Mean':[drugreg_tv_mean],'Median':[drugreg_tv_median]
# }]

summary_df = pd.DataFrame({'Mean':drugreg_tv_mean,
                           'Median':drugreg_tv_median,
                           'Variance':drugreg_tv_var,
                           'Standard Deviation':drugreg_tv_std                           
                           })

# summary_df = pd.DataFrame({'Mean':drugreg_tv_mean,
#                            'Median':drugreg_tv_median,
#                            'Variance':drugreg_tv_var,
#                            'Standard Deviation':drugreg_tv_std,
#                            'SEM':drugreg_tv_sem
#                            })
merge_df = pd.merge(summary_df,test_frame.T)

# new_index_df = summary_df.reset_index(drop=True)
# new_index_df.head()

# summary_df
# test_frame.T
# drugreg_tv_df.std()
# drugreg_tv_df.sample()

# remove extra columns and recombine to get the summary statistics table

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.


The smallest SEM observed was 0.5096673212901305
The sample with the smallest SEM is sample 16


MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [78]:
tv_mean = clean_df.Tumor_Volume_mm3.mean()
print(tv_mean)
tv_median = clean_df.Tumor_Volume_mm3.median()
tv_variance = clean_df.Tumor_Volume_mm3.var()
tv_std = clean_df.Tumor_Volume_mm3.std()
# mode_scipy = sts.mode(Tumor_Volume_mm3)
print(tv_median)
print(tv_variance)
print(tv_std)

50.435293232255326
48.933453655
79.46290714557298
8.914196943391648


In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and 
# SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.



In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
