## Observations and Insights 

# Executive Summary

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files (relative to the notebook)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame: per-mouse metadata and per-timepoint results
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Preview the first rows of both frames, metadata first
print(mouse_metadata.head(), study_results.head(), sep="\n")


  Mouse ID Drug Regimen     Sex  Age_months  Weight (g)
0     k403     Ramicane    Male          21          16
1     s185    Capomulin  Female           3          17
2     x401    Capomulin  Female          16          15
3     m601    Capomulin    Male          22          17
4     g791     Ramicane    Male          11          16
  Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
0     b128          0                45.0                 0
1     f932          0                45.0                 0
2     g107          0                45.0                 0
3     a457          0                45.0                 0
4     c819          0                45.0                 0


In [16]:
## Clean data
# Exploration notes (checks left commented out to keep the cell output short):
# print(mouse_metadata["Mouse ID"].describe())        # 249 unique mice, each listed once
# print(len(study_results["Timepoint"].unique()))     # shows 10 unique timepoints
# print(study_results["Timepoint"].describe())        # shows 1893 records, i.e. not 2490 (10x249), so some mice dropped out during the study
# print(study_results["Mouse ID"].describe())

## Where a mouse has more than one record for the same timepoint, remove ALL of that mouse's data.
# Overview: number of records per mouse. A count above 10 (the number of timepoints) hints at
# duplicates, but it cannot reveal a mouse that left the study early and still has repeated
# timepoints, so the actual detection below checks the (Mouse ID, Timepoint) pairs directly.
tp_index_df = study_results.set_index("Timepoint")
print(tp_index_df.head())
print(tp_index_df["Mouse ID"].value_counts())

# Flag every row whose (Mouse ID, Timepoint) pair occurs more than once (keep=False marks all
# copies, not just the later ones), then collect the IDs of the affected mice.
duplicate_rows = study_results.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_mouse_ids = study_results.loc[duplicate_rows, "Mouse ID"].unique()

# All records belonging to the affected mice. The repeated timepoints carry different
# measurements (two results recorded where one was required), so there is no way to tell
# which one is correct; the whole mouse is dropped rather than one row of each pair.
dirty_mouse = study_results.loc[study_results["Mouse ID"].isin(duplicate_mouse_ids)]
print(dirty_mouse.shape)

# Remove the affected mice from 1) study_results and 2) mouse_metadata
clean_study_results = study_results.loc[~study_results["Mouse ID"].isin(duplicate_mouse_ids)]
clean_mouse_metadata = mouse_metadata.loc[~mouse_metadata["Mouse ID"].isin(duplicate_mouse_ids)]

# Guard: after cleaning, every (Mouse ID, Timepoint) pair must be unique
assert not clean_study_results.duplicated(subset=["Mouse ID", "Timepoint"]).any()

# Compare shape of cleaned vs original data
print(clean_study_results.shape)
print(study_results.shape)          # difference = number of records removed (13, all from g989)

print(mouse_metadata.shape)
print(clean_mouse_metadata.shape)   # difference = number of mice removed (1)

          Mouse ID  Tumor Volume (mm3)  Metastatic Sites
Timepoint                                               
0             b128                45.0                 0
0             f932                45.0                 0
0             g107                45.0                 0
0             a457                45.0                 0
0             c819                45.0                 0
g989    13
b128    10
q787    10
s337    10
q511    10
        ..
f932     1
b447     1
u153     1
t573     1
o848     1
Name: Mouse ID, Length: 249, dtype: int64
(13, 4)
(1880, 4)
(1893, 4)
(249, 5)
(248, 5)


In [20]:
# Join the cleaned study results with the cleaned mouse metadata on the shared
# "Mouse ID" column, so every measurement row also carries that mouse's metadata.
combined_df = clean_study_results.merge(clean_mouse_metadata, on="Mouse ID")

# Column check used while developing (all columns from both inputs are present):
# print(clean_study_results.columns)
# print(clean_mouse_metadata.columns)
# print(combined_df.columns)

# Preview the merged table
combined_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


In [27]:
# Checking the number of mice.
# "Mouse ID" summaries, in order: merged clean data, raw metadata, raw study results.
print(
    combined_df["Mouse ID"].describe(),
    mouse_metadata["Mouse ID"].describe(),
    study_results["Mouse ID"].describe(),
    sep="\n",
)

# The cleaned, merged data has 248 unique Mouse IDs versus 249 in the raw inputs,
# i.e. exactly one mouse (g989) was dropped.

count     1880
unique     248
top       b128
freq        10
Name: Mouse ID, dtype: object
count      249
unique     249
top       k403
freq         1
Name: Mouse ID, dtype: object
count     1893
unique     249
top       g989
freq        13
Name: Mouse ID, dtype: object


In [29]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# A mouse is a duplicate when the same (Mouse ID, Timepoint) pair appears on more than one
# row of study_results. keep=False flags every copy of a repeated pair, so no occurrence
# is treated as the "original". Checking the pairs directly (rather than looking for mice
# with more than 10 records) also catches mice that left the study early.
duplicate_rows = study_results.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_mouse_ids = study_results.loc[duplicate_rows, "Mouse ID"].unique()

# Show every record of the affected mice; the "Mouse ID" column identifies them and the
# repeated timepoints are visible side by side.
print(study_results.loc[study_results["Mouse ID"].isin(duplicate_mouse_ids)])


     Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
107      g989          0           45.000000                 0
137      g989          0           45.000000                 0
329      g989          5           48.786801                 0
360      g989          5           47.570392                 0
620      g989         10           51.745156                 0
681      g989         10           49.880528                 0
815      g989         15           51.325852                 1
869      g989         15           53.442020                 0
950      g989         20           55.326122                 1
1111     g989         20           54.657650                 1
1195     g989         25           56.045564                 1
1380     g989         30           59.082294                 1
1592     g989         35           62.570880                 2


In [4]:
# Optional: Get all the data for the duplicate mouse ID.

# Every study record for mouse g989. Its repeated timepoints are not exact copies: they hold
# different measurements (two results recorded where one was required), so all of this
# mouse's data is removed instead of de-duplicating row by row.
dirty_mouse = study_results[study_results["Mouse ID"].eq("g989")]
print(dirty_mouse)

In [5]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Already done above: g989 was dropped from both inputs before merging, i.e. combined_df = pd.merge(clean_study_results, clean_mouse_metadata, on="Mouse ID")

In [6]:
# Checking the number of mice in the clean DataFrame.
# The "unique" entry of the summary is the number of distinct mice.
print(combined_df.loc[:, "Mouse ID"].describe())

## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward: create multiple series and put them all together at the end.



In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
