## Observations and Insights 

In [105]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
from scipy.stats import sem
import os
import random
import numpy as np

# Study data files
mouse_metadata_path = os.path.join('data','Mouse_metadata.csv')
study_results_path = os.path.join('data','Study_results.csv')

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# View the mouse data and the study results

mouse_metadata.columns
study_results.columns

# study_results

# Combine the data into a single dataset
metadata_studyresults = pd.merge(mouse_metadata, study_results, how='outer', on='Mouse ID')
# Display the data table for preview
print(f"mouse_metadata Data Shape: {mouse_metadata.shape}")
print(f"study_results Data Shape: {study_results.shape}")
print(f"metadata_studyresults Data shape {metadata_studyresults.shape}")
mouse_metadata




mouse_metadata Data Shape: (249, 5)
study_results Data Shape: (1893, 4)
metadata_studyresults Data shape (1893, 8)


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16
...,...,...,...,...,...
244,z314,Stelasyn,Female,21,28
245,z435,Propriva,Female,12,26
246,z581,Infubinol,Female,24,25
247,z795,Naftisol,Female,13,29


In [31]:
study_results

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.000000,0
1,f932,0,45.000000,0
2,g107,0,45.000000,0
3,a457,0,45.000000,0
4,c819,0,45.000000,0
...,...,...,...,...
1888,r944,45,41.581521,2
1889,u364,45,31.023923,3
1890,p438,45,61.433892,1
1891,x773,45,58.634971,4


In [32]:
print(f"metadata_studyresults Data index {metadata_studyresults.columns}")
print(f"metadata_studyresults Data count {metadata_studyresults.count()}")
metadata_studyresults

metadata_studyresults Data index Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)',
       'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites'],
      dtype='object')
metadata_studyresults Data count Mouse ID              1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [5]:
# Checking the number of mice.
# metadata_studyresults['Mouse ID'].value_counts()
metadata_studyresults['Mouse ID'].nunique()

249

In [53]:
# Checking the number of mice.
metadata_studyresults.count()
metadata_studyresults['Timepoint'].unique()


array([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45], dtype=int64)

In [60]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
subset_mouseID = metadata_studyresults.loc[metadata_studyresults.duplicated(subset = ['Mouse ID', 'Timepoint']), 'Mouse ID'].unique()

subset_mouseID

array(['g989'], dtype=object)

In [34]:
# Optional: Get all the data for the duplicate mouse ID. 
DupMice = metadata_studyresults.loc[metadata_studyresults['Mouse ID'] == 'g989']
DupMice

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [40]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
dup_Mice = metadata_studyresults[metadata_studyresults['Mouse ID'] == 'g989'].index
metadata_studyresults.drop(dup_Mice, inplace = True)


In [49]:
# Checking the number of mice in the clean DataFrame.
metadata_studyresults['Mouse ID'].nunique()

248

## Summary Statistics

In [141]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.
# Get a list of regimen
Regimen_List = metadata_studyresults['Drug Regimen'].unique()
Regimen_List
Regimen_Dict = {}
TumorVol_Mean_Dict = {}
TumorVol_Median_Dict = {}
TumorVol_Mode_Dict = {}
TumorVol_Var_Dict = {}
TumorVol_Std_Dict = {}
TumorVol_SEM_Dict = {}

for i in Regimen_List:
    Regimen_Dict[i] = metadata_studyresults.loc[metadata_studyresults['Drug Regimen']==i, 'Tumor Volume (mm3)']
for i in Regimen_Dict:
    TumorVol_Mean_Dict[i] = np.mean(Regimen_Dict[i])
    TumorVol_Median_Dict[i] = np.median(Regimen_Dict[i])
    TumorVol_Mode_Dict[i] = sts.mode(Regimen_Dict[i])
    TumorVol_Var_Dict[i] = np.var(Regimen_Dict[i])
    TumorVol_Std_Dict[i] = np.std(Regimen_Dict[i])
    TumorVol_SEM_Dict[i] = sem(Regimen_Dict[i])
    
    
summary_stats_dict= {"MEAN": TumorVol_Mean_Dict, "MEDIAN": TumorVol_Median_Dict, "VARIANCE": TumorVol_Var_Dict, 
                     "STANDARD DEVIATION": TumorVol_Std_Dict, "SEM": TumorVol_SEM_Dict, "MODE": TumorVol_Mode_Dict}

summary_stats_dict_df = pd.DataFrame(summary_stats_dict)
summary_stats_dict_df


Unnamed: 0,MEAN,MEDIAN,VARIANCE,STANDARD DEVIATION,SEM,MODE
Ramicane,40.216745,40.673236,23.383692,4.835669,0.320955,"([45.0], [25])"
Capomulin,40.675741,41.557809,24.839296,4.983904,0.329346,"([45.0], [25])"
Infubinol,52.884795,51.820584,42.886388,6.54877,0.492236,"([45.0], [25])"
Placebo,54.033581,52.288934,60.830138,7.799368,0.581331,"([45.0], [25])"
Ceftamin,52.591172,51.776157,39.069446,6.250556,0.469821,"([45.0], [25])"
Stelasyn,54.233149,52.431737,59.122106,7.68909,0.573111,"([45.0], [24])"
Zoniferol,53.236507,51.818479,48.266689,6.947423,0.516398,"([45.0], [25])"
Ketapril,55.235638,53.698743,68.18893,8.257659,0.60386,"([45.0], [25])"
Propriva,52.322552,50.854632,42.08802,6.487528,0.512884,"([45.0], [26])"
Naftisol,54.331565,52.509285,65.817708,8.112811,0.596466,"([45.0], [25])"


In [81]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [156]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.

# Create x-axis
Regimen_List = metadata_studyresults['Drug Regimen'].unique()
regimenNo = np.array(range(len(Regimen_List)))

# Create y-axis
Regimen_MouseID_Dict = {}
totalNoMice = []

for drug in Regimen_List:
    Regimen_MouseID_Dict[drug] = metadata_studyresults.loc[metadata_studyresults['Drug Regimen']==drug, 'Mouse ID']
    totalNoMice.append(Regimen_MouseID_Dict[drug].unique())


In [170]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.

# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.

# Create x-axis
Regimen_List = metadata_studyresults['Drug Regimen'].unique()
regimenNo = np.array(range(len(Regimen_List)))

# Create y-axis
Regimen_MouseID_Dict = {}
Mice = []

for drug in Regimen_List:
    Regimen_MouseID_Dict[drug] = metadata_studyresults.loc[metadata_studyresults['Drug Regimen']==drug, 'Mouse ID']
    Mice.append(Regimen_MouseID_Dict[drug].unique())

totalNoMice = np.array(range(len(Mice)))
print(Mice)

# -----------------------------------------------------------------------
# # Create Plot
# plt.bar(x=Regimen_List, height=totalNoMice, color='b', alpha=0.65, align='center')
# plt.xticks(regimenNo, Regimen_List,rotation = 60)
# plt.title("Total no. of mice per Drug Regimens")
# plt.xlabel("Drug Regimens")
# plt.ylabel("No. of Mice")

[array(['k403', 'g791', 's508', 'm546', 'z578', 'j913', 'n364', 'c758',
       'a644', 'i177', 'j989', 'a520', 'r811', 'i334', 'q610', 'd251',
       'c458', 'a411', 'e662', 'u196', 'q597', 'a444', 'r921', 'w678',
       'y449'], dtype=object), array(['s185', 'x401', 'm601', 'f966', 'u364', 'y793', 'r554', 'm957',
       't565', 'i738', 'w914', 'g288', 'l897', 'b742', 'b128', 'j246',
       'j119', 'w150', 'v923', 'g316', 's710', 'l509', 'r944', 'i557',
       'r157'], dtype=object), array(['a203', 'a251', 'a577', 'a685', 'c139', 'c326', 'c895', 'e476',
       'f345', 'i386', 'k483', 'k804', 'm756', 'n671', 'o809', 'o813',
       'q132', 's121', 'v339', 'v719', 'v766', 'w193', 'w584', 'y163',
       'z581'], dtype=object), array(['a262', 'a897', 'c282', 'c757', 'c766', 'e227', 'i477', 'i669',
       'j235', 'l872', 'n763', 'o302', 'o795', 'q582', 'q787', 'r850',
       's152', 's166', 't718', 't994', 'v409', 'v989', 'w167', 'x773',
       'y478'], dtype=object), array(['a275', 'b447', 

In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
