## Observations and Insights 

In [105]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
from scipy.stats import sem
import os
import random
import numpy as np

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = os.path.join('data', 'Mouse_metadata.csv')
study_results_path = os.path.join('data', 'Study_results.csv')

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset by attaching each mouse's metadata to its
# study measurements. `validate` makes the merge fail loudly if a Mouse ID is repeated
# in the metadata, which would otherwise silently multiply the measurement rows.
metadata_studyresults = pd.merge(
    mouse_metadata,
    study_results,
    how='outer',
    on='Mouse ID',
    validate='one_to_many',
)

# Report the shape of each table so the effect of the merge is visible
print(f"mouse_metadata Data Shape: {mouse_metadata.shape}")
print(f"study_results Data Shape: {study_results.shape}")
print(f"metadata_studyresults Data shape {metadata_studyresults.shape}")

# Display the mouse metadata table for preview (last expression of the cell)
mouse_metadata




mouse_metadata Data Shape: (249, 5)
study_results Data Shape: (1893, 4)
metadata_studyresults Data shape (1893, 8)


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16
...,...,...,...,...,...
244,z314,Stelasyn,Female,21,28
245,z435,Propriva,Female,12,26
246,z581,Infubinol,Female,24,25
247,z795,Naftisol,Female,13,29


In [31]:
# Preview the study results table as loaded from the CSV
study_results

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.000000,0
1,f932,0,45.000000,0
2,g107,0,45.000000,0
3,a457,0,45.000000,0
4,c819,0,45.000000,0
...,...,...,...,...
1888,r944,45,41.581521,2
1889,u364,45,31.023923,3
1890,p438,45,61.433892,1
1891,x773,45,58.634971,4


In [32]:
# Inspect the merged table: its column labels, the per-column non-null counts,
# and finally the table itself (last expression of the cell).
merged_columns = metadata_studyresults.columns
merged_non_null_counts = metadata_studyresults.count()
print(f"metadata_studyresults Data index {merged_columns}")
print(f"metadata_studyresults Data count {merged_non_null_counts}")
metadata_studyresults

metadata_studyresults Data index Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)',
       'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites'],
      dtype='object')
metadata_studyresults Data count Mouse ID              1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [5]:
# Checking the number of mice: count the distinct Mouse ID values in the merged table.
# (Alternative view, rows per mouse:)
# metadata_studyresults['Mouse ID'].value_counts()
metadata_studyresults['Mouse ID'].nunique()

249

In [53]:
# List the distinct timepoints at which measurements were recorded.
metadata_studyresults['Timepoint'].unique()


array([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45], dtype=int64)

In [60]:
# Find the Mouse IDs that have more than one row for the same (Mouse ID, Timepoint) pair.
# `duplicated` flags each repeat after the first occurrence, which is enough to
# identify the affected mice.
repeated_measurement = metadata_studyresults.duplicated(subset=['Mouse ID', 'Timepoint'])
subset_mouseID = metadata_studyresults.loc[repeated_measurement, 'Mouse ID'].unique()

subset_mouseID

array(['g989'], dtype=object)

In [34]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Uses the IDs found by the duplicate check (`subset_mouseID`) instead of a hard-coded
# ID, so the cell stays correct if the input data changes.
DupMice = metadata_studyresults.loc[metadata_studyresults['Mouse ID'].isin(subset_mouseID)]
DupMice

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [40]:
# Create a clean DataFrame by removing every row that belongs to the duplicated mouse.
is_dup_mouse = metadata_studyresults['Mouse ID'] == 'g989'
dup_Mice = metadata_studyresults.loc[is_dup_mouse].index
# Dropped in place, so `metadata_studyresults` itself is the cleaned table from here on.
metadata_studyresults.drop(index=dup_Mice, inplace=True)


In [49]:
# Checking the number of mice in the clean DataFrame (distinct Mouse ID values).
metadata_studyresults['Mouse ID'].nunique()

248

## Summary Statistics

In [107]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward, creating multiple series and putting them all together at the end.
# Get a list of regimen
Regimen_List = metadata_studyresults['Drug Regimen'].unique()

# Tumor volume measurements per regimen, and one {regimen: statistic} dict per statistic
Regimen_Dict = {}
TumorVol_Mean_Dict = {}
TumorVol_Median_Dict = {}
TumorVol_Mode_Dict = {}
TumorVol_Var_Dict = {}
TumorVol_Std_Dict = {}
TumorVol_SEM_Dict = {}

for i in Regimen_List:
    Regimen_Dict[i] = metadata_studyresults.loc[metadata_studyresults['Drug Regimen']==i, 'Tumor Volume (mm3)']
for i in Regimen_Dict:
    TumorVol_Mean_Dict[i] = np.mean(Regimen_Dict[i])
    TumorVol_Median_Dict[i] = np.median(Regimen_Dict[i])
    # Mode is kept for reference only; sts.mode returns a result object rather than a
    # plain number, so it is left out of the summary table below.
    TumorVol_Mode_Dict[i] = sts.mode(Regimen_Dict[i])
    # NOTE: np.var / np.std are called with NumPy's default ddof=0, whereas
    # scipy.stats.sem defaults to ddof=1.
    TumorVol_Var_Dict[i] = np.var(Regimen_Dict[i])
    TumorVol_Std_Dict[i] = np.std(Regimen_Dict[i])
    # Standard error of the mean of this regimen's tumor volumes.
    # (Previously this stored a list of references to the `sem` function itself.)
    TumorVol_SEM_Dict[i] = sem(Regimen_Dict[i])

# Assemble the per-statistic dicts into one table: one row per regimen, one column per statistic
summary_stats_dict = {
    "MEAN": TumorVol_Mean_Dict,
    "MEDIAN": TumorVol_Median_Dict,
    "VARIANCE": TumorVol_Var_Dict,
    "STANDARD DEVIATION": TumorVol_Std_Dict,
    "SEM": TumorVol_SEM_Dict,
}

summary_stats_dict_df = pd.DataFrame(summary_stats_dict)
summary_stats_dict_df


In [81]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [109]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.
# TODO: the plot itself is not produced yet; so far this cell only collects the
# Mouse ID entries recorded under each regimen and displays them.
Regimen_List = metadata_studyresults['Drug Regimen'].unique()
regimen_column = metadata_studyresults['Drug Regimen']
Regimen_MouseID_Dict = {
    regimen: metadata_studyresults.loc[regimen_column == regimen, 'Mouse ID']
    for regimen in Regimen_List
}

Regimen_MouseID_Dict

# #-----------------------------------------------------------------------
# Scratch bar-chart template; its names (x_axis, members, gyms) and labels have not
# been adapted to this dataset yet:
# plt.bar(x=x_axis, height=members, color='b', alpha=0.65, align='center')
# plt.xticks(x_axis,gyms)
# plt.title("NYC Gyms")
# plt.xlabel("Gym Name")
# plt.ylabel("No. of Members")


{'Ramicane': 0      k403
 1      k403
 2      k403
 3      k403
 4      k403
        ... 
 449    w678
 450    y449
 451    y449
 452    y449
 453    y449
 Name: Mouse ID, Length: 228, dtype: object, 'Capomulin': 10      s185
 11      s185
 12      s185
 13      s185
 14      s185
         ... 
 440     i557
 1452    r157
 1453    r157
 1454    r157
 1455    r157
 Name: Mouse ID, Length: 230, dtype: object, 'Infubinol': 454     a203
 455     a203
 456     a203
 457     a203
 458     a203
         ... 
 1868    z581
 1869    z581
 1870    z581
 1871    z581
 1872    z581
 Name: Mouse ID, Length: 178, dtype: object, 'Placebo': 474     a262
 475     a262
 476     a262
 477     a262
 478     a262
         ... 
 1829    y478
 1830    y478
 1831    y478
 1832    y478
 1833    y478
 Name: Mouse ID, Length: 181, dtype: object, 'Ceftamin': 484     a275
 485     a275
 486     a275
 487     a275
 488     a275
         ... 
 1851    y865
 1852    y865
 1853    y865
 1854    y865
 1855    y865
 Nam

In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
