## Observations and Insights 

In [125]:
# Steps:
# Dependencies and Setup
# Study data files
# Read the mouse data and the study results
# Combine the data into a single dataset
# Display the data table for preview
# Checking the number of mice.
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# Optional: Get all the data for the duplicate mouse ID.
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Checking the number of mice in the clean DataFrame.

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

mouse_metadata = pd.read_csv(mouse_metadata_path)
mouse_metadata = mouse_metadata.rename(columns={
    "Mouse ID": "ID",
    "Age_months": "Age"
})
# age in months
study_results = pd.read_csv(study_results_path)
study_results = study_results.rename(columns={
    "Mouse ID": "ID",
    "Age_months": "Age"
})

results_and_metadata = pd.merge(mouse_metadata,
                                study_results,
                                how="outer",
                                on="ID")
results_and_metadata.head(2)

Unnamed: 0,ID,Drug Regimen,Sex,Age,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0


In [126]:
print(f'\n{mouse_metadata.info()}')
print(f'\n{study_results.info()}')
print(f'Unique IDs: \n{pd.unique(mouse_metadata["ID"])}')
print(f'\n Total unique mice IDs: {len(mouse_metadata["ID"])}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            249 non-null    object
 1   Drug Regimen  249 non-null    object
 2   Sex           249 non-null    object
 3   Age           249 non-null    int64 
 4   Weight (g)    249 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 9.9+ KB

None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1893 entries, 0 to 1892
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  1893 non-null   object 
 1   Timepoint           1893 non-null   int64  
 2   Tumor Volume (mm3)  1893 non-null   float64
 3   Metastatic Sites    1893 non-null   int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 59.3+ KB

None
Unique IDs: 
['k403' 's185' 'x401' 'm601' 'g791' 's508' 'f966' 'm546' 'z578' 

In [127]:
series_duplicated_rows = results_and_metadata.duplicated(
    subset=["ID", "Timepoint"], keep='first')
series_duplicated_rows2 = results_and_metadata.duplicated(
    subset=["ID", "Timepoint"], keep=False)
results_and_metadata["Duplicated"] = series_duplicated_rows
results_and_metadata["Duplicated2"] = series_duplicated_rows2
duplicated = results_and_metadata.loc[results_and_metadata["Duplicated"] ==
                                      True, :]
duplicated2 = results_and_metadata.loc[results_and_metadata["Duplicated2"] ==
                                      True, :]
#removed duplicates
no_duplicates = results_and_metadata.loc[results_and_metadata["Duplicated"] ==
                                         False, :]
print(f'Removed duplicates for: \n\n{duplicated2[["ID","Timepoint"]]}')
print(f'\nClean dataframe contains: \n\n{len(no_duplicates["ID"])}')

Removed duplicates for: 

       ID  Timepoint
908  g989          0
909  g989          0
910  g989          5
911  g989          5
912  g989         10
913  g989         10
914  g989         15
915  g989         15
916  g989         20
917  g989         20

Clean dataframe contains: 

1888


## Summary Statistics

In [132]:
no_duplicates.columns

Index(['ID', 'Drug Regimen', 'Sex', 'Age', 'Weight (g)', 'Timepoint',
       'Tumor Volume (mm3)', 'Metastatic Sites', 'Duplicated', 'Duplicated2'],
      dtype='object')

In [176]:
# Generate a summary statistics table of mean, median, variance,
# standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate
# the following properties of each drug regimen:
# mean, median, variance, standard deviation, and SEM of the tumor volume.

# Assemble the resulting series into a single summary dataframe.
# Generate a summary statistics table of mean, median, variance,
# standard deviation, and SEM of the tumor volume for each regimen
# Using the aggregation method, produce the same
# summary statistics in a single line

In [175]:
summary = no_duplicates.groupby('Drug Regimen')
summary_mean = summary["Tumor Volume (mm3)"].mean()
summary_median = summary["Tumor Volume (mm3)"].median()
summary_var = summary["Tumor Volume (mm3)"].var()
summary_std = summary["Tumor Volume (mm3)"].std()
summary_sem = summary["Tumor Volume (mm3)"].sem()
summary_list = pd.DataFrame()
summary_list["Average"] = summary_mean
summary_list["Median"] = summary_median
summary_list["Std"] = summary_std
summary_list["Variance"] = summary_var
summary_list["SEM"] = summary_sem
print("    Tummor Volume (mm3) Summary Statistics:")
summary_list

    Tummor Volume (mm3) Summary Statistics:


Unnamed: 0_level_0,Average,Median,Std,Variance,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,4.994774,24.947764,0.329346
Ceftamin,52.591172,51.776157,6.268188,39.290177,0.469821
Infubinol,52.884795,51.820584,6.567243,43.128684,0.492236
Ketapril,55.235638,53.698743,8.279709,68.553577,0.60386
Naftisol,54.331565,52.509285,8.134708,66.173479,0.596466
Placebo,54.033581,52.288934,7.821003,61.168083,0.581331
Propriva,52.393463,50.909965,6.568014,43.138803,0.525862
Ramicane,40.216745,40.673236,4.846308,23.486704,0.320955
Stelasyn,54.233149,52.431737,7.710419,59.450562,0.573111
Zoniferol,53.236507,51.818479,6.966589,48.533355,0.516398


In [182]:
summary[[
    #     'Weight (g)',
    #     'Timepoint',
    'Tumor Volume (mm3)',
    #     'Metastatic Sites',
]].aggregate(func=['mean', 'median', 'std', 'var', 'sem'])

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,std,var,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,4.994774,24.947764,0.329346
Ceftamin,52.591172,51.776157,6.268188,39.290177,0.469821
Infubinol,52.884795,51.820584,6.567243,43.128684,0.492236
Ketapril,55.235638,53.698743,8.279709,68.553577,0.60386
Naftisol,54.331565,52.509285,8.134708,66.173479,0.596466
Placebo,54.033581,52.288934,7.821003,61.168083,0.581331
Propriva,52.393463,50.909965,6.568014,43.138803,0.525862
Ramicane,40.216745,40.673236,4.846308,23.486704,0.320955
Stelasyn,54.233149,52.431737,7.710419,59.450562,0.573111
Zoniferol,53.236507,51.818479,6.966589,48.533355,0.516398


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.



In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
