# Pymaceuticals Inc.
---

### Analysis
1. Mice treated with Capomulin or Ramicane had lower average, median, and standard deviation of tumor volumes compared to the other drug regimens. The final tumor volumes at the end of the 45-day testing period for mice treated with Capomulin or Ramicane also had narrower IQRs than those of the other drug regimens.
2. Mouse b128 treated with Capomulin experienced a reduction in tumor volume (mm3) of approximately 10% (or a decrease in tumor volume of 6 mm3) over the 45-day testing period.
3. For mice treated with Capomulin, mouse weight (g) and average tumor volume (mm3) are positively correlated. Based on the OLS regression model, each additional gram of mouse weight (beyond approximately 14 grams) corresponds to an increase in average tumor volume of 0.95 mm3.

### Other Considerations
1. It would be worth assessing on aggregate how much in raw and percentage terms the average tumor volume of mice on each of the treatments decreases over the testing period. 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two raw CSV inputs
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata and the study measurements
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join on "Mouse ID": every study-results row is kept and gains the
# matching mouse's metadata columns
study_df = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Preview the first rows of the combined table
study_df.head()

In [None]:
# Count the distinct Mouse ID values present in the merged data.
len(study_df.loc[:, "Mouse ID"].unique())

In [None]:
# Select every row whose (Mouse ID, Timepoint) pair occurs more than once.
# keep=False flags all members of a repeated pair, not only the later repeats.
dupes = study_df.loc[
    study_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
]
dupes

In [None]:
# Distinct Mouse IDs that appear among the duplicated (Mouse ID, Timepoint) rows.
dupe_list = dupes.loc[:, "Mouse ID"].unique()
dupe_list

In [None]:
# Optional: pull every record (not just the repeated ones) for the flagged Mouse IDs.
dupe_mouse = study_df.loc[study_df["Mouse ID"].isin(dupe_list)]
dupe_mouse

In [None]:
# Build the cleaned DataFrame: exclude ALL records belonging to any Mouse ID
# in dupe_list, rather than dropping only the repeated rows.
study_clean_df = pd.DataFrame(
    study_df.loc[~study_df["Mouse ID"].isin(dupe_list)]
)

In [None]:
# Count the distinct Mouse ID values remaining in the cleaned data.
len(study_clean_df.loc[:, "Mouse ID"].unique())

## Summary Statistics

In [None]:
# Method 1: per-regimen summary statistics of tumor volume, computed one
# statistic at a time from a groupby on 'Drug Regimen'.
# Each result is a Series indexed by drug regimen; they are assembled into a
# single summary DataFrame in a later step.

by_regimen = study_clean_df.groupby(["Drug Regimen"])

# Select the tumor-volume column once and reuse it for every statistic
tumor_by_regimen = by_regimen["Tumor Volume (mm3)"]

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
var = tumor_by_regimen.var()
std = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

In [None]:
# Assemble the five per-regimen Series into one table, pairing each column
# title with its statistic in display order.
stat_titles = (
    "Mean Tumor Volume",
    "Median Tumor Volume",
    "Tumor Volume Variance",
    "Tumor Volume Std. Dev.",
    "Tumor Volume Std. Err.",
)
summary_stats = pd.DataFrame(dict(zip(stat_titles, (mean, median, var, std, sem))))
summary_stats

In [None]:
# Method 2: the same per-regimen summary table (mean, median, variance,
# standard deviation, SEM of tumor volume) produced with a single agg() call.
# The mapping's keys are the aggregations to run (in order) and its values are
# the display titles the resulting columns are renamed to.
agg_titles = {
    "mean": "Mean Tumor Volume",
    "median": "Median Tumor Volume",
    "var": "Tumor Volume Variance",
    "std": "Tumor Volume Std. Dev.",
    "sem": "Tumor Volume Std. Err.",
}
summary_stats_agg = (
    by_regimen["Tumor Volume (mm3)"]
    .agg(list(agg_titles))
    .rename(columns=agg_titles)
)
summary_stats_agg

In [None]:
# Generate a bar plot showing the total number of recorded timepoints
# (measurement rows) for each drug regimen using Pandas.
# The bar heights are counts of non-null 'Timepoint' entries per regimen, i.e.
# observations rather than a head count of mice, so the axis label and title
# describe timepoints instead of "mice tested".
bar_regimen = by_regimen['Timepoint'].count()
bar_regimen = bar_regimen.sort_values(ascending=False)  # tallest bar first
bar_regimen.plot(kind='bar')
plt.ylabel('Number of Observed Timepoints')
plt.title('Pandas: Number of Observed Timepoints per Drug')

In [None]:
# Generate a bar plot showing the total number of recorded timepoints
# (measurement rows) for each drug regimen using pyplot.
# bar_regimen holds per-regimen counts of 'Timepoint' entries, so the axis
# label and title describe timepoints rather than "mice tested".
x_axis = np.arange(1,len(bar_regimen) + 1)  # one bar position per regimen
plt.bar(x_axis,bar_regimen,width=.6)
plt.xticks(x_axis,bar_regimen.index,rotation='vertical')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Observed Timepoints')
plt.title('Matplotlib: Number of Observed Timepoints per Drug')

In [None]:
# Method 1: pie chart, drawn with Pandas, of how the recorded measurements
# split between the sexes (rows are counted per 'Sex' value, largest first).
by_sex = (
    study_clean_df.groupby(["Sex"])["Sex"]
    .count()
    .sort_values(ascending=False)
)
# Wedge labels show the share to one decimal place
by_sex.plot(kind="pie", autopct=lambda pct: f'{pct:.1f}%')
plt.title('Pandas: Percentage of Male vs Female Measurements Taken')

In [None]:
# Method 2: the same sex-distribution pie chart drawn directly with
# matplotlib (pyplot), using the counts already held in by_sex.
def _as_percent(pct):
    # Format a wedge's share to one decimal place
    return f'{pct:.1f}%'


labels = by_sex.index
plt.pie(by_sex, labels=labels, autopct=_as_percent)
plt.ylabel("Sex")
plt.title('Matplotlib: Percentage of Male vs Female Measurements Taken')

## Quartiles, Outliers and Boxplots

In [None]:
# Find each mouse's last recorded timepoint for four treatment regimens of
# interest: Capomulin, Ramicane, Infubinol, and Ceftamin
focus_regimens = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
in_focus = study_clean_df['Drug Regimen'].isin(focus_regimens)
treatments_subset = study_clean_df.loc[in_focus]

# One group per (regimen, mouse); the maximum Timepoint is that mouse's final reading
final_tp = treatments_subset.groupby(['Drug Regimen', 'Mouse ID'])
last_timepoint = final_tp['Timepoint'].max()
final_tp_df = pd.DataFrame(last_timepoint).copy()

In [None]:
# Look up the full study record at each mouse's last timepoint by left-joining
# back onto the cleaned data; this brings in the tumor volume at that timepoint
final_tp_df = final_tp_df.merge(
    study_clean_df,
    on=['Drug Regimen', 'Mouse ID', 'Timepoint'],
    how='left',
)

In [None]:
# Put treatments into an array (drives the comprehension below and is reused
# later for plot labels)
treatments = final_tp_df['Drug Regimen'].unique()

# Build a list of lists: one inner list per treatment, in the same order as
# `treatments`, holding the tumor volume from each row of final_tp_df for
# that treatment
tumor_volume = [
    final_tp_df.loc[final_tp_df['Drug Regimen'] == treatment, 'Tumor Volume (mm3)'].to_list()
    for treatment in treatments
]

In [None]:
# Identify potential outliers per treatment with the 1.5 * IQR rule:
# any value above Q3 + 1.5*IQR or below Q1 - 1.5*IQR is reported.
# treatments and tumor_volume are parallel sequences, so they are walked together.
for treatment, volumes in zip(treatments, tumor_volume):
    quartiles = pd.Series(volumes).quantile([.25, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    upper_bound = upperq + (1.5 * iqr)
    lower_bound = lowerq - (1.5 * iqr)

    for num in volumes:
        if (num > upper_bound) or (num < lower_bound):
            print(f"Treatment {treatment} has a potential outlier at {num} .")

In [None]:
# Draw one box per drug regimen from the final tumor volumes in tumor_volume
fig, ax = plt.subplots()

# Outlier ("flier") points are drawn as large red circles so they stand out
flierprops = {'marker': 'o', 'markerfacecolor': 'r', 'markersize': 10}
ax.boxplot(tumor_volume, flierprops=flierprops)

# Label each box with its regimen name (same order as tumor_volume)
ax.set_xticklabels(treatments)
plt.ylabel("Final Tumor Volume (mm3)")
plt.title("Boxplot: Final Tumor Volume (mm3) by Drug Regimen")

## Line and Scatter Plots

In [None]:
# First step toward a line plot of tumor volume vs. time point for one
# Capomulin mouse: keep only the Capomulin records
is_capomulin = study_clean_df['Drug Regimen'] == 'Capomulin'
capomulin = study_clean_df.loc[is_capomulin]

# List each Capomulin mouse with the number of timepoints recorded for it
capomulin['Mouse ID'].value_counts()

In [None]:
# Keep only mouse 'b128' and order its records chronologically by Timepoint
capomulin_b128 = (
    capomulin.loc[capomulin['Mouse ID'] == 'b128']
    .sort_values(by='Timepoint')
)

In [None]:
# Generate a line chart of tumor volume over time for mouse 'b128'
# being treated with Capomulin.
# The x coordinates are the recorded Timepoint values themselves (not their
# ordinal positions), so horizontal spacing stays proportional to elapsed time
# even if timepoints are unevenly spaced or one is missing.
x_axis_cap = capomulin_b128['Timepoint']
labels_cap = capomulin_b128['Timepoint']
y_axis_cap = capomulin_b128['Tumor Volume (mm3)']
plt.plot(x_axis_cap,y_axis_cap)
# One tick per recorded timepoint, labelled with its value
plt.xticks(x_axis_cap, labels=labels_cap)
plt.title('Capomulin Treatment of Mouse b128')
plt.ylabel('Tumor Volume (mm3)')
plt.xlabel('Timepoint (days)')
# Zero x-margin makes the axis span exactly the first to last plotted timepoint
plt.margins(x=0)

In [None]:
# Scatter plot of per-mouse average tumor volume against per-mouse average
# weight for the Capomulin regimen
cap_by_mouse = capomulin.groupby("Mouse ID")

# One mean per mouse for each measure; both Series are indexed by Mouse ID
avg_tumor_size = cap_by_mouse["Tumor Volume (mm3)"].mean()
avg_weight = cap_by_mouse["Weight (g)"].mean()

plt.scatter(x=avg_weight, y=avg_tumor_size)
plt.xlabel("Average Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Average Weight to Average Tumor Volume of Mice Treated with Capomulin")

## Correlation and Regression

In [None]:
# Pearson correlation between per-mouse average weight and per-mouse average
# tumor volume for the Capomulin regimen; element [0] of the result is the
# coefficient used when the correlation is reported
correlation = st.pearsonr(x=avg_weight, y=avg_tumor_size)

In [None]:
# Fit an ordinary least-squares line: average tumor volume as a function of
# average weight
slope, intercept, rvalue, pvalue, stderr = st.linregress(avg_weight, avg_tumor_size)

# Fitted tumor volume for every observed weight, plus the equation text for the plot
best_fit = avg_weight * slope + intercept
line_annotate = f"y = {round(slope,2)}x + {round(intercept,2)} "

# Redraw the scatter plot and overlay the fitted line and its equation in red
plt.scatter(x=avg_weight, y=avg_tumor_size)
plt.plot(avg_weight, best_fit, color="red")
plt.annotate(line_annotate, (21, 36), color="red", fontsize=12)
plt.xlabel("Average Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Average Weight to Average Tumor Volume of Mice Treated with Capomulin")

# Report the Pearson coefficient computed earlier, rounded to two decimals
print(
    "The correlation between average mouse weight and average tumor volume"
    f" for mice under the Capomulin regimen is {round(correlation[0],2)}. "
)