## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import sem
from scipy.stats import linregress

# Relative locations of the two raw CSV inputs
mouse_metadata_path = "../Resources/Mouse_metadata.csv"
study_results_path = "../Resources/Study_results.csv"

# Load the mouse metadata table and the study results table
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Outer-join the two tables on the shared 'Mouse ID' key so that rows
# present in only one of the inputs are still kept
results_df = mouse_metadata.merge(study_results, how='outer', on='Mouse ID')

# Preview the first rows of the combined table
results_df.head()


In [None]:
# Identifier-friendly column labels (no spaces), listed in the order in
# which they are assigned to the combined table's columns.
results_cols = [
    'Mouse_ID',
    'Drug_Regimen',
    'Sex',
    'Age_months',
    'Weight',
    'Timepoint',
    'Tumor_Vol_mm3',
    'Metastatic_Sites',
]

In [None]:
# Replace the column labels of the combined table with results_cols.
# The assignment is positional: the n-th label is applied to the n-th column.
# NOTE(review): assumes the merged frame has exactly these eight columns in
# this order -- confirm against the CSV headers.
results_df.columns = results_cols
# Preview the table with its new labels
results_df.head()

In [None]:
# Checking the number of mice.
# The combined table repeats a mouse ID on every measurement row, so count
# the distinct IDs; .count() would report the number of rows instead.
TL_Mice = results_df['Mouse_ID'].nunique()
print(TL_Mice)

In [None]:
# Count the rows whose (Mouse ID, Timepoint) pair repeats an earlier row.
duplicate_key_cols = ['Mouse_ID', 'Timepoint']
is_repeat = results_df.duplicated(subset=duplicate_key_cols)
is_repeat.sum()

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# First identify which mouse IDs own a repeated (Mouse ID, Timepoint) pair ...
duplicate_mouse_ids = results_df.loc[
    results_df.duplicated(subset=['Mouse_ID', 'Timepoint']), 'Mouse_ID'
].unique()
# ... then show every row recorded for those mice, not only the repeated
# occurrences (duplicated() alone hides the first row of each repeated pair).
results_df.loc[results_df['Mouse_ID'].isin(duplicate_mouse_ids), :]

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse with conflicting rows for the same timepoint has no trustworthy
# record, so remove all of its rows instead of keeping an arbitrary one of
# the conflicting measurements.
conflicting_rows = results_df.duplicated(subset=['Mouse_ID', 'Timepoint'], keep=False)
conflicting_mouse_ids = results_df.loc[conflicting_rows, 'Mouse_ID'].unique()
# .copy() detaches the cleaned table from the original frame
results_df = results_df.loc[~results_df['Mouse_ID'].isin(conflicting_mouse_ids)].copy()

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct mouse IDs; .shape would report the number of measurement
# rows rather than the number of mice.
results_df['Mouse_ID'].nunique()

## Summary Statistics

In [None]:
# Summary statistics (mean, median, variance, standard deviation, SEM) of the
# tumor volume for each drug regimen, built from individual groupby
# reductions and then assembled into one table.

# Tumor volumes grouped by regimen; shared by every statistic below
tumor_by_regimen = results_df.groupby('Drug_Regimen')['Tumor_Vol_mm3']

# One Series per statistic, each rounded to two decimals
result_mean = tumor_by_regimen.mean().round(2)
result_median = tumor_by_regimen.median().round(2)
result_variance = tumor_by_regimen.var().round(2)
result_std = tumor_by_regimen.std().round(2)
result_sem = tumor_by_regimen.sem().round(2)

# Place the Series side by side, one column per statistic
Summary_Stat = pd.concat(
    [result_mean, result_median, result_variance, result_std, result_sem],
    axis='columns',
    sort=True,
)
Summary_Stat.columns = ['Mean', 'Median', 'Variance', 'Standard Deviation', 'SEM']
Summary_Stat

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line
Regimen_Tumor = results_df.groupby(['Drug_Regimen'])

# Aggregate only the tumor-volume column. Running 'mean'/'var'/... over every
# column also hits the text columns (mouse ID, sex), which recent pandas
# versions reject with a TypeError instead of silently skipping.
Agg_Stat = Regimen_Tumor[['Tumor_Vol_mm3']].agg(['mean', 'median', 'var', 'std', 'sem']).round(2)

# Take an independent copy of the tumor-volume sub-table before relabelling,
# so the rename does not act on a slice of Agg_Stat.
Aggr_Stat = Agg_Stat['Tumor_Vol_mm3'].copy()
Aggr_Stat.columns = ['Mean', 'Median', 'Variance', 'Standard Deviation', 'SEM']
Aggr_Stat


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
group_regimen = results_df.groupby('Drug_Regimen')
# Non-null entries per column for each regimen; only the tumor-volume
# column is plotted below
sum_regimen = group_regimen.count()
# rot is a tick-label angle in degrees and must be numeric: current
# matplotlib raises ValueError for a numeric string such as '45'.
regimen_bar = sum_regimen.plot(kind='bar', y='Tumor_Vol_mm3',
                               title='Pymaceuticals Summary Drug Regimen', label='Tumor Vol', rot=45)
regimen_bar.set_ylabel('Total Test Count')

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
# Number of non-null tumor-volume entries per regimen
measurement = results_df.groupby(['Drug_Regimen'], as_index=True).agg('count')
measurement = measurement['Tumor_Vol_mm3']
x_axis = measurement.index
plt.figure(figsize=(10,4))
plt.bar(x_axis, measurement, alpha=0.5, align='center', color='b')
# The regimen names act as both the categorical tick positions and labels
tick_locations = list(x_axis)
# rotation is an angle in degrees and must be numeric: current matplotlib
# raises ValueError for a numeric string such as '45'.
plt.xticks(tick_locations, x_axis, rotation=45)
# Leave a small margin on both sides of the bars and above the tallest bar
plt.xlim(-0.75, len(x_axis)-0.25)
plt.ylim(0, max(measurement)+10)
plt.title("Pymaceuticals Summary")
plt.xlabel("Drug Regimen")
plt.ylabel("Tumor Vol Test Count")
plt.show()

In [None]:
# Pie chart of the female/male split, drawn through the pandas plotting API.
gender_group = results_df.groupby('Sex')
# Non-null entries per column for each sex; the 'Mouse_ID' column supplies
# the wedge sizes
sum_gender = gender_group.count()
gender_pie = sum_gender.plot.pie(
    y='Mouse_ID',
    title='Mice Distribution',
    colors=['r', 'b'],
    explode=[0, 0.1],  # pull the second wedge slightly out of the pie
)
gender_pie.set_ylabel('Mouse Count')
# Equal aspect ratio keeps the pie circular
plt.axis("equal")

In [None]:
# Pie chart of the female/male split, drawn directly with pyplot.

# Rows recorded for each sex, and how many there are
gender_Male = results_df[results_df['Sex'] == 'Male']
gender_Female = results_df[results_df['Sex'] == 'Female']
Male_Mouse = gender_Male.shape[0]
Female_Mouse = gender_Female.shape[0]

# Wedge sizes and their matching presentation settings, in the same order
size = [Male_Mouse, Female_Mouse]
labels = ["Male", "Female"]
colors = ['b', 'pink']
explode = (0,0.1)  # pull the second wedge slightly out of the pie

plt.pie(
    size,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct="%1.1f%%",
    shadow=True,
    startangle=140,
)
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Keep only the measurements taken under the four regimens of interest
Regimen_Filter = results_df[results_df['Drug_Regimen'].isin(['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin'])]

# Greatest (i.e. last) timepoint recorded for each mouse. Taking the maximum
# per regimen instead would collapse every mouse of a regimen into one row.
Regimen_TimePoint = Regimen_Filter.groupby('Mouse_ID')['Timepoint'].max().reset_index()

# Join back on (mouse, last timepoint) to pick up the measurement taken at
# that timepoint; the largest volume is not necessarily the final one.
Regimen_Summary = pd.merge(Regimen_TimePoint, Regimen_Filter, how='left', on=['Mouse_ID', 'Timepoint'])

# Final tumor volume per mouse, rounded for display
Regimen_Vol = Regimen_Summary[['Mouse_ID', 'Drug_Regimen', 'Tumor_Vol_mm3']].round(2)

Regimen_Summary

In [None]:
# Put treatments into a list for the loop (and later for plot labels)
Regimens = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# One Series of final tumor volumes per regimen, in the order of Regimens
# (for plotting)
tumor_vol = []

# Final measurement of every mouse: find each mouse's greatest timepoint,
# then join back to pick up the row recorded at that timepoint.
last_timepoint = results_df.groupby('Mouse_ID')['Timepoint'].max().reset_index()
final_rows = pd.merge(last_timepoint, results_df, how='left', on=['Mouse_ID', 'Timepoint'])

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for regimen_name in Regimens:
    # Locate the rows which contain mice on this drug and get the tumor volumes
    final_volumes = final_rows.loc[final_rows['Drug_Regimen'] == regimen_name, 'Tumor_Vol_mm3']

    # add subset
    tumor_vol.append(final_volumes)

    # Quartiles are kept unrounded so the IQR and bounds are exact; rounding
    # is applied only when printing.
    quartiles = final_volumes.quantile([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq-lowerq

    # Determine outliers using upper and lower bounds (1.5 * IQR rule)
    lower_bound = lowerq - 1.5*iqr
    upper_bound = upperq + 1.5*iqr
    outliers = final_volumes[(final_volumes < lower_bound) | (final_volumes > upper_bound)]

    print(f"{regimen_name}:")
    print(f"The lower quartile of final tumor volume is: {round(lowerq,2)}")
    print(f"The upper quartile of final tumor volume is: {round(upperq,2)}")
    print(f"The interquartile range of final tumor volume is: {round(iqr,2)}")
    print(f"Values below {round(lower_bound,2)} or above {round(upper_bound,2)} could be outliers.")
    print(f"Potential outliers: {list(outliers.round(2))}")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

# Final measurement of every mouse: find each mouse's greatest timepoint,
# then join back to pick up the row recorded at that timepoint. Plotting
# every row would mix all timepoints into each box.
last_timepoint = results_df.groupby('Mouse_ID')['Timepoint'].max().reset_index()
final_rows = pd.merge(last_timepoint, results_df, how='left', on=['Mouse_ID', 'Timepoint'])

capomulin = final_rows.loc[final_rows.Drug_Regimen=="Capomulin"]['Tumor_Vol_mm3']
ramicane = final_rows.loc[final_rows.Drug_Regimen=="Ramicane"]['Tumor_Vol_mm3']
infubinol = final_rows.loc[final_rows.Drug_Regimen=="Infubinol"]['Tumor_Vol_mm3']
ceftamin = final_rows.loc[final_rows.Drug_Regimen=="Ceftamin"]['Tumor_Vol_mm3']

labels = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
plt.boxplot([capomulin, ramicane, infubinol, ceftamin])
# Boxes are drawn at positions 1..n by default; label them through xticks,
# since boxplot's labels= keyword is deprecated in newer matplotlib.
plt.xticks(range(1, len(labels) + 1), labels)
plt.ylabel('Final Tumor Volume (mm3)')
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
regimen = results_df.loc[results_df['Drug_Regimen']=='Capomulin']

# Follow a single mouse (the first Capomulin mouse in the table); plotting
# every Capomulin row would interleave the measurements of many mice.
mouse_id = regimen['Mouse_ID'].iloc[0]
single_mouse = regimen.loc[regimen['Mouse_ID']==mouse_id].sort_values('Timepoint')

# Time goes on the horizontal axis and tumor volume on the vertical axis
x_axis = single_mouse['Timepoint']
y_axis = single_mouse['Tumor_Vol_mm3']
plt.figure(figsize=(10,4))
plt.plot(x_axis, y_axis, marker='o', label='Tumor Vol')
plt.title(f"Capomulin - Tumor volume vs time point (mouse {mouse_id})")
plt.xlabel("Time Point")
plt.ylabel("Tumor Volume (mm3)")
plt.legend(loc="best")
plt.show()


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
regimen = results_df.loc[results_df['Drug_Regimen']=='Capomulin']

# Collapse the repeated measurements to one point per mouse: its mean tumor
# volume and its mean recorded weight.
# NOTE(review): presumably a mouse's weight is the same on every row, making
# the mean simply that weight -- confirm against the data.
mouse_averages = regimen.groupby('Mouse_ID')[['Weight', 'Tumor_Vol_mm3']].mean()

# Weight is the explanatory variable, so it goes on the x axis
scatter_axes = mouse_averages.plot(kind='scatter', x='Weight', y='Tumor_Vol_mm3',
                                   title="Average Tumor Vol vs Mouse Weight")
scatter_axes.set_xlabel('Mouse Weight')
scatter_axes.set_ylabel('Average Tumor Volume')
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
regimen = results_df.loc[results_df['Drug_Regimen']=='Capomulin']

# One observation per mouse: mean tumor volume and mean recorded weight.
# Columns are selected by name rather than by position so the analysis
# does not depend on column order.
mouse_averages = regimen.groupby('Mouse_ID')[['Weight', 'Tumor_Vol_mm3']].mean()
weight = mouse_averages['Weight']
tumor = mouse_averages['Tumor_Vol_mm3']

# pearsonr returns (correlation coefficient, p-value)
correlation = st.pearsonr(weight,tumor)
print(f"The correlation between mouse weight and average tumor volume for the Capomulin regimen is {round(correlation[0],2)}")

# Least-squares fit of average tumor volume on weight
x_values = weight
y_values = tumor
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
# Anchor the equation relative to the axes (upper-left corner) so it stays
# visible whatever the data range is.
plt.annotate(line_eq,(0.05,0.9),xycoords='axes fraction',fontsize=15,color="red")
plt.xlabel('Mouse Weight')
plt.ylabel('Average Tumor volume')
plt.show()