## Observations and Insights

* **Observation #1**
* **Observation #2**
* **Observation #3**

In [None]:
# Import Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
from scipy.stats import linregress

## Read CSVs and Merge DataFrames

In [None]:
# Relative locations of the two input CSV files used by this notebook
mouse_data_path, study_results_path = (
    'data/Mouse_metadata.csv',
    'data/Study_results.csv',
)

In [None]:
# Load each CSV file into its own pandas DataFrame
mouse_data = pd.read_csv(filepath_or_buffer=mouse_data_path)
study_results = pd.read_csv(filepath_or_buffer=study_results_path)

In [None]:
# Join the mouse metadata with the study results on the shared 'Mouse ID'
# column (pandas' default inner join) and render the combined DataFrame
mouse_results_df = mouse_data.merge(study_results, on='Mouse ID')
mouse_results_df

## Clean the Data- Remove Duplicates

In [None]:
# Count the distinct Mouse IDs in the merged dataset
# (dropna=False keeps the count equal to len(unique()), which includes NaN)
mice_count = mouse_results_df['Mouse ID'].nunique(dropna=False)
mice_count

In [None]:
# Flag every row that repeats an earlier ('Mouse ID', 'Timepoint') pair and
# collect the distinct Mouse IDs those repeated rows belong to
repeated_rows = mouse_results_df.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicates = mouse_results_df.loc[repeated_rows, 'Mouse ID'].unique()
# Only the last expression of a notebook cell is rendered, so print the IDs explicitly
print(f'Mouse IDs with duplicated timepoints: {duplicates}')

# Display every row recorded for the duplicated Mouse IDs found above (rather than
# a hard-coded ID) so the repeated timepoints can be inspected before cleaning
display_duplicates = mouse_results_df.loc[mouse_results_df['Mouse ID'].isin(duplicates), :]
display_duplicates

In [None]:
# Keep the first row for each ('Mouse ID', 'Timepoint') pair and drop any later
# repeats; the Mouse IDs themselves stay in the cleaned DataFrame
duplicate_key_columns = ['Mouse ID', 'Timepoint']
reduced_mice_df = mouse_results_df.drop_duplicates(
    subset=duplicate_key_columns,
    keep='first',
)
reduced_mice_df

In [None]:
# Count the distinct Mouse IDs in the cleaned DataFrame to compare against mice_count
# (dropna=False keeps the count equal to len(unique()), which includes NaN)
reduced_mice_count = reduced_mice_df['Mouse ID'].nunique(dropna=False)
reduced_mice_count

## Summary Statistics

### Summary Statistics Table using Groupby and DataSeries

In [None]:
# Group the cleaned rows by 'Drug Regimen'; the per-regimen statistics and
# charts below are all derived from this GroupBy object
drug_regimen_grouped = reduced_mice_df.groupby(by='Drug Regimen')

In [None]:
# Mean of 'Tumor Volume (mm3)' within each drug regimen
tumor_volume_by_regimen = drug_regimen_grouped['Tumor Volume (mm3)']
drug_regimen_mean = tumor_volume_by_regimen.mean()
drug_regimen_mean

In [None]:
# Median of 'Tumor Volume (mm3)' within each drug regimen
tumor_volume_by_regimen = drug_regimen_grouped['Tumor Volume (mm3)']
drug_regimen_median = tumor_volume_by_regimen.median()
drug_regimen_median

In [None]:
# Variance of 'Tumor Volume (mm3)' within each drug regimen
tumor_volume_by_regimen = drug_regimen_grouped['Tumor Volume (mm3)']
drug_regimen_var = tumor_volume_by_regimen.var()
drug_regimen_var

In [None]:
# Standard deviation of 'Tumor Volume (mm3)' within each drug regimen
tumor_volume_by_regimen = drug_regimen_grouped['Tumor Volume (mm3)']
drug_regimen_stdev = tumor_volume_by_regimen.std()
drug_regimen_stdev

In [None]:
# Standard error of the mean of 'Tumor Volume (mm3)' within each drug regimen,
# computed with one delta degree of freedom (ddof=1)
tumor_volume_by_regimen = drug_regimen_grouped['Tumor Volume (mm3)']
drug_regimen_sem = tumor_volume_by_regimen.sem(ddof=1)
drug_regimen_sem

In [None]:
# Assemble the five per-regimen statistics Series into one summary table,
# one column per statistic, rounded to two decimal places
summary_df_one = pd.DataFrame({
    'Mean': drug_regimen_mean,
    'Median': drug_regimen_median,
    'Variance': drug_regimen_var,
    'St. Deviation': drug_regimen_stdev,
    'St. Error': drug_regimen_sem,
}).round(2)
summary_df_one

### Summary Statistics Table using Aggregation Method

In [None]:
# Build the same tumor-volume summary table in a single aggregation call:
# one column per statistic, one row per drug regimen
summary_statistics = ['mean', 'median', 'var', 'std', 'sem']
summary_df_two = drug_regimen_grouped['Tumor Volume (mm3)'].agg(summary_statistics)
summary_df_two

In [None]:
# Round every statistic to two decimal places, then give the columns the same
# display names as the first summary table
display_names = {
    'mean': 'Mean',
    'median': 'Median',
    'var': 'Variance',
    'std': 'St. Deviation',
    'sem': 'St. Error',
}
summary_df_two = summary_df_two.round(2).rename(columns=display_names)
summary_df_two

## Bar and Pie Charts

### Timepoints per Drug Regimen Bar Chart (Pandas)

In [None]:
# Count the 'Timepoint' entries recorded under each drug regimen; this Series
# (indexed by regimen name) feeds both bar charts below
timepoints_by_regimen = drug_regimen_grouped['Timepoint']
drug_regimen = timepoints_by_regimen.count()
drug_regimen

In [None]:
# Draw the per-regimen timepoint counts as a bar chart through the pandas
# plotting accessor; the call returns the Axes used for the tweaks below
bar_chartpd = drug_regimen.plot.bar(
    color='navy',
    alpha=1,
    figsize=(7.5, 6),
    rot=45,
    width=0.6,
)

# Title (with its own font size), then axis labels and axis limits in one call;
# the limits leave a margin beside the outer bars and above the tallest bar
bar_chartpd.set_title('Total Timepoints per Drug Regimen', fontsize=15)
bar_chartpd.set(
    xlabel='Drug Regimen',
    ylabel='Number of Timepoints',
    xlim=(-0.75, len(drug_regimen) - 0.25),
    ylim=(0, max(drug_regimen) + 20),
)

# Display the chart
plt.show()

### Timepoints per Drug Regimen Bar Chart (Matplotlib)

In [None]:
# One integer bar position per drug regimen on the x-axis; bar heights are the
# timepoint counts themselves
regimen_total = len(drug_regimen)
x_axis = np.arange(regimen_total)
y_axis = drug_regimen

# Regimen names (wrapped in a one-element list) and the tick position of each bar
# NOTE(review): xticks is not referenced again in the visible cells - confirm before removing
xticks = [drug_regimen.index.values]
tick_locations = list(x_axis)

In [None]:
# Recreate the timepoints-per-regimen bar chart with the pyplot interface,
# using the x/y data and tick locations prepared in the previous cell
plt.figure(figsize=(7.5, 6))
bar_chartplt = plt.bar(
    x_axis,
    y_axis,
    width=0.6,
    align='center',
    color='navy',
    alpha=1,
)

# Label each bar position with its regimen name, rotated for readability
plt.xticks(tick_locations, drug_regimen.index, rotation=45)

# Axis limits: a margin beside the outer bars and headroom above the tallest bar
plt.xlim(-0.75, len(drug_regimen) - 0.25)
plt.ylim(0, max(drug_regimen) + 20)

# Chart title and axis labels
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Timepoints')
plt.title('Total Timepoints per Drug Regimen', fontsize=15)

plt.show()

### Female vs. Male Pie Chart (Pandas)

In [None]:
# Count the 'Mouse ID' entries (one per cleaned row) under each 'Sex' category
rows_by_sex = reduced_mice_df.groupby(by='Sex')
mice_gender = rows_by_sex['Mouse ID'].count()
mice_gender

In [None]:
# Plot the gender counts as a pie chart through pandas. A Series plots its own
# values, so no `y` argument is passed (`y` selects a column and only applies
# to DataFrames).
colors = ['deeppink', 'royalblue']
explode = [0.05, 0]  # pull the first wedge slightly out of the pie
pie_chartpd = mice_gender.plot(kind='pie', colors=colors, explode=explode,
                               labels=['', ''], autopct="%0.2f%%", shadow=True,
                               startangle=90, fontsize=12)

# Wedge labels are blanked above because the legend names the categories;
# remove the default y-label and set a title for the chart
pie_chartpd.set_ylabel('')
pie_chartpd.set_title('Gender Proportion of Mice Tested')

# Add a legend keyed by the Series index, keep the pie circular, and display the chart
pie_chartpd.legend(labels=mice_gender.index, loc='best')
plt.axis('equal')
plt.show()

### Female vs. Male Pie Chart (Matplotlib)

In [None]:
# Recreate the gender-distribution pie chart with the pyplot interface
colors = ['deeppink', 'royalblue']
explode = [0.05, 0]
pie_chartplt = plt.pie(
    mice_gender,
    explode=explode,
    colors=colors,
    labels=['', ''],  # wedge labels left blank; the legend names the categories
    autopct="%0.2f%%",
    shadow=True,
    startangle=90,
    textprops={'fontsize': 12},
)

# Legend keyed by the Series index, equal axes so the pie is circular, then the title
plt.legend(labels=mice_gender.index, loc='best')
plt.axis('equal')
plt.title('Gender Proportion of Mice Tested')
plt.show()

## Quartiles, Outliers, and Boxplots
#### Calculate the final tumor volume of each mouse across four of the treatment regimens: Capomulin, Ramicane, Infubinol, and Ceftamin

In [None]:
# Greatest 'Timepoint' recorded for each mouse (a Series indexed by 'Mouse ID')
timepoints_by_mouse = reduced_mice_df.groupby(by='Mouse ID')['Timepoint']
timepoint_df = timepoints_by_mouse.max()
timepoint_df

# Inner-join back onto the cleaned data on both keys so only each mouse's row
# at its final timepoint remains
timepoint_merged_df = reduced_mice_df.merge(timepoint_df, on=['Mouse ID', 'Timepoint'])
timepoint_merged_df

In [None]:
# The four regimens compared below, and an (initially empty) list that will
# receive one Series of final tumor volumes per regimen, in the same order
treatments = [
    'Capomulin',
    'Ramicane',
    'Infubinol',
    'Ceftamin',
]
treatment_volume = list()

In [None]:
# For every regimen of interest, pull its rows from the final-timepoint
# DataFrame and store the tumor volumes as one Series in treatment_volume
for treatment in treatments:
    is_treatment = timepoint_merged_df['Drug Regimen'] == treatment
    treatment_row = timepoint_merged_df[is_treatment]
    tumor_volume = treatment_row.loc[:, 'Tumor Volume (mm3)']
    # Position in treatment_volume matches the regimen's position in treatments
    treatment_volume.append(tumor_volume)

### Capomulin Outliers

In [None]:
# Quartiles of the Capomulin tumor volumes (first entry of treatment_volume)
cap_quartiles = treatment_volume[0].quantile([0.25, 0.5, 0.75])
cap_lowerq = cap_quartiles.loc[0.25]
cap_median = cap_quartiles.loc[0.5]
cap_upperq = cap_quartiles.loc[0.75]
cap_iqr = cap_upperq - cap_lowerq

print(f'The lower quartile for {treatments[0]} is: {(round(cap_lowerq,4))}.')
print(f'The upper quartile for {treatments[0]} is: {(round(cap_upperq,4))}.')
print(f'The inter-quartile range for {treatments[0]} is: {(round(cap_iqr,4))}.')
print(f'The median for {treatments[0]} is: {(round(cap_median,4))}.')

# Outlier bounds sit 1.5 * IQR below the lower quartile and above the upper quartile
cap_margin = 1.5 * cap_iqr
cap_lower_bound = cap_lowerq - cap_margin
cap_upper_bound = cap_upperq + cap_margin

print(f'For {treatments[0]}: any tumor volume value below {(round(cap_lower_bound,4))} or above {(round(cap_upper_bound,4))} is an outlier')

# Select the Capomulin rows whose tumor volume falls outside either bound
cap_is_regimen = timepoint_merged_df['Drug Regimen'] == treatments[0]
cap_volumes = timepoint_merged_df['Tumor Volume (mm3)']
cap_out_of_bounds = (cap_volumes < cap_lower_bound) | (cap_volumes > cap_upper_bound)
cap_outliers = timepoint_merged_df.loc[cap_is_regimen & cap_out_of_bounds]
cap_outliers

### Ramicane Outliers

In [None]:
# Quartiles of the Ramicane tumor volumes (second entry of treatment_volume)
ram_quartiles = treatment_volume[1].quantile([0.25, 0.5, 0.75])
ram_lowerq = ram_quartiles.loc[0.25]
ram_median = ram_quartiles.loc[0.5]
ram_upperq = ram_quartiles.loc[0.75]
ram_iqr = ram_upperq - ram_lowerq

print(f'The lower quartile for {treatments[1]} is: {(round(ram_lowerq,4))}.')
print(f'The upper quartile for {treatments[1]} is: {(round(ram_upperq,4))}.')
print(f'The inter-quartile range for {treatments[1]} is: {(round(ram_iqr,4))}.')
print(f'The median for {treatments[1]} is: {(round(ram_median,4))}.')

# Outlier bounds sit 1.5 * IQR below the lower quartile and above the upper quartile
ram_margin = 1.5 * ram_iqr
ram_lower_bound = ram_lowerq - ram_margin
ram_upper_bound = ram_upperq + ram_margin

print(f'For {treatments[1]}: any tumor volume value below {(round(ram_lower_bound,4))} or above {(round(ram_upper_bound,4))} is an outlier')

# Select the Ramicane rows whose tumor volume falls outside either bound
ram_is_regimen = timepoint_merged_df['Drug Regimen'] == treatments[1]
ram_volumes = timepoint_merged_df['Tumor Volume (mm3)']
ram_out_of_bounds = (ram_volumes < ram_lower_bound) | (ram_volumes > ram_upper_bound)
ram_outliers = timepoint_merged_df.loc[ram_is_regimen & ram_out_of_bounds]
ram_outliers

### Infubinol Outliers

In [None]:
# Calculate the quartiles and IQR for Infubinol (third entry of treatment_volume)
inf_quartiles = treatment_volume[2].quantile([0.25,0.5,0.75])
inf_lowerq = inf_quartiles[0.25]
inf_upperq = inf_quartiles[0.75]
inf_iqr = inf_upperq - inf_lowerq
inf_median = inf_quartiles [0.5]

print(f'The lower quartile for {treatments[2]} is: {(round(inf_lowerq,4))}.')
print(f'The upper quartile for {treatments[2]} is: {(round(inf_upperq,4))}.')
print(f'The inter-quartile range for {treatments[2]} is: {(round(inf_iqr,4))}.')
print(f'The median for {treatments[2]} is: {(round(inf_median,4))}.')

# Calculate the upper and lower bounds for Infubinol: 1.5 * IQR beyond each quartile.
# Only Infubinol values are used here, so this cell does not rely on the
# Ramicane cell having been run first.
inf_lower_bound = inf_lowerq - (1.5*inf_iqr)
inf_upper_bound = inf_upperq + (1.5*inf_iqr)

print(f'For {treatments[2]}: any tumor volume value below {(round(inf_lower_bound,4))} or above {(round(inf_upper_bound,4))} is an outlier')

# Locate any Infubinol outliers in the DataFrame if any exist using df.loc
inf_outliers = timepoint_merged_df.loc[((timepoint_merged_df['Drug Regimen'] == treatments[2]) & (timepoint_merged_df['Tumor Volume (mm3)'] < inf_lower_bound)) | 
                                       ((timepoint_merged_df['Drug Regimen'] == treatments[2]) & (timepoint_merged_df['Tumor Volume (mm3)'] > inf_upper_bound))]
inf_outliers

### Ceftamin Outliers

In [None]:
# Quartiles of the Ceftamin tumor volumes (fourth entry of treatment_volume)
ceft_quartiles = treatment_volume[3].quantile([0.25, 0.5, 0.75])
ceft_lowerq = ceft_quartiles.loc[0.25]
ceft_median = ceft_quartiles.loc[0.5]
ceft_upperq = ceft_quartiles.loc[0.75]
ceft_iqr = ceft_upperq - ceft_lowerq

print(f'The lower quartile for {treatments[3]} is: {(round(ceft_lowerq,4))}.')
print(f'The upper quartile for {treatments[3]} is: {(round(ceft_upperq,4))}.')
print(f'The inter-quartile range for {treatments[3]} is: {(round(ceft_iqr,4))}.')
print(f'The median for {treatments[3]} is: {(round(ceft_median,4))}.')

# Outlier bounds sit 1.5 * IQR below the lower quartile and above the upper quartile
ceft_margin = 1.5 * ceft_iqr
ceft_lower_bound = ceft_lowerq - ceft_margin
ceft_upper_bound = ceft_upperq + ceft_margin

print(f'For {treatments[3]}: any tumor volume value below {(round(ceft_lower_bound,4))} or above {(round(ceft_upper_bound,4))} is an outlier')

# Select the Ceftamin rows whose tumor volume falls outside either bound
ceft_is_regimen = timepoint_merged_df['Drug Regimen'] == treatments[3]
ceft_volumes = timepoint_merged_df['Tumor Volume (mm3)']
ceft_out_of_bounds = (ceft_volumes < ceft_lower_bound) | (ceft_volumes > ceft_upper_bound)
ceft_outliers = timepoint_merged_df.loc[ceft_is_regimen & ceft_out_of_bounds]
ceft_outliers

### Boxplots

In [None]:
# One figure holding a boxplot of tumor volumes for each of the four regimens
fig, ax = plt.subplots()

# Chart title and axis labels
ax.set(
    title='Final Tumor Volume per Drug Regimen',
    xlabel='Drug Regimen',
    ylabel='Final Tumor Volume (mm3)',
)

# Marker styling applied to outlier points so they stand out
outliers = {
    'markerfacecolor': 'lime',
    'marker': '*',
    'markersize': 10,
}

# Draw one box per entry of treatment_volume, labelled with the matching regimen name
ax.boxplot(treatment_volume, labels=treatments, flierprops=outliers)
plt.show()

## Line Plot and Scatter Plot

In [None]:
# Keep only the cleaned rows recorded under the Capomulin regimen
is_capomulin = reduced_mice_df['Drug Regimen'] == 'Capomulin'
capomulin_mice = reduced_mice_df.loc[is_capomulin]
capomulin_mice
# Render the distinct Mouse IDs treated with Capomulin (last expression of the cell)
capomulin_mice['Mouse ID'].unique()

In [None]:
# Every Capomulin row recorded for mouse s185; this is the data behind the line plot
is_mouse_s185 = capomulin_mice['Mouse ID'] == 's185'
capomulin_mice_data = capomulin_mice.loc[is_mouse_s185]
capomulin_mice_data

In [None]:
# Line plot of tumor volume against timepoint for mouse s185
x_axis_lp = capomulin_mice_data['Timepoint']
y_axis_lp = capomulin_mice_data['Tumor Volume (mm3)']

# Dashed violet line with a circular marker at every recorded timepoint
plt.plot(
    x_axis_lp,
    y_axis_lp,
    linestyle='--',
    marker='o',
    color='darkviolet',
    label='Tumor Volume',
)
plt.title('Mouse #s185 Tumor Volume vs. Timepoint', fontsize=12)
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Create a scatter plot of tumor volume versus mouse weight for the Capomulin treatment regimen

# Average the numeric columns per mouse. numeric_only=True is required because
# capomulin_mice still carries string columns (e.g. 'Drug Regimen'), and
# pandas >= 2.0 raises a TypeError instead of silently dropping them.
capomulin_mice_grouped = capomulin_mice.groupby('Mouse ID').mean(numeric_only=True)
average_weight = capomulin_mice_grouped['Weight (g)']
average_tumor_vol = capomulin_mice_grouped['Tumor Volume (mm3)']

x_axis_sp = average_weight
y_axis_sp = average_tumor_vol
# Marker size (s) is driven by the weight values, so heavier mice get larger markers
plt.scatter(x_axis_sp, y_axis_sp, marker = 'o', facecolor = 'blue', edgecolor = 'blue', s = x_axis_sp)
plt.xlabel('Avg. Weight (g)')
plt.ylabel('Avg. Tumor Volume (mm3)')
plt.title('Mice Weight vs. Tumor Volume', fontsize = 14)
plt.show()

## Linear Regression

In [None]:
# Pearson correlation between per-mouse average weight and average tumor volume
# for Capomulin; the first element of the result is the coefficient
correlation = st.pearsonr(average_weight, average_tumor_vol)
correlation_coefficient = correlation[0]
print(f'The correlation between average mouse weight and average tumor volume for the Capomulin regimen is {(round(correlation_coefficient,2))}.')

## Calculate the linear regression model for mouse weight and average tumor volume and plot the model on top of the previous scatter plot

In [None]:
# Fit a least-squares line of average tumor volume (y) on average mouse weight (x)
x_axis_lreg = average_weight
y_axis_lreg = average_tumor_vol

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_axis_lreg, y_axis_lreg)
# Predicted tumor volume at each observed weight, used to draw the fitted line
regress_values = x_axis_lreg * slope + intercept
line_eq = (f'y = {(round(slope,2))}x  + {(round(intercept,2))}')

# Redraw the weight-vs-volume scatter and overlay the fitted line in green
plt.scatter(x_axis_sp, y_axis_sp, marker = 'o', facecolor = 'blue', edgecolor = 'blue', s = x_axis_sp)
plt.plot(x_axis_lreg, regress_values, 'g')
# 'fontsize' and 'size' are aliases of the same text property; passing both with
# different values is ambiguous and rejected by newer matplotlib, so pass one only.
# The annotation is anchored at data coordinates (18, 37).
plt.annotate(line_eq, (18,37), color = 'green', fontsize = 14)

plt.xlabel('Avg. Weight (g)')
plt.ylabel('Avg. Tumor Volume (mm3)')
plt.title('Mice Weight vs. Tumor Volume', fontsize = 14)
plt.show()

# Report the coefficient of determination and the fitted equation
print(f'The r-square value is: {round(rvalue**2,2)}.')
print(line_eq)

In [None]:
# HELLO WORLD