## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import seaborn as sn

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# Both tables are joined on their shared "Mouse ID" column; the key only needs
# to be named once (repeating it makes pandas treat it as two join keys).
# pd.merge defaults to an inner join.
merged_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Display the data table for preview
merged_df

In [None]:
# Count the distinct mouse IDs present in the merged data.
mice_num = merged_df.loc[:, "Mouse ID"].nunique()

mice_num

In [None]:
# Find the mice that have more than one row for the same
# (Mouse ID, Timepoint) pair.

# duplicated() marks every repeat of a pair after its first occurrence
is_repeat = merged_df.duplicated(subset=['Timepoint', 'Mouse ID'])

# Distinct IDs of the mice owning those repeated rows
dup_mice_ids = merged_df.loc[is_repeat, 'Mouse ID'].unique()

dup_mice_ids

In [None]:
# Optional: Get all the data for the duplicate mouse ID.

# Select by the IDs held in dup_mice_ids instead of a hard-coded ID, so the
# cell stays correct if the input data (and therefore the duplicates) changes.
duplicate_mice_data = merged_df.loc[merged_df["Mouse ID"].isin(dup_mice_ids)]

duplicate_mice_data

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.

# Exclude every mouse listed in dup_mice_ids (all of its rows, not only the
# repeated ones) rather than relying on a hard-coded ID.
clean_df = merged_df[~merged_df["Mouse ID"].isin(dup_mice_ids)]

clean_df.head()

In [None]:
# Recount the distinct mouse IDs, this time in the cleaned DataFrame.
mice_num = clean_df.loc[:, "Mouse ID"].nunique()

mice_num

## Summary Statistics

In [None]:
# Summary statistics table: mean, median, variance, standard deviation and SEM
# of the tumor volume for each drug regimen. Each statistic is computed as its
# own groupby reduction and the resulting Series are assembled into a single
# summary DataFrame.

# Group the cleaned data by drug regimen
grouped_regimen_df = clean_df.groupby(["Drug Regimen"])

# Reduce the tumor-volume column of every group using the groupby object's own
# statistical methods
tumor_col = "Tumor Volume (mm3)"
mean_group = grouped_regimen_df[tumor_col].mean()
median_group = grouped_regimen_df[tumor_col].median()
variance_group = grouped_regimen_df[tumor_col].var()
standard_dev_group = grouped_regimen_df[tumor_col].std()
sem_group = grouped_regimen_df[tumor_col].sem()

# One column per statistic; rows are the drug regimens
summary_df = pd.DataFrame(
    {
        "Tumor Volume (mm3)Mean": mean_group,
        "Tumor Volume (mm3)Median": median_group,
        "Tumor Volume (mm3)VAR": variance_group,
        "Tumor Volume (mm3)STD": standard_dev_group,
        "Tumor Volume (mm3)SEM": sem_group,
    }
)

# Display the summary table
summary_df

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line

# First, group the cleaned data by drug regimen
grouped_regimen_df = clean_df.groupby(["Drug Regimen"])

# Select the tumor-volume column BEFORE aggregating. Aggregating the whole
# frame first would also try to take mean/median/var/... of the string columns
# (e.g. Mouse ID, Sex), which raises TypeError on pandas >= 2.0 and is wasted
# work on older versions. The resulting table has one column per statistic.
summary_two_df = grouped_regimen_df["Tumor Volume (mm3)"].agg(['mean', 'median', 'var', 'std', 'sem'])
summary_two_df

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.

# Number of rows recorded for each drug regimen, as a one-column DataFrame.
# (drug_df was previously referenced without ever being defined.)
drug_df = clean_df["Drug Regimen"].value_counts().to_frame(name="Data Points")

# Set x axis (drugs) and y axis (data_points)
drugs = drug_df.index
data_points = drug_df['Data Points']

In [None]:
# Bar plot (pandas plotting API) of the number of rows per drug regimen
regimen_counts = clean_df["Drug Regimen"].value_counts()
drug_bar = regimen_counts.plot(kind="bar", width=0.5)

# Axis labels, then the title
drug_bar.set(xlabel="Drug Regimen", ylabel="Number of Data Points")
drug_bar.set_title("Number of Data Points for Each Treatment Regimen")

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.

# Number of rows recorded for each drug regimen
regimen_counts = clean_df["Drug Regimen"].value_counts()

plt.bar(regimen_counts.index, regimen_counts.values, width=0.5)

# Regimen names are long; rotate them so they do not overlap
plt.xticks(rotation=90)
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")
plt.title("Number of Data Points for Each Treatment Regimen")
plt.show()



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas

# Count each mouse once, regardless of how many rows it has
total_mice = clean_df["Mouse ID"].nunique()
gender_group = clean_df.groupby(["Sex"])
gender_count = gender_group["Mouse ID"].nunique()

# Share of mice per sex = (mice of that sex) / (all mice) * 100.
# The ratio was previously inverted (total / count), which swaps the relative
# sizes of the wedges once the pie normalizes the values.
percent_by_gender = gender_count / total_mice * 100

gender_bar_df = pd.DataFrame({"% BY GENDER": percent_by_gender, "TOTAL": gender_count})

gender_pie_chart = gender_bar_df.plot.pie(y="% BY GENDER", figsize=(5, 5), startangle=140, shadow=True, autopct="%1.1f%%")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Unique mice per sex, computed here directly from clean_df. plt.pie
# normalizes the counts, so autopct prints each sex's share of the mice.
sex_counts = clean_df.groupby("Sex")["Mouse ID"].nunique()

# Label the wedges with the sex they represent; without labels the chart
# cannot be read.
plt.pie(sex_counts, labels=sex_counts.index, startangle=140, shadow=True, autopct="%1.1f%%")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Prepare the data needed to find each mouse's final tumor volume for the
# four regimens of interest: Capomulin, Ramicane, Infubinol, and Ceftamin.

# Last (greatest) timepoint of every mouse, ordered by that timepoint
max_time_df = (
    clean_df.groupby('Mouse ID')['Timepoint']
    .max()
    .sort_values()
    .rename('max_timepoint')
    .reset_index()
)

# Attach each mouse's last timepoint to all of its rows in the cleaned data,
# so rows at the final timepoint can be picked out later by comparing the
# 'Timepoint' and 'max_timepoint' columns
merge_max_time_df = max_time_df.merge(clean_df, on='Mouse ID')

merge_max_time_df

In [None]:
# Treatments of interest (iterated below)
final_drugs = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# One Series of final tumor volumes per treatment, in final_drugs order
# (filled by the loop, used for plotting)
drug_values = []

# Rows holding a mouse's final measurement: Timepoint equals its max_timepoint
at_final_timepoint = merge_max_time_df['Timepoint'] == merge_max_time_df['max_timepoint']

# For each treatment: collect the final volumes, report the IQR and the
# 1.5*IQR bounds, and count the values on or beyond those bounds
for drug in final_drugs:
    on_this_drug = merge_max_time_df['Drug Regimen'] == drug

    # Final tumor volume of every mouse on this treatment
    values = merge_max_time_df.loc[on_this_drug & at_final_timepoint, 'Tumor Volume (mm3)']
    drug_values.append(values)

    # Interquartile range
    lowerq = values.quantile(0.25)
    upperq = values.quantile(0.75)
    iqr = upperq - lowerq
    print(f'IQR for {drug}: {iqr}')

    # Bounds 1.5*IQR below the lower quartile and above the upper quartile
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    print(f'Lower Bound for {drug}: {lower_bound}')
    print(f'Upper Bound for {drug}: {upper_bound}')

    # Potential outliers: values at or beyond either bound
    is_outlier = (values >= upper_bound) | (values <= lower_bound)
    outliers_count = is_outlier.sum()
    print(f'Number of {drug} outliers: {outliers_count}')
    

In [None]:
# Box plot comparing the final tumor volumes across the four regimens of
# interest (one box per Series in drug_values)
box_labels = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
plt.boxplot(drug_values)

plt.ylabel('Final Tumor Volume (mm3)')
plt.title('Final Tumor Volume by Drug')

# Name the boxes at x positions 1-4
plt.xticks([1, 2, 3, 4], box_labels)

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume vs. time point for one mouse treated with Capomulin

# All rows belonging to mice on the Capomulin regimen
is_capomulin = merge_max_time_df["Drug Regimen"] == "Capomulin"
cap_mouse = merge_max_time_df[is_capomulin].reset_index()

# Narrow down to the single mouse being plotted
cap_mouse_1 = cap_mouse[cap_mouse["Mouse ID"] == "l509"]

plt.plot(cap_mouse_1["Timepoint"], cap_mouse_1["Tumor Volume (mm3)"], marker='x')

plt.xlabel("TIMEPOINT")
plt.ylabel("TUMOR VOLUME")
plt.title("MOUSE L509")

In [None]:
# Scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# Mean tumor volume per mouse, sorted ascending, as columns
# 'Mouse ID' / 'avg_tumor_vol'
cap_mouse_tumor_vol = (
    cap_mouse.groupby('Mouse ID')['Tumor Volume (mm3)']
    .mean()
    .sort_values()
    .rename('avg_tumor_vol')
    .reset_index()
)

# Mean weight per mouse, indexed by 'Mouse ID'
cap_mouse_weight = cap_mouse.groupby('Mouse ID')['Weight (g)'].mean().to_frame()

# One row per mouse carrying both the average tumor volume and the weight
cap_mouse_vol_weight = cap_mouse_tumor_vol.merge(cap_mouse_weight, on='Mouse ID')

cap_mouse_vol_weight.plot.scatter(
    x="Weight (g)",
    y="avg_tumor_vol",
    grid=True,
    figsize=(8, 8),
    title="AVG TUMOR VOLUME BY WEIGHT",
)
plt.show()


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen

# Per-mouse weight (x) and average tumor volume (y), taken from the
# cap_mouse_vol_weight table built for the scatter plot
weight = cap_mouse_vol_weight["Weight (g)"]
tumor_volume = cap_mouse_vol_weight["avg_tumor_vol"]

# Pearson correlation coefficient (first element of the pearsonr result)
correlation = st.pearsonr(weight, tumor_volume)[0]
print(f"Correlation between mouse weight and average tumor volume: {correlation:.2f}")

# Least-squares line: avg tumor volume = slope * weight + intercept
slope, intercept, rvalue, pvalue, stderr = st.linregress(weight, tumor_volume)
vc_fit = slope * weight + intercept
line_eq = f"y = {slope:.2f}x + {intercept:.2f}"

# Weight goes on the x axis so the plot agrees with the axis labels below
plt.scatter(weight, tumor_volume)
plt.plot(weight, vc_fit, "--")

# Anchor the equation text at the smallest x / largest y of the data so it
# stays inside the axes whatever the value ranges are
plt.annotate(line_eq, (weight.min(), tumor_volume.max()), fontsize=15, color="red")

plt.xlabel("WEIGHT (g)")
plt.ylabel("AVERAGE TUMOR VOLUME (mm3)")
plt.title('AVERAGE TUMOR VOLUME BY WEIGHT')
plt.show()