## Observations and Insights 

In [None]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

# Locations of the two CSV inputs for the study
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the per-mouse metadata onto the per-observation results.
# The default inner join on "Mouse ID" keeps only mice present in both files.
all_mouse_data_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Leave the merged table as the cell's last expression so the notebook previews it
all_mouse_data_df


In [None]:
# Check the number of mice.
# Each mouse has one row per timepoint, so len(all_mouse_data_df) would count
# observations rather than animals; count the distinct Mouse IDs instead.
num_mice = all_mouse_data_df["Mouse ID"].nunique()
print(f"The number of mice: {num_mice}")

In [None]:
# Find rows whose (Mouse ID, Timepoint) pair has already appeared earlier in the table.
# duplicated() flags every repeat after the first occurrence of a pair.
is_repeated_pair = all_mouse_data_df.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicate_mice = all_mouse_data_df.loc[is_repeated_pair]
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 


In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse with conflicting measurements at the same timepoint cannot be trusted,
# so remove ALL of its rows rather than just the repeated ones.
duplicate_ids = all_mouse_data_df.loc[
    all_mouse_data_df.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()

# .copy() gives an independent frame so later cells can modify it without
# pandas chained-assignment warnings.
clean_mouse_data = all_mouse_data_df[
    ~all_mouse_data_df['Mouse ID'].isin(duplicate_ids)
].copy()
clean_mouse_data

In [None]:
# Check the number of mice in the clean DataFrame.
# Count distinct Mouse IDs; len() would count rows (one per mouse per timepoint).
num_mice_clean = clean_mouse_data["Mouse ID"].nunique()
print(f"The number of mice: {num_mice_clean}")

## Summary Statistics

In [None]:
# Method 1:  Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straightforward, creating multiple series and putting them all together at the end.

# Group the tumor volumes once and reuse the grouping for each statistic
tumor_by_regimen = clean_mouse_data.groupby('Drug Regimen')['Tumor Volume (mm3)']

# Each call returns a Series indexed by drug regimen; the shared index lets
# pandas align them as columns of one table.
summary_stats = pd.DataFrame({
    'Mean Tumor Volume': tumor_by_regimen.mean(),
    'Median Tumor Volume': tumor_by_regimen.median(),
    'Tumor Volume Variance': tumor_by_regimen.var(),
    'Tumor Volume Std. Dev.': tumor_by_regimen.std(),
    'Tumor Volume SEM': tumor_by_regimen.sem(),
})
summary_stats



In [None]:
# Method 2(optional): Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method produces everything in a single groupby function. (https://pandas.pydata.org/pandas-docs/version/0.22.0/generated/pandas.core.groupby.DataFrameGroupBy.agg.html)
# describe() reports count/mean/std/quartiles but not variance or SEM, so
# request exactly the five statistics with agg().
clean_mouse_data.groupby('Drug Regimen')['Tumor Volume (mm3)'].agg(
    ['mean', 'median', 'var', 'std', 'sem']
)


## Bar and Pie Charts

In [None]:
# Pandas bar chart: number of distinct mice assigned to each drug regimen
# over the whole study.

## Note: this plot will be identical to the one that uses Pyplot

(
    clean_mouse_data
    .groupby('Drug Regimen')['Mouse ID']
    .nunique()      # distinct mice per regimen
    .plot.bar()     # draw on the current pyplot figure
)

plt.ylabel('Number of Mice')
plt.title('Number of Mice per Treatment Regimen')
plt.show()


In [None]:
# Use Pyplot to generate a bar plot showing the total number of mice in each treatment regimen throughout the course of the study.

##  Note: this plot will be identical to the one that uses Pandas

# Take labels and heights from the SAME grouped Series. Using
# clean_mouse_data['Drug Regimen'].unique() for the labels returns regimens in
# order of first appearance, while groupby sorts them, so bars were mislabeled.
mice_per_regimen = clean_mouse_data.groupby('Drug Regimen')['Mouse ID'].nunique()
x_axis = mice_per_regimen.index

plt.figure(figsize=(7,7))
plt.bar(x_axis, mice_per_regimen.values)
plt.title('Number of Mice per Treatment Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Mice')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pandas pie chart: share of distinct female vs. male mice

## Note: this plot will be identical to the one that uses Pyplot

(
    clean_mouse_data
    .groupby('Sex')['Mouse ID']
    .nunique()      # distinct mice per sex
    .plot.pie()
)

plt.title('Male vs. Female Mice')
plt.show()

In [None]:
# Use Pyplot to generate a pie plot showing the distribution of female versus male mice

##  Note: this plot will be identical to the one that uses Pandas

# Take labels and wedge sizes from the same grouped Series so they cannot get
# out of order (unique() and groupby() do not share an ordering).
mice_per_sex = clean_mouse_data.groupby('Sex')['Mouse ID'].nunique()
x_axis = mice_per_sex.index

plt.figure(figsize=(7,7))
# The task asks for a pie chart; the previous plt.bar call drew a bar chart.
plt.pie(mice_per_sex.values, labels=x_axis)
plt.title('Male vs. Female Mice')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the drug regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
regimens_of_interest = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Start by getting the last (latest) timepoint for each mouse.
# Filtering on Timepoint == 45 would silently drop every mouse that left the
# study early, so take each mouse's own maximum timepoint instead.
final_timepoints = clean_mouse_data.groupby('Mouse ID', as_index=False)['Timepoint'].max()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint.
# The inner merge on (Mouse ID, Timepoint) keeps only each mouse's final observation.
latest_timepoint = clean_mouse_data.merge(
    final_timepoints, on=['Mouse ID', 'Timepoint'], how='inner'
)

# Filter dataframe to only show the 4 drug regimens listed above
filtered_drug_regimen = latest_timepoint[
    latest_timepoint['Drug Regimen'].isin(regimens_of_interest)
]

filtered_drug_regimen

In [None]:
# Put the four drug regimens into a list that can be iterated over in a for-loop
# (and later used for plot labels)
regimen_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# One row per mouse with its regimen and tumor volume summed over that mouse's
# rows in filtered_drug_regimen. Built with groupby because DataFrame.append
# was removed in pandas 2.0 and the previous loop iterated over an undefined
# name (each_mouse).
regimen_tumor_df = (
    filtered_drug_regimen
    .groupby(["Mouse ID", "Drug Regimen"], as_index=False)["Tumor Volume (mm3)"]
    .sum()
    .rename(columns={"Tumor Volume (mm3)": "Total Tumor Volume"})
)

# Create empty list to fill with tumor vol data (for plotting)
tumor_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
# Quartiles are computed per regimen: pooling all four drugs into one
# distribution would hide outliers that are only unusual within their own drug.
for regimen in regimen_list:
    # Locate the rows which contain mice on each drug and get the tumor volumes
    volumes = regimen_tumor_df.loc[
        regimen_tumor_df["Drug Regimen"] == regimen, "Total Tumor Volume"
    ]

    # add subset
    tumor_data.append(volumes)

    if volumes.empty:
        print(f"{regimen}: no tumor volume data available.")
        continue

    quartiles = volumes.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Determine outliers using upper and lower bounds
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    outliers = volumes[(volumes < lower_bound) | (volumes > upper_bound)]

    print(f"--- {regimen} ---")
    print(f"The lower quartile of tumor volume is: {lowerq}")
    print(f"The upper quartile of tumor volume is: {upperq}")
    print(f"The interquartile range of tumor volume is: {iqr}")
    print(f"The median of tumor volume is: {quartiles[0.5]}")
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")
    print(f"Potential outliers: {outliers.tolist()}")

# Tumor volumes across all four regimens, largest first; kept because the
# box-plot cell reads `sorted_tumor`.
sorted_by_volume = regimen_tumor_df.sort_values(by='Total Tumor Volume', ascending=False)
sorted_tumor = sorted_by_volume['Total Tumor Volume']

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
regimens_of_interest = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# One list of tumor volumes per regimen, so each regimen gets its own box
# instead of all four being pooled into one. Plain lists avoid any dependence
# on the DataFrame's index labels inside matplotlib.
final_volumes = [
    filtered_drug_regimen.loc[
        filtered_drug_regimen['Drug Regimen'] == regimen, 'Tumor Volume (mm3)'
    ].tolist()
    for regimen in regimens_of_interest
]

fig1, ax1 = plt.subplots()
ax1.set_title('Final Tumor Volume')
ax1.set_xlabel('Drug Regimen')
ax1.set_ylabel('Tumor Volume (mm3)')
ax1.boxplot(final_volumes)
# Label the boxes after plotting; this avoids relying on a boxplot labels
# keyword whose name may vary between matplotlib versions.
ax1.set_xticklabels(regimens_of_interest)
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. timepoint for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen. 
# Note: this means mouse weight goes on the x-axis, with average tumor volume on the y-axis. 


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen.
