In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

In [None]:
# Relative locations of the two CSV inputs used by the study
mouse_metadata_path, study_results_path = (
    "data/Mouse_metadata.csv",
    "data/Study_results.csv",
)

In [None]:
# Load the mouse metadata and the study results CSVs into DataFrames
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

In [None]:
# Join the mouse metadata with the study results on the shared "Mouse ID" key
# (pandas' default inner join); the suffixes only apply to any other column
# name the two tables have in common.
lab_rats = mouse_metadata.merge(
    study_results,
    on="Mouse ID",
    suffixes=("Metadata", "Study"),
)

# Show the combined table as the cell output
lab_rats

In [None]:
# Number of distinct mice present in the combined data
lab_rats['Mouse ID'].nunique()

In [None]:
# Find the mice that have more than one record for the same
# (Mouse ID, Timepoint) pair. duplicated(keep=False) flags every row that
# takes part in a repeat, so the IDs selected by the mask are the mice with
# duplicated measurements.
duplicate_mask = lab_rats.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
duplicate_mouse_ids = lab_rats.loc[duplicate_mask, 'Mouse ID'].unique()

# NOTE: despite its name, `dupes` holds the rows that are NOT duplicated:
# drop_duplicates(keep=False) removes every row involved in a repeat.
# The name and contents are left unchanged because later cells read from it.
dupes = lab_rats.drop_duplicates(subset=['Mouse ID','Timepoint'], keep = False)

# Show the IDs of the mice that actually have duplicated records
duplicate_mouse_ids

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID:
# any mouse with a repeated (Mouse ID, Timepoint) record is removed entirely,
# not just its repeated rows.
repeated_rows = lab_rats.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
repeated_mouse_ids = lab_rats.loc[repeated_rows, 'Mouse ID'].unique()
clean_rats = lab_rats.loc[~lab_rats['Mouse ID'].isin(repeated_mouse_ids)].reset_index(drop=True)

# Distinct mouse IDs that remain after the cleanup
mouse_IDs = clean_rats['Mouse ID'].unique()

In [None]:
# Count of the unique mouse IDs collected in the previous cell
mouse_IDs.shape[0]

In [None]:
# Summary statistics of tumor volume for each drug regimen:
# mean, median, variance, standard deviation and SEM, each computed with the
# corresponding groupby method and then assembled into one summary DataFrame.

# Tumor volume measurements grouped by regimen (computed from lab_rats)
tumor_data = lab_rats.groupby('Drug Regimen')['Tumor Volume (mm3)']

# One Series per statistic, indexed by drug regimen
tumor_mean, tumor_median = tumor_data.mean(), tumor_data.median()
tumor_variance, tumor_sd = tumor_data.var(), tumor_data.std()
tumor_SEM = tumor_data.sem()

# Column label -> statistic, in display order
summary_columns = {
    "Mean": tumor_mean,
    "Median": tumor_median,
    "Variance": tumor_variance,
    "Standard Deviation": tumor_sd,
    "SEM": tumor_SEM,
}
tumor_stats = pd.DataFrame(summary_columns)
tumor_stats

In [None]:
# Same summary statistics as above, produced in one call via groupby aggregation
summary_functions = ['mean', 'median', 'var', 'std', 'sem']
aggregation = tumor_data.agg(summary_functions)
aggregation

In [None]:
# Generate a bar plot showing the total number of measurements taken on 
#each drug regimen using pandas.


In [None]:
# Generate a bar plot showing the total number of measurements taken 
#on each drug regimen using pyplot.

In [None]:
# Pie plot (pandas) of the female versus male distribution of mice.
# Keep a single row per mouse (its last row in `dupes`), then count rows per sex;
# the 'Mouse ID' column of the counts supplies the wedge sizes.
one_row_per_mouse = dupes.drop_duplicates(subset=['Mouse ID'], keep = "last")
f_v_m = one_row_per_mouse.groupby('Sex').count()
f_v_m.plot.pie(y ='Mouse ID', autopct='%1.1f%%')
plt.show()


In [None]:
# Pie plot (pyplot) of the female versus male distribution of mice,
# drawn from the per-sex counts computed for the pandas pie plot.
sex_labels = ("Female", "Male")
wedge_colors = ('Pink', 'Blue')
plt.pie(
    f_v_m['Mouse ID'],
    labels = sex_labels,
    colors = wedge_colors,
    autopct='%1.1f%%',
)
plt.show()

In [None]:
# Final tumor volume of each mouse.
# The regimens of interest are Capomulin, Ramicane, Infubinol, and Ceftamin;
# no regimen filter is applied in this cell, so every mouse is kept.

# Order the records by Timepoint and retain only the last row seen for each
# Mouse ID, i.e. the row at its greatest timepoint. Each retained row already
# carries the tumor volume, so no merge back onto the original frame is done.
last_timepoint = (
    lab_rats
    .sort_values(by=['Timepoint'])
    .drop_duplicates(subset='Mouse ID', keep="last")
)

last_timepoint

In [None]:
# The four treatment regimens of interest, used by the per-drug loops
# (and later for plot labels). Listed explicitly so the analysis is limited
# to these regimens rather than every regimen present in the data.
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Empty list to fill with tumor volume data (for plotting).
# NOTE: this rebinds `tumor_data`, which the summary-statistics cells use as
# the per-regimen groupby; re-run that earlier cell before re-running the
# aggregation cell once this cell has executed.
tumor_data = []

In [None]:
# Locate the rows which contain mice on each drug and get the tumor volumes.
# Rows are selected with a boolean mask on 'Drug Regimen': last_timepoint is
# not indexed by regimen, so a label lookup with the drug name would fail.
# The list is rebuilt from scratch so re-running this cell does not append
# a second copy of every Series.
tumor_data = [
    last_timepoint.loc[last_timepoint['Drug Regimen'] == treatment, 'Tumor Volume (mm3)']
    for treatment in treatments
]
tumor_data

In [None]:
# Calculate the IQR and quantitatively determine if there are any potential outliers. 
# add subset     
box_data = last_timepoint.groupby('Drug Regimen')
box_plot = box_data['Tumor Volume (mm3)'].describe()
box_iqr = box_plot['75%'] - box_plot['25%']
box_plot['IQR'] = box_iqr
box_plot['Upper Bound'] = box_plot['25%'] - (1.5*box_plot['IQR'])
box_plot['Lower Bound'] = box_plot['75%'] + (1.5*box_plot['IQR'])
box_plot

In [None]:
print(last_timepoint.dtypes)

In [None]:
# Determine outliers using upper and lower bounds
outliers=[]

for treatment in treatments: 
    for mouse in last_timepoint:
        drug = last_timepoint['Drug Regimen'].astype(str)
        upperbound = box_plot.loc[treatment,'Upper Bound']
        lowerbound = box_plot.loc[treatment,'Lower Bound']  
        print(drug)
        print(treatment)
    break
        #if drug == treatment:
            #last_timepoint['Upper Bound'] = upperbound
            #last_timepoint['Lower Bound'] = lowerbound
last_timepoint
      
        
   
    
  


In [None]:
test = last_timepoint[last_timepoint['Tumor Volume (mm3)'].between(box_plot.loc['Ramicane','Upper Bound'], box_plot.loc['Ramicane','Lower Bound']) == False]
test


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
