In [111]:
#import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
#increase display width to view all columns at once
pd.options.display.width = 1200

#set paths for data
mouse_metadata_path = "/Users/bitty/matplotlib-challenge/matplotlib-challenge/Resources/Mouse_metadata.csv"
study_results_path = "/Users/bitty/matplotlib-challenge/matplotlib-challenge/Resources/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

#look at data
print(mouse_metadata)
print(study_results)



    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)
0       k403     Ramicane    Male          21          16
1       s185    Capomulin  Female           3          17
2       x401    Capomulin  Female          16          15
3       m601    Capomulin    Male          22          17
4       g791     Ramicane    Male          11          16
..       ...          ...     ...         ...         ...
244     z314     Stelasyn  Female          21          28
245     z435     Propriva  Female          12          26
246     z581    Infubinol  Female          24          25
247     z795     Naftisol  Female          13          29
248     z969     Naftisol    Male           9          30

[249 rows x 5 columns]
     Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
0        b128          0           45.000000                 0
1        f932          0           45.000000                 0
2        g107          0           45.000000                 0
3        a457          0    

In [112]:
# Combine & Display the data table for preview
full_data_set = pd.merge(mouse_metadata, study_results, on='Mouse ID', how='outer')
full_data_set.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [113]:
# Checking the number of mice.
full_data_set.count()

Mouse ID              1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64

In [114]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicates = full_data_set[full_data_set.duplicated(keep=False)]
duplicates

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0


In [115]:
cleaned_data_set = full_data_set.drop_duplicates(subset=['Mouse ID', 'Timepoint'])
cleaned_data_set.head(10)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [116]:
#get a clean duplicate count
cleaned_data_set.count()

Mouse ID              1888
Drug Regimen          1888
Sex                   1888
Age_months            1888
Weight (g)            1888
Timepoint             1888
Tumor Volume (mm3)    1888
Metastatic Sites      1888
dtype: int64

In [117]:
#cleaned_data_set.value_counts(['Mouse ID'])

## Summary Statistics

In [118]:
print(cleaned_data_set["Drug Regimen"].unique())

['Ramicane' 'Capomulin' 'Infubinol' 'Placebo' 'Ceftamin' 'Stelasyn'
 'Zoniferol' 'Ketapril' 'Propriva' 'Naftisol']


In [119]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
grouped_by_drug_regimen = cleaned_data_set.groupby("Drug Regimen")
print(grouped_by_drug_regimen.head())


    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  Tumor Volume (mm3)  Metastatic Sites
0       k403     Ramicane    Male          21          16          0           45.000000                 0
1       k403     Ramicane    Male          21          16          5           38.825898                 0
2       k403     Ramicane    Male          21          16         10           35.014271                 1
3       k403     Ramicane    Male          21          16         15           34.223992                 1
4       k403     Ramicane    Male          21          16         20           32.997729                 1
10      s185    Capomulin  Female           3          17          0           45.000000                 0
11      s185    Capomulin  Female           3          17          5           43.878496                 0
12      s185    Capomulin  Female           3          17         10           37.614948                 0
13      s185    Capomulin  Female    

In [120]:
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
av_tumor_size_by_reg = pd.DataFrame(cleaned_data_set.groupby(["Drug Regimen"]).mean().round(2))
av_tumor_size_by_reg = av_tumor_size_by_reg.drop(columns=["Age_months", "Weight (g)", "Timepoint", "Metastatic Sites"])
print(av_tumor_size_by_reg)


              Tumor Volume (mm3)
Drug Regimen                    
Capomulin                  40.68
Ceftamin                   52.59
Infubinol                  52.88
Ketapril                   55.24
Naftisol                   54.33
Placebo                    54.03
Propriva                   52.39
Ramicane                   40.22
Stelasyn                   54.23
Zoniferol                  53.24


In [155]:
#median
median_tumor_vol = pd.DataFrame(cleaned_data_set.groupby(["Drug Regimen"]).median().round(2))
median_tumor_vol = median_tumor_vol.drop(columns=["Age_months", "Weight (g)", "Timepoint", "Metastatic Sites"])
print(median_tumor_vol)


              Tumor Volume (mm3)
Drug Regimen                    
Capomulin                  41.56
Ceftamin                   51.78
Infubinol                  51.82
Ketapril                   53.70
Naftisol                   52.51
Placebo                    52.29
Propriva                   50.91
Ramicane                   40.67
Stelasyn                   52.43
Zoniferol                  51.82


In [145]:
#standard dev
#std_tumor_vol = np.std(cleaned_data_set.groupby(["Drug Regimen"]))
#std_tumor_vol = np.std(std_tumor_vol, ddof=0)
#std_tumor_vol

In [123]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [124]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [125]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [126]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [127]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [128]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [129]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [130]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [131]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [132]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [133]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
