In [38]:
# Select matplotlib's interactive "notebook" backend for figures drawn in this notebook.
%matplotlib notebook

In [39]:
# Import dependencies
#Numpy for calculations and matplotlib for charting
import numpy as np
import matplotlib.pyplot as plt

#pandas for data frames
import pandas as pd

In [40]:
# Relative paths of the two input CSV files
mouse_data_file = "Resources/Mouse_metadata.csv"
study_results_file = "Resources/Study_results.csv"

# Parse each CSV into its own DataFrame (mouse metadata first, study results second)
mouse_data_df, study_results_df = (
    pd.read_csv(csv_path) for csv_path in (mouse_data_file, study_results_file)
)

In [41]:
# Join the mouse metadata to the study results on the shared "Mouse ID" column.
# An outer join keeps IDs that appear in only one of the two tables.
combined_mouse_df = mouse_data_df.merge(study_results_df, how="outer", on="Mouse ID")

# Preview the first rows of the merged table
combined_mouse_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [42]:
# Check number of mice.
# The merged frame holds one row per (mouse, timepoint) measurement, so its length is the
# number of measurements, not the number of mice. Count distinct Mouse IDs instead.
unique_mice = combined_mouse_df["Mouse ID"].value_counts()  # rows recorded per Mouse ID
mice_total = len(unique_mice)                               # number of distinct mice

# Flag repeated (Mouse ID, Timepoint) pairs; the first occurrence of each pair is NOT flagged
duplicate_mice = combined_mouse_df.duplicated(subset=["Mouse ID", "Timepoint"])

# Drop the flagged rows, keeping the first occurrence of every (Mouse ID, Timepoint) pair.
# Same result as drop_duplicates(subset=[...]) with its default keep="first".
clean_mouse_df = combined_mouse_df.loc[~duplicate_mice]
clean_mouse_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [43]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation and standard error of the mean (SEM).
# Each statistic is computed as its own Series via groupby, then all five are
# assembled into one summary DataFrame indexed by drug regimen.

# Group the cleaned measurements by drug regimen
grouped_drug = clean_mouse_df.groupby(["Drug Regimen"])

# The tumor-volume column of every group, shared by all five statistics
tumor_volume_by_drug = grouped_drug["Tumor Volume (mm3)"]

tumor_mean = tumor_volume_by_drug.mean()      # mean
tumor_median = tumor_volume_by_drug.median()  # median
tumor_variance = tumor_volume_by_drug.var()   # variance
tumor_sd = tumor_volume_by_drug.std()         # standard deviation
tumor_sem = tumor_volume_by_drug.sem()        # standard error of the mean

# Column label -> statistic; the dict order fixes the column order of the table
summary_columns = {
    "Mean": tumor_mean,
    "Median": tumor_median,
    "Variance": tumor_variance,
    "Standard Deviation": tumor_sd,
    "Standard Error": tumor_sem,
}
volume_summary_df = pd.DataFrame(summary_columns)
volume_summary_df

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,Standard Error
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [44]:
# ## Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.

# NOTE(review): every statement in this cell is commented out, so running it draws nothing
# and defines no variables (neither `total_measurements` nor `drug_total_df`).
# If it is re-enabled, plt.tight_layout() should probably be called before plt.show() - confirm.

# #calculate total number of measurements for each drug regimen and create data frame
# total_measurements = grouped_drug["Drug Regimen"].count()
# drug_total_df = pd.DataFrame({"Total Measurements": total_measurements})

# #set up bar chart
# drug_total_df.plot(kind="bar", figsize=(8,10))

# # Set a title for the chart
# plt.title("Total Measurements Per Drug Regimen")
# plt.ylabel("Total Measurements")

# plt.show()
# plt.tight_layout()

In [58]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.

# Count the measurement rows per drug regimen in this cell. Without this, `total_measurements`
# is undefined on a fresh kernel (Restart & Run All) and the cell fails with a NameError.
# groupby sorts the regimen names, so the bars come out in alphabetical order.
total_measurements = clean_mouse_df.groupby("Drug Regimen").size()

# One bar position per regimen
x_axis = np.arange(len(total_measurements))
tick_locations = list(x_axis)

# Draw on a dedicated figure/axes instead of whatever pyplot's current figure happens to be
fig, ax = plt.subplots()

# Plot bar chart
ax.bar(x_axis, total_measurements, color='r', alpha=0.5, align="center")

# Label each bar from the Series index so the labels cannot drift out of sync with the data
ax.set_xticks(tick_locations)
ax.set_xticklabels(total_measurements.index, rotation="vertical")

# Set a title and axis labels
ax.set_title("Total Measurements Per Drug Regimen")
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Total Measurements")

# Adjust padding so the vertical tick labels are not clipped
fig.tight_layout()

plt.show()

<IPython.core.display.Javascript object>

In [46]:
# TODO: Generate a pie plot showing the distribution of female versus male mice using pandas (not implemented yet)



In [47]:
# TODO: Generate a pie plot showing the distribution of female versus male mice using pyplot (not implemented yet)

