## Observations and Insights 

In [1]:
%matplotlib notebook

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combined_df = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="left")

# Display the data table for preview
combined_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [3]:
# check data
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1893 entries, 0 to 1892
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Mouse ID            1893 non-null   object 
 1   Drug Regimen        1893 non-null   object 
 2   Sex                 1893 non-null   object 
 3   Age_months          1893 non-null   int64  
 4   Weight (g)          1893 non-null   int64  
 5   Timepoint           1893 non-null   int64  
 6   Tumor Volume (mm3)  1893 non-null   float64
 7   Metastatic Sites    1893 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 133.1+ KB


In [4]:
# Checking the number of mice.
count_mice = combined_df['Mouse ID'].nunique()
count_mice

249

In [5]:
# check number of data records (includes header)
count_records = combined_df['Mouse ID'].count()
count_records

1893

In [8]:
# checking for duplicate data entries by Mouse ID and Timepoint.
# Optional: Get all the data for the duplicate mouse ID.
duplicate_entry = combined_df[combined_df.duplicated(subset=['Mouse ID', 'Timepoint'])]
print("Duplicate Entries are:", duplicate_entry, sep='\n')

Duplicate Entries are:
    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
909     g989     Propriva  Female          21          26          0   
911     g989     Propriva  Female          21          26          5   
913     g989     Propriva  Female          21          26         10   
915     g989     Propriva  Female          21          26         15   
917     g989     Propriva  Female          21          26         20   

     Tumor Volume (mm3)  Metastatic Sites  
909           45.000000                 0  
911           47.570392                 0  
913           49.880528                 0  
915           53.442020                 0  
917           54.657650                 1  


In [10]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = combined_df.drop_duplicates(subset=['Mouse ID', 'Timepoint'])
clean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [12]:
# reset index and remove old index column
clean_df = clean_df.reset_index(drop = True)
clean_df.tail(3)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
1885,z969,Naftisol,Male,9,30,35,69.176246,4
1886,z969,Naftisol,Male,9,30,40,70.314904,4
1887,z969,Naftisol,Male,9,30,45,73.867845,4


In [13]:
# Checking the number of mice in the clean DataFrame.
count_mice = clean_df['Mouse ID'].nunique()
count_mice

249

In [11]:
# check number of data records (includes header)
count_records = clean_df['Mouse ID'].count()
count_records

1888

In [12]:
# get number and names of drugs in study
drugs_unique_num = clean_df['Drug Regimen'].nunique()
drugs_unique = clean_df['Drug Regimen'].unique()
print(drugs_unique_num)
print(drugs_unique)

10
['Ramicane' 'Capomulin' 'Infubinol' 'Placebo' 'Ceftamin' 'Stelasyn'
 'Zoniferol' 'Ketapril' 'Propriva' 'Naftisol']


## Summary Statistics

In [13]:
# Generate a summary statistics table of mean, median, variance, standard deviation, 
# and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them
# all together at the end.

In [50]:
# create all the required Series

# create a Series of the mean tumor volume by Drug Regimen 
series_mean_vol = pd.Series(clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].mean())
#series_mean_vol

# create a Series of the median tumor volume by Drug Regimen 
series_med_vol = pd.Series(clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].median())
#series_med_vol

# create a Series of the tumor volume variance by Drug Regimen 
series_var_vol = pd.Series(clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].var())
#series_var_vol

# create a Series of the tumor volume standard deviation by Drug Regimen 
series_std_dev_vol = pd.Series(clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].std())
#series_std_dev_vol

# create a Series of the tumor volume SEM by Drug Regimen 
series_sem_vol = pd.Series(clean_df.groupby(['Drug Regimen'])['Tumor Volume (mm3)'].sem())
#series_sem_vol

Drug Regimen
Capomulin    0.329346
Ceftamin     0.469821
Infubinol    0.492236
Ketapril     0.603860
Naftisol     0.596466
Placebo      0.581331
Propriva     0.525862
Ramicane     0.320955
Stelasyn     0.573111
Zoniferol    0.516398
Name: Tumor Volume (mm3), dtype: float64

In [None]:
df_first_name = pd.DataFrame(series_first_name, columns = ['First Name'])
df_last_name = pd.DataFrame(series_last_name, columns = ['Last Name'])
df_age = pd.DataFrame(series_age, columns = ['Age'])

df_all = pd.concat([df_first_name, df_last_name, df_age], axis=1)
print(df_all)
print(type(df_all))

In [44]:
# convert the Series into a DataFrame
# mean_vol_df = pd.DataFrame(mean_vol)
# mean_vol_df = mean_vol_df.transpose()
# mean_vol_df

mean_vol_df = pd.DataFrame(series_mean_vol)
mean_vol_df.head()

Unnamed: 0_level_0,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,40.675741
Ceftamin,52.591172
Infubinol,52.884795
Ketapril,55.235638
Naftisol,54.331565


In [21]:
# place all of the data into a summary DataFrame and display
tumor_vol_summary_df = pd.DataFrame(
    {'Mean Volume': mean_vol,
     'Median Volume': med_vol,
     'Volume Variance': var_vol,
     'Volume Standard Deviation': std_dev_vol,
     'SEM': sem_vol
    })
tumor_vol_summary_df

Unnamed: 0,Mean Volume,Median Volume,Volume Variance,Volume Standard Deviation,SEM
0,Drug Regimen Capomulin 40.675741 Ceftamin ...,Drug Regimen Capomulin 41.557809 Ceftamin ...,Drug Regimen Capomulin 24.947764 Ceftamin ...,Drug Regimen Capomulin 4.994774 Ceftamin ...,Drug Regimen Capomulin 0.329346 Ceftamin ...


In [19]:
# place all of the data into a summary DataFrame and display
#tumor_vol_summary1_df = pd.merge(pd.merge(pd.merge(pd.merge(mean_vol_df,med_vol_df,on='Drug Regimen'),var_vol_df,on='Drug Regimen'),std_dev_vol_df,on='Drug Regimen'),sem_vol_df,on='Drug Regimen')

#tumor_vol_summary1_df

# rename columns to "Mean Volume", "Median Volume", "Volume Variance", "Volume Standard Deviation", "SEM"

Unnamed: 0_level_0,Tumor Volume (mm3)_x,Tumor Volume (mm3)_y,Tumor Volume (mm3)_x,Tumor Volume (mm3)_y,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [20]:
##############################################################################################################################

In [21]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function

In [22]:
# OPTION 1
# create summary statistics table using groupby
#tumor_vol_summary2_df = clean_df.groupby('Drug Regimen').agg(\
    #vol_mean = ('Tumor Volume (mm3)', mean),\
    #vol_median = ('Tumor Volume (mm3)', median),\
    #vol_var = ('Tumor Volume (mm3)', var),\
    #vol_stdev = ('Tumor Volume (mm3)', std),\
    #vol_sem = ('Tumor Volume (mm3)', sem)\
#)

# display the summary statistics table
#tumor_vol_summary2_df

# ----------------------------------------------------------------------------------

# OPTION 2
# group the data frame by Drug Regimen and extract a number of stats from each group
clean_df.groupby(
    ['Drug Regimen']
).agg(
    {
        # find the mean, median, var, std, sem of the Tumor Volume (mm3) column
        'Tumor Volume (mm3)': [mean, median, var, std, sem]
    }
)

# -------------------------------------------------------------------------------------

# Group the data frame by month and item and extract a number of stats from each group
#data.groupby(
    #['month', 'item']
#).agg(
    #{
        # Find the min, max, and sum of the duration column
        #'duration': [min, max, sum],
        # find the number of network type entries
        #'network_type': "count",
        # minimum, first, and number of unique dates
        #'date': [min, 'first', 'nunique']
    #}
#)

# --------------------------------------------------------------

#data[data['item'] == 'call'].groupby('month').agg(
    #max_duration=pd.NamedAgg(column='duration', aggfunc=max),
    #min_duration=pd.NamedAgg(column='duration', aggfunc=min),
    #total_duration=pd.NamedAgg(column='duration', aggfunc=sum),
    #num_days=pd.NamedAgg(
        #column="date", 
        #aggfunc=lambda x: (max(x) - min(x)).days)    
#)

# ---------------------------------------------------------------

#data[data['item'] == 'call'].groupby('month').agg(
    # Get max of the duration column for each group
    #max_duration=('duration', max),
    # Get min of the duration column for each group
    #min_duration=('duration', min),
    # Get sum of the duration column for each group
    #total_duration=('duration', sum),
    # Apply a lambda to date column
    #num_days=("date", lambda x: (max(x) - min(x)).days)    
#)

NameError: name 'mean' is not defined

## Bar and Pie Charts

In [61]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 

In [62]:
# create a dataframe grouping 
drug_names = clean_df.groupby('Drug Regimen')
drug_names

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002111070B470>

In [63]:
# create a dataframe counting the number of data points for each Drug Regimen
samples_per_drug = pd.DataFrame(drug_names['Drug Regimen'].count())
samples_per_drug

Unnamed: 0_level_0,Drug Regimen
Drug Regimen,Unnamed: 1_level_1
Capomulin,230
Ceftamin,178
Infubinol,178
Ketapril,188
Naftisol,186
Placebo,181
Propriva,156
Ramicane,228
Stelasyn,181
Zoniferol,182


In [64]:
# create a bar chart 
summary_bar_chart1 = samples_per_drug.plot(kind='bar', color="g", fontsize=14, width=0.75, figsize=(8, 6))

<IPython.core.display.Javascript object>

In [65]:
# set the labels
summary_bar_chart1.set_xlabel("Drug Regimen", fontsize=18)
summary_bar_chart1.set_ylabel("Number of Samples", fontsize=18)
summary_bar_chart1.set_title("Sample Count for each Drug Regimen", fontsize=20)
plt.tight_layout()

# save and plot the figure
plt.savefig("figures/summary_bar_chart1.png")
plt.show()

In [66]:
#########################################################################################################################

In [67]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.

In [68]:
# extract data required for bar chart
samples_per_drug = clean_df.groupby('Drug Regimen')['Mouse ID'].count()
samples_per_drug

Drug Regimen
Capomulin    230
Ceftamin     178
Infubinol    178
Ketapril     188
Naftisol     186
Placebo      181
Propriva     156
Ramicane     228
Stelasyn     181
Zoniferol    182
Name: Mouse ID, dtype: int64

In [69]:
# create lists for chart

#clean_df.groupby('Drug Regimen')['Mouse ID'].apply(list)

drugs = samples_per_drug.groupby('Drug Regimen').apply(list)
drugs

Drug Regimen
Capomulin    [230]
Ceftamin     [178]
Infubinol    [178]
Ketapril     [188]
Naftisol     [186]
Placebo      [181]
Propriva     [156]
Ramicane     [228]
Stelasyn     [181]
Zoniferol    [182]
Name: Mouse ID, dtype: object

In [None]:
# make a bar chart

x_axis = np.arange(len(samples_per_drug))

plt.bar(x_axis, samples_per_drug, color='green', align='center', width=0.75)
                                                           
# tell matplotlib where to place each of the x axis headers
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, samples_per_drug)

# give the chart a title, x label, and y label
plt.title("Number of Samples Per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Samples")

# save and plot the figure
plt.savefig("figures/summary_bar_chart2.png")
plt.show()

In [70]:
#########################################################################################################

In [81]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [82]:
# create data frame group by sexes
sex = clean_df.groupby('Sex')
sex

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021110710E10>

In [83]:
# create dataframe with data sample count by sex
sex_count = pd.DataFrame(sex['Sex'].count())
sex_count

Unnamed: 0_level_0,Sex
Sex,Unnamed: 1_level_1
Female,930
Male,958


In [84]:
# create pie chart
sex_count.plot(kind='pie', y = 'Sex', labels=['Female', 'Male'], colors=['pink', 'lightblue'],\
               shadow=True, autopct='%.2f%%', fontsize=18, startangle=135, figsize=(6, 6),\
               title="Distribution of Female Versus Male Mice", legend=False)
plt.axis("equal")

# save an image of the chart and display
plt.savefig("figures/sex_pie_chart1.png")
plt.show()

<IPython.core.display.Javascript object>

In [85]:
###############################################################################################################

In [86]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

In [87]:
# create data frame group by sexes
sex = clean_df.groupby('Sex')
sex

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021110897470>

In [88]:
# create dataframe with data sample count by sex
sex_count = pd.DataFrame(sex['Sex'].count())
sex_count

Unnamed: 0_level_0,Sex
Sex,Unnamed: 1_level_1
Female,930
Male,958


In [89]:
# labels for the sections the pie chart
labels = ["Female", "Male"]

# colors of each section of the pie chart
colors = ["pink","lightblue"]

In [90]:
# use matplotlib to create a pie chart
plt.pie(sex_count, labels=labels, colors=colors, autopct="%.2f%%", shadow=True, startangle=135)

# set additional chart parameters
plt.axis("equal")
plt.title("Distribution of Female Versus Male Mice")

# save an image of the chart and display
plt.savefig("figures/sex_pie_chart2.png")
plt.show()

  


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [91]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [92]:
# find all mice treated with Capomulin
capomulin_mice = clean_df.loc[clean_df['Drug Regimen'] == 'Capomulin']
capomulin_mice

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
10,s185,Capomulin,Female,3,17,0,45.000000,0
11,s185,Capomulin,Female,3,17,5,43.878496,0
12,s185,Capomulin,Female,3,17,10,37.614948,0
13,s185,Capomulin,Female,3,17,15,38.177232,0
14,s185,Capomulin,Female,3,17,20,36.866876,0
...,...,...,...,...,...,...,...,...
440,i557,Capomulin,Female,1,24,45,47.685963,1
1447,r157,Capomulin,Male,22,25,0,45.000000,0
1448,r157,Capomulin,Male,22,25,5,45.597064,0
1449,r157,Capomulin,Male,22,25,10,46.059608,0


In [93]:
# select a random capomulin mouse
sample_mouse = capomulin_mice.sample(1)
sample_mouse

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
330,j119,Capomulin,Female,7,23,10,46.425366,0


In [94]:
# create dataframe with sample_mouse data
sample_mouse_data = capomulin_mice.loc[capomulin_mice['Mouse ID'] == sample_mouse.iloc[0,0], :]
sample_mouse_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
328,j119,Capomulin,Female,7,23,0,45.0,0
329,j119,Capomulin,Female,7,23,5,45.838998,0
330,j119,Capomulin,Female,7,23,10,46.425366,0
331,j119,Capomulin,Female,7,23,15,47.322984,0
332,j119,Capomulin,Female,7,23,20,48.158209,1
333,j119,Capomulin,Female,7,23,25,42.417166,1
334,j119,Capomulin,Female,7,23,30,43.013681,1
335,j119,Capomulin,Female,7,23,35,43.765705,1
336,j119,Capomulin,Female,7,23,40,44.58509,1
337,j119,Capomulin,Female,7,23,45,38.125164,1


In [95]:
# x axis
x_values = sample_mouse_data['Timepoint']
x_values

328     0
329     5
330    10
331    15
332    20
333    25
334    30
335    35
336    40
337    45
Name: Timepoint, dtype: int64

In [96]:
# y axis
y_values = sample_mouse_data['Tumor Volume (mm3)']
y_values

328    45.000000
329    45.838998
330    46.425366
331    47.322984
332    48.158209
333    42.417166
334    43.013681
335    43.765705
336    44.585090
337    38.125164
Name: Tumor Volume (mm3), dtype: float64

In [97]:
# plot a line graph
plt.plot(x_values, y_values, marker='s', color='red', linewidth=1)

[<matplotlib.lines.Line2D at 0x2111088eac8>]

In [98]:
# create labels for the X and Y axis
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")

Text(108.40277777777779, 0.5, 'Tumor Volume (mm3)')

In [99]:
# save and display the chart
plt.savefig("figures/line_plot.png")
plt.show()

In [100]:
##################################################################################################

In [101]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [102]:
x_axis = capomulin_mice.groupby('Mouse ID')['Weight (g)']

x_axis

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000021110878DD8>

In [103]:
# determine avg tumor vol each mouse
y_axis = capomulin_mice.groupby('Mouse ID').mean()['Tumor Volume (mm3)']

y_axis.head(3)

Mouse ID
b128    41.963636
b742    40.083699
f966    36.505973
Name: Tumor Volume (mm3), dtype: float64

In [104]:
# create scatter plot
# the size of each point on our plot is determined by their x value
plt.scatter(x_axis, y_axis, marker="o", facecolors="blue", edgecolors="black", alpha=0.75)

TypeError: unhashable type: 'numpy.ndarray'

In [None]:
# The y limits of our scatter plot is 0 to 1
plt.ylim(0, 1)

In [None]:
# The x limits of our scatter plot
plt.xlim(0, x_limit)

In [None]:
# save and display the chart
plt.savefig("figures/scatter_plot.png")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
