## Observations and Insights 

%matplotlib widget

In [1]:
%matplotlib widget

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
from sklearn import datasets
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combined_df = pd.merge(mouse_metadata,study_results, how="outer", on="Mouse ID")
# Display the data table for preview
combined_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [3]:
# Checking the number of mice.
mice_number = combined_df["Mouse ID"].nunique()
mice_number

249

In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

combined_df.loc[combined_df.duplicated(subset=["Mouse ID","Timepoint"])==True, "Mouse ID"]

909    g989
911    g989
913    g989
915    g989
917    g989
Name: Mouse ID, dtype: object

In [5]:
# Optional: Get all the data for the duplicate mouse ID. 
combined_df.loc[combined_df.duplicated(subset=["Mouse ID","Timepoint"])==True, :]

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [6]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
cleaned_df = combined_df.drop_duplicates(subset=["Mouse ID","Timepoint"],keep="last")
cleaned_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [7]:
# Checking the number of mice in the clean DataFrame.
cleaned_mice_number = cleaned_df["Mouse ID"].nunique()
cleaned_mice_number

249

## Summary Statistics

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straighforward, creating multiple series and putting them all together at the end.
drug_regimen=cleaned_df["Drug Regimen"].unique()
drug_regimen

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [9]:
drug_regimen_mean =[]
drug_regimen_median =[]
drug_regimen_var =[]
drug_regimen_std =[]
drug_regimen_sem =[]
for drug in drug_regimen:
    drug_regimen_mean.append(cleaned_df.loc[cleaned_df["Drug Regimen"]==drug,"Tumor Volume (mm3)"].mean())
    drug_regimen_median.append(cleaned_df.loc[cleaned_df["Drug Regimen"]==drug,"Tumor Volume (mm3)"].median())
    drug_regimen_var.append(cleaned_df.loc[cleaned_df["Drug Regimen"]==drug,"Tumor Volume (mm3)"].var())
    drug_regimen_std.append(cleaned_df.loc[cleaned_df["Drug Regimen"]==drug,"Tumor Volume (mm3)"].std())
    drug_regimen_sem.append(cleaned_df.loc[cleaned_df["Drug Regimen"]==drug,"Tumor Volume (mm3)"].sem())
summary_stats = pd.DataFrame({
    "Drug Regimen": drug_regimen,
    "Mean": drug_regimen_mean,
    "Median":drug_regimen_median,
    "Variance":drug_regimen_var,
    "Standard Deviation":drug_regimen_std,
    "SEM":drug_regimen_sem
})
summary_stats

Unnamed: 0,Drug Regimen,Mean,Median,Variance,Standard Deviation,SEM
0,Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
1,Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
2,Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
3,Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
4,Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
5,Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
6,Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398
7,Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
8,Propriva,52.382993,50.783528,43.220205,6.574208,0.526358
9,Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466


In [10]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method produces everything in a single groupby function
grouped_regimen = cleaned_df.groupby(["Drug Regimen"])
grouped_regimen_df= pd.DataFrame(grouped_regimen["Tumor Volume (mm3)"].count())
grouped_regimen_df["Mean"]= pd.DataFrame(grouped_regimen["Tumor Volume (mm3)"].mean())
grouped_regimen_df["Median"] = pd.DataFrame(grouped_regimen["Tumor Volume (mm3)"].median())
grouped_regimen_df["Variance"] = pd.DataFrame(grouped_regimen["Tumor Volume (mm3)"].var())
grouped_regimen_df["Standard Deviation"] = pd.DataFrame(grouped_regimen["Tumor Volume (mm3)"].std())
grouped_regimen_df["SEM"] = pd.DataFrame(grouped_regimen["Tumor Volume (mm3)"].sem())

grouped_regimen_df

Unnamed: 0_level_0,Tumor Volume (mm3),Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Capomulin,230,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,178,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,178,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,188,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,186,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,181,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,156,52.382993,50.783528,43.220205,6.574208,0.526358
Ramicane,228,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,181,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,182,53.236507,51.818479,48.533355,6.966589,0.516398


In [11]:
grouped_regimen_df=grouped_regimen_df.drop(columns=["Tumor Volume (mm3)"])
grouped_regimen_df

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.382993,50.783528,43.220205,6.574208,0.526358
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [12]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
regimen_mice_number = grouped_regimen["Mouse ID"].nunique()
regimen_mice_number 

Drug Regimen
Capomulin    25
Ceftamin     25
Infubinol    25
Ketapril     25
Naftisol     25
Placebo      25
Propriva     25
Ramicane     25
Stelasyn     24
Zoniferol    25
Name: Mouse ID, dtype: int64

In [13]:

regimen_mice_number.plot(kind="bar", figsize=(6,4))

# Set a title for the chart
plt.title("Total number of mice for Drug Regimen")
plt.show()
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [14]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
x_axis = np.arange(len(regimen_mice_number))
tick_locations = [value for value in x_axis]
fig1, ax1 = plt.subplots()
plt.bar(x_axis, regimen_mice_number, color='r', alpha=0.5, align="center")
plt.xticks(tick_locations, regimen_mice_number.index, rotation="vertical")
ax1.set_title("Total number of mice for Drug Regimen")
ax1.set_ylabel('Total mice number')
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
grouped_gender = cleaned_df.groupby("Sex")
grouped_gender = grouped_gender["Mouse ID"].nunique()
grouped_gender

Sex
Female    124
Male      125
Name: Mouse ID, dtype: int64

In [16]:
gender_list = grouped_gender.index
fig1, ax1 = plt.subplots()
gender_pie = grouped_gender.plot(kind="pie", y=gender_list,title="Distribution of female versus male mice")
gender_pie.set_ylabel("Mice number")
plt.tight_layout()
plt.axis("equal")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [17]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
colors = ["yellowgreen","lightcoral"]
explode = (0, 0.05)
fig1, ax1 = plt.subplots()
plt.title("Distribution of female versus male mice")
plt.pie(grouped_gender, explode=explode, labels=gender_list, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=90)
plt.axis("equal")
plt.show()


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [18]:
## Quartiles, Outliers and Boxplots

In [19]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
four_regimens_df = cleaned_df.loc[(cleaned_df["Drug Regimen"]=="Capomulin")|
                                  (cleaned_df["Drug Regimen"]=="Ramicane")|
                                  (cleaned_df["Drug Regimen"]=="Infubinol")|
                                  (cleaned_df["Drug Regimen"]=="Ceftamin"), :]
four_regimens_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1868,z581,Infubinol,Female,24,25,25,54.316407,2
1869,z581,Infubinol,Female,24,25,30,56.286750,2
1870,z581,Infubinol,Female,24,25,35,58.628399,3
1871,z581,Infubinol,Female,24,25,40,60.053740,3


In [20]:
# Start by getting the last (greatest) timepoint for each mouse
grouped_four_regimens = four_regimens_df.groupby(["Drug Regimen","Mouse ID"],as_index=True)
max_timepoint= pd.DataFrame(grouped_four_regimens["Timepoint"].max())
max_timepoint

Unnamed: 0_level_0,Unnamed: 1_level_0,Timepoint
Drug Regimen,Mouse ID,Unnamed: 2_level_1
Capomulin,b128,45
Capomulin,b742,45
Capomulin,f966,20
Capomulin,g288,45
Capomulin,g316,45
...,...,...
Ramicane,s508,45
Ramicane,u196,45
Ramicane,w678,5
Ramicane,y449,15


In [21]:
# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
grouped_combined = pd.merge(max_timepoint,four_regimens_df, how="left", on=["Drug Regimen","Mouse ID","Timepoint"])
grouped_combined.head()

Unnamed: 0,Drug Regimen,Mouse ID,Timepoint,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
0,Capomulin,b128,45,Female,9,22,38.982878,2
1,Capomulin,b742,45,Male,7,21,38.939633,0
2,Capomulin,f966,20,Male,16,17,30.485985,0
3,Capomulin,g288,45,Male,3,19,37.074024,1
4,Capomulin,g316,45,Female,22,22,40.15922,2


In [22]:
# Put treatments into a list for for loop (and later for plot labels)
treatments = ['Capomulin', 'Ceftamin', 'Infubinol', 'Ramicane']

In [23]:
# Create empty list to fill with tumor vol data (for plotting)
tumorvol_data = []
for volume in grouped_combined["Tumor Volume (mm3)"]:
    tumorvol_data.append(volume)
tumorvol_data

[38.98287774,
 38.93963263,
 30.48598484,
 37.07402422,
 40.1592203,
 47.68596303,
 37.31184577,
 38.125164399999996,
 38.75326548,
 41.48300765,
 38.84687569,
 28.43096411,
 33.32909778,
 46.53920617,
 32.37735684,
 41.58152074,
 23.34359787,
 40.72857787,
 34.45529799,
 31.02392294,
 40.65812366,
 39.95234669,
 36.04104736,
 28.48403281,
 31.896238399999998,
 62.99935619,
 45.0,
 56.05774909,
 55.74282869,
 48.72207785,
 47.784681799999994,
 61.84902336,
 68.92318457,
 67.74866174,
 57.91838132,
 46.78453491,
 59.85195552,
 64.29983003,
 59.74190064,
 61.43389223,
 64.19234114,
 45.0,
 52.92534846,
 45.0,
 67.52748237,
 45.0,
 64.63494887,
 61.38666032,
 68.59474498,
 64.72983655,
 67.97341878,
 65.52574285,
 57.03186187,
 66.08306589,
 72.2267309,
 36.321345799999996,
 60.96971133,
 62.43540402,
 60.91876652,
 67.28962147,
 66.19691151,
 62.11727887,
 47.01036401,
 60.16518046,
 55.62942846,
 45.69933088,
 54.65654872,
 55.65068132,
 46.25011212,
 54.04860769,
 51.54243058,
 50.0051

In [24]:
# Calculate the IQR and quantitatively determine if there are any potential outliers. 
    # Locate the rows which contain mice on each drug and get the tumor volumes
    # add subset 
    # Determine outliers using upper and lower bounds
quartiles = grouped_combined["Tumor Volume (mm3)"].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
outliers=[]
outliers = [outliers.append(data) for data in tumorvol_data if data<lower_bound]
outliers = [outliers.append(data) for data in tumorvol_data if data>upper_bound]
outliers

[]

In [25]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()
ax1.set_title('Tumor volume of mice across four regimens')
ax1.set_ylabel('Tumor volume')
ax1.boxplot(tumorvol_data)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Line and Scatter Plots

In [26]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
lineplot_df = cleaned_df.loc[(cleaned_df["Drug Regimen"]=="Capomulin")&
                             (cleaned_df["Mouse ID"]=="b128"), ["Timepoint","Tumor Volume (mm3)"]]
lineplot_df.set_index("Timepoint")
lineplot_df.plot(kind="line", x = "Timepoint", y="Tumor Volume (mm3)")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7fbdc0243630>

In [35]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

capomulin_regimen = cleaned_df.loc[cleaned_df["Drug Regimen"]=="Capomulin", :]
grouped_mouseid = capomulin_regimen.groupby("Mouse ID")
average_tumorvolume = grouped_mouseid["Tumor Volume (mm3)"].mean()
mice_weight = grouped_mouseid["Weight (g)"].mean()
scatter_df = pd.DataFrame({
             "Mouse Weight": mice_weight,
             "Average Tumor Volume": average_tumorvolume
})

scatter_df.plot(kind="scatter", x = "Mouse Weight", y="Average Tumor Volume",
                title="Weight Vs. Tomor Volume")


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7fbdc125dba8>

## Correlation and Regression

In [37]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

correlation = st.pearsonr(scatter_df.iloc[:,0],scatter_df.iloc[:,1])
print(f"The correlation coefficient between mouse weight and average tomor volume is {round(correlation[0],2)}")

The correlation coefficient between mouse weight and average tomor volume is 0.84


In [39]:
x_values = scatter_df["Mouse Weight"]
y_values = scatter_df["Average Tumor Volume"]
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
line_eq

'y = 0.95x + 21.55'