## Observations and Insights 

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata and the study results into DataFrames
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Outer-join the two tables on "Mouse ID" so rows from either file are kept
mouse_study = mouse_metadata.merge(study_results, on='Mouse ID', how='outer')

# Preview the first rows of the combined table
mouse_study.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


### Raw Data Tables

mouse_metadata

In [23]:
# Display the raw study results table that was read from Study_results.csv
study_results

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.000000,0
1,f932,0,45.000000,0
2,g107,0,45.000000,0
3,a457,0,45.000000,0
4,c819,0,45.000000,0
...,...,...,...,...
1888,r944,45,41.581521,2
1889,u364,45,31.023923,3
1890,p438,45,61.433892,1
1891,x773,45,58.634971,4


In [26]:
#KEEP

# Count how many distinct mice appear in the merged study table.

# "Mouse ID" repeats once per recorded row; collecting the IDs into a set
# discards the repeats, so the size of the set is the number of distinct mice.
number_of_unique_mice = len(set(mouse_study["Mouse ID"]))

print(f"There are {number_of_unique_mice} mice in the study.")


There are 249 mice in the study.


In [38]:
#KEEP
# Finding the duplicate mice
# Any (Mouse ID, Timepoint) pair that occurs more than once marks duplicated data.

# rows whose (Mouse ID, Timepoint) pair already appeared earlier in the table
duplicate_rows = mouse_study[mouse_study.duplicated(['Mouse ID', 'Timepoint'])]

print("The duplicated rows based on two columns are:")
print(f"{duplicate_rows}")
print("")

# Read the ID from the first duplicated row, selecting the column by label.
# This works even when only one duplicated row exists and does not depend on
# "Mouse ID" being the first column of the table.
# NOTE: only one mouse ID is extracted; if several mice had duplicated rows,
# the others would need to be handled as well.
mouse_duplicate_id = duplicate_rows["Mouse ID"].iloc[0]

print(f"The ID number of the mouse with duplicated data is {mouse_duplicate_id}.")

The duplicated rows based on two columns are:
    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
909     g989     Propriva  Female          21          26          0   
911     g989     Propriva  Female          21          26          5   
913     g989     Propriva  Female          21          26         10   
915     g989     Propriva  Female          21          26         15   
917     g989     Propriva  Female          21          26         20   

     Tumor Volume (mm3)  Metastatic Sites  
909           45.000000                 0  
911           47.570392                 0  
913           49.880528                 0  
915           53.442020                 0  
917           54.657650                 1  

The ID number of the mouse with duplicated data is g989.


In [40]:
#KEEP

# Build the clean DataFrame: keep every row whose Mouse ID is not the duplicated mouse's ID.
is_not_duplicate_mouse = mouse_study["Mouse ID"] != mouse_duplicate_id
clean_mouse_data = mouse_study.loc[is_not_duplicate_mouse, :]


In [41]:
#KEEP
# Confirm that the clean DataFrame now contains 248 distinct mice.

# a set keeps each Mouse ID once, so its size is the number of distinct mice
number_of_unique_mice = len(set(clean_mouse_data["Mouse ID"]))
print(f"There are {number_of_unique_mice} mice in the study after removing the mouse with duplicate data.")

There are 248 mice in the study after removing the mouse with duplicate data.


In [15]:
# Count the non-missing Mouse ID entries in the mouse_metadata dataframe

has_mouse_id = mouse_metadata["Mouse ID"].notna()
number_of_mice = has_mouse_id.sum()
print(f"There are {number_of_mice} mice ID's recorded in the Dataframe.")



There are 249 mice ID's recorded in the Dataframe


In [None]:
# Number of rows recorded for each Mouse ID in the merged table
mouse_study["Mouse ID"].value_counts()

In [None]:
# Flag each Mouse ID that has exactly one row in the merged table, then tally the
# True/False flags. The result is given a descriptive name because calling it
# `list` would shadow Python's built-in list type for the rest of the session.
appears_once = (mouse_study["Mouse ID"].value_counts() == 1)
appears_once.value_counts()
#So there are 12 mice that only appear once

In [None]:
# Flag each Mouse ID that has exactly two rows in the merged table, then tally the
# True/False flags. The result is given a descriptive name because calling it
# `list` would shadow Python's built-in list type for the rest of the session.
appears_twice = (mouse_study["Mouse ID"].value_counts() == 2)
appears_twice.value_counts()
#So there are 15 mice that only appear twice

In [None]:
# Flag each Mouse ID that has exactly ten rows in the merged table, then tally the
# True/False flags. The result is given a descriptive name because calling it
# `list` would shadow Python's built-in list type for the rest of the session.
appears_ten_times = (mouse_study["Mouse ID"].value_counts() == 10)
appears_ten_times.value_counts()
# there are 130 cases where the mouse has 10 datapoints, which it should for the time points
# open question: keep only those mice, or just remove the duplicates?

In [None]:
# Drop every row that belongs to the mouse with duplicated timepoints.
# NOTE(review): this assignment originally had no right-hand side, which is a
# SyntaxError. The filter below mirrors the one used to build clean_mouse_data;
# confirm that this is the intended cleaning rule for cleaned_data.
cleaned_data = mouse_study.loc[mouse_study["Mouse ID"] != mouse_duplicate_id, :]

In [None]:
# Pull every row recorded for mouse "g989" so it can be inspected
is_g989 = mouse_study["Mouse ID"] == "g989"
test = mouse_study.loc[is_g989]
test

In [None]:
# Pull every row recorded for mouse "n482" so it can be inspected
is_n482 = mouse_study["Mouse ID"] == "n482"
test2 = mouse_study.loc[is_n482]
test2

In [18]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.

# rows whose (Mouse ID, Timepoint) pair already appeared earlier in the table
duplicate_rows = mouse_study[mouse_study.duplicated(['Mouse ID', 'Timepoint'])]

# the "Mouse ID" column of those duplicated rows
mouse_duplicate = duplicate_rows['Mouse ID']

print("Duplicate Rows based on 2 columns are:", duplicate_rows, sep='\n')

# Take the ID from the first duplicated row via the labelled column, so this works
# with a single duplicated row and does not depend on the column order.
mouse_duplicate_id = mouse_duplicate.iloc[0]

print(f"The ID number of the mouse with duplicated data is {mouse_duplicate_id}")

Duplicate Rows based on 2 columns are:
    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
909     g989     Propriva  Female          21          26          0   
911     g989     Propriva  Female          21          26          5   
913     g989     Propriva  Female          21          26         10   
915     g989     Propriva  Female          21          26         15   
917     g989     Propriva  Female          21          26         20   

     Tumor Volume (mm3)  Metastatic Sites  
909           45.000000                 0  
911           47.570392                 0  
913           49.880528                 0  
915           53.442020                 0  
917           54.657650                 1  
The ID number of the mouse with duplicated data is g989


In [20]:
# Optional: Get all the data for the duplicate mouse ID.
# NOTE: mouse_duplicate holds only the "Mouse ID" column of the duplicated rows,
# so this displays the repeated IDs rather than every column for that mouse.
mouse_duplicate


909    g989
911    g989
913    g989
915    g989
917    g989
Name: Mouse ID, dtype: object

In [None]:
# Create a clean DataFrame: keep only the rows whose Mouse ID is not the duplicated mouse's ID.
keep_rows = mouse_study["Mouse ID"] != mouse_duplicate_id
clean_mouse_data = mouse_study.loc[keep_rows, :]
clean_mouse_data

In [None]:
# Count the distinct mice left in the clean DataFrame.

# a set keeps each Mouse ID once, so its size is the number of distinct mice
number_of_unique_mice = len(set(clean_mouse_data["Mouse ID"]))
print(f"There are {number_of_unique_mice} mice in the study after removing the mouse with duplicate data.")

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, mode, variance, standard deviation,
# and SEM of the tumor volume for each regimen

# This method is the most straightforward: build one Series per statistic and
# combine them into a single DataFrame at the end.

# grouping the mouse data by regimen
grouped_mouse_data = clean_mouse_data.groupby(['Drug Regimen'])
tumour_volume_by_regimen = grouped_mouse_data["Tumor Volume (mm3)"]

mean = tumour_volume_by_regimen.mean()
median = tumour_volume_by_regimen.median()

# Series.mode() returns every tied value (sorted); take the first one so each
# regimen gets a single number rather than an array when several values tie.
# NOTE(review): the previewed rows show 45.0 recorded at Timepoint 0, so 45.0 coming
# out as the mode for every regimen is plausible - confirm against the full data.
mode = tumour_volume_by_regimen.agg(lambda volumes: volumes.mode().iloc[0])

variance = tumour_volume_by_regimen.var()
standard_deviation = tumour_volume_by_regimen.std()
sem = tumour_volume_by_regimen.sem()

# one column per statistic, one row per drug regimen
summary_mouse_df = pd.DataFrame({"Mean": mean,
                                "Median": median,
                                "Mode": mode,
                                "Variance": variance,
                                "Standard Deviation": standard_deviation,
                                "SEM": sem})
summary_mouse_df

In [None]:
# Generate a summary statistics table of mean, median, mode, variance, standard deviation,
# and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby aggregation.
# Named aggregation maps each output column name to its statistic, so no rename
# step is needed. The statistics are given as pandas' own string names, which
# avoids needing numpy here and keeps pandas' sample (ddof=1) variance and
# standard deviation.
summary = grouped_mouse_data["Tumor Volume (mm3)"].agg(**{
    "Mean": "mean",
    "Median": "median",
    # first of the (sorted) modes, so ties still give a single number
    "Mode": lambda volumes: volumes.mode().iloc[0],
    "Variance": "var",
    "Standard Deviation": "std",
    "SEM": "sem",
})

summary

In [None]:
# Alternative summary table that computes the mode with scipy.stats.mode


def _scipy_mode(volumes):
    """Return the modal value reported by scipy.stats.mode as a plain scalar."""
    # st.mode returns a result object (mode, count). Its .mode field is a
    # one-element array in some SciPy versions and a scalar in others;
    # np.ravel(...)[0] yields a single number in both cases.
    return np.ravel(st.mode(volumes.to_numpy()).mode)[0]


# Named aggregation maps each output column name to its statistic, so no rename
# step is needed; pandas' string names keep the sample (ddof=1) variance and std.
summary2 = grouped_mouse_data["Tumor Volume (mm3)"].agg(**{
    "Mean": "mean",
    "Median": "median",
    "Mode": _scipy_mode,
    "Variance": "var",
    "Standard Deviation": "std",
    "SEM": "sem",
})

summary2

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.

# Each mouse should be counted once, so reduce the clean data to a single row per
# mouse by keeping only the first occurrence of every Mouse ID.
unique_mice_df = clean_mouse_data.drop_duplicates(subset='Mouse ID', keep='first')

# Count rows per drug regimen; the "Mouse ID" column of the result is the number
# of mice on each treatment.
grouped_regimen_df = unique_mice_df.groupby(["Drug Regimen"]).count()
mice_per_treatment = grouped_regimen_df["Mouse ID"]

# Draw the bar chart through the pandas plotting accessor and label the y-axis
mice_per_treatment_chart = mice_per_treatment.plot.bar(title="Number of Mice Per Treatment", figsize=(10, 5))
mice_per_treatment_chart.set_ylabel("Number of Mice")
plt.tight_layout()



In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
# plt.bar takes the x positions and the bar heights; both are derived from grouped_regimen_df.

# Setting x axis positions (0, 1, 2, ...) and tick locations, one per regimen
x_axis = np.arange(len(grouped_regimen_df))
tick_locations = [value for value in x_axis]

# The figure size is chosen when the figure is created
plt.figure(figsize=(10, 5))

# Bar heights are the "Mouse ID" counts; tick labels are the regimen names from the index
plt.bar(x_axis, grouped_regimen_df["Mouse ID"], align="center")
plt.xticks(tick_locations, grouped_regimen_df.index, rotation="vertical")

# Set x limits for neatness of x-axis
plt.xlim(-0.75, len(x_axis) - 0.25)

# Setting a title and labels
plt.title("Number of Mice Per Treatment")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")

# tight_layout runs last so the title, axis labels and rotated tick labels are
# all taken into account when the margins are computed
plt.tight_layout()
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# unique_mice_df is used again because each mouse should be counted only once

# Count rows per sex; the "Mouse ID" column of the result is the number of mice of each sex
grouped_by_sex_df = unique_mice_df.groupby(["Sex"]).count()
mice_per_sex = grouped_by_sex_df["Mouse ID"]

# autopct prints each wedge's percentage to one decimal place
mice_per_sex_chart = mice_per_sex.plot(kind = "pie", title = "Distribution of Female Versus Male Mice", autopct="%1.1f%%")

# pandas labels the pie's y-axis with the Series name ("Mouse ID"); clear it on the
# pie's own axes (not on the earlier bar chart's axes) so the label disappears.
mice_per_sex_chart.set_ylabel("")
plt.tight_layout()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Wedge sizes come from the per-sex "Mouse ID" counts; wedge labels come from the
# index of grouped_by_sex_df (the sex values).
sex_counts = grouped_by_sex_df["Mouse ID"]
sex_labels = grouped_by_sex_df.index

# autopct prints each wedge's percentage to one decimal place
distribution_of_sex_chart = plt.pie(sex_counts, labels=sex_labels, autopct="%1.1f%%")
plt.title("Distribution of Female Versus Male Mice")



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# clean_mouse_data (the merged table with the duplicated mouse removed) already holds
# the tumour volume at every timepoint, so it is enough to pick, for each mouse,
# the row recorded at its greatest Timepoint.

# index label of the row with the greatest Timepoint for each mouse
last_timepoint_rows = clean_mouse_data.groupby("Mouse ID")["Timepoint"].idxmax()

# Selecting by the explicit maximum (instead of "the last row seen for each mouse")
# means the result does not depend on the rows being sorted by Timepoint.
# sort_index() keeps the rows in their original table order.
last_timepoint_df = clean_mouse_data.loc[last_timepoint_rows].sort_index()
last_timepoint_df


In [None]:
# Tumour volume column of last_timepoint_df (one value per row of that frame)
last_timepoint_df["Tumor Volume (mm3)"]

In [None]:
# Put treatments into a list for the loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# One list of final tumour volumes per treatment, in the same order as `treatments`
# (filled in by the loop below and used for plotting)
tumour_volume_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for treatment in treatments:

    # final tumour volumes of the mice on this treatment
    is_treatment = last_timepoint_df["Drug Regimen"] == treatment
    tumour_data = last_timepoint_df.loc[is_treatment, "Tumor Volume (mm3)"]

    # quartiles and interquartile range
    lowerq = tumour_data.quantile(0.25)
    upperq = tumour_data.quantile(0.75)
    iqr = upperq - lowerq

    # values more than 1.5 * IQR beyond the quartiles are treated as potential outliers
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    print(f"Values below {lower_bound} could be outliers for {treatment}.")
    print(f"Values above {upper_bound} could be outliers for {treatment}.")

    # tumour volumes falling outside the bounds (a Series) and the same values as a list
    outliers = tumour_data[(tumour_data < lower_bound) | (tumour_data > upper_bound)]
    outliers_value = outliers.tolist()

    if not outliers_value:
        print(f"There are no outliers for {treatment}.")
    elif len(outliers_value) == 1:
        print(f"There is one outlier for {treatment}. It is {outliers_value[0]}.")
    elif len(outliers_value) == 2:
        print(f"There are two outliers for {treatment}. They are {outliers_value[0]} and {outliers_value[1]}.")
    else:
        print(f"There are more than two outliers for {treatment}.")

    # adding the tumour data for the individual treatment to the list for plotting
    tumour_volume_data.append(tumour_data.tolist())


In [None]:
# Repeat the outlier calculation for a single treatment to inspect the intermediate values.
# The treatment name is set explicitly here so the printed messages match the data being
# analysed, and tumour_volume_data is deliberately left untouched because the box plot
# needs one entry per treatment.
treatment = "Infubinol"

# final tumour volumes of the mice on this treatment
treatment_data = last_timepoint_df.loc[(last_timepoint_df["Drug Regimen"] == treatment), :]
tumour_data = treatment_data["Tumor Volume (mm3)"]

# quartiles, median and interquartile range
quartiles = tumour_data.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of {treatment} is: {lowerq}")
print(f"The upper quartile of {treatment} is: {upperq}")
print(f"The interquartile range of {treatment} is: {iqr}")
print(f"The the median of {treatment} is: {quartiles[0.5]} ")

# values more than 1.5 * IQR beyond the quartiles are treated as potential outliers
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers for {treatment}.")
print(f"Values above {upper_bound} could be outliers for {treatment}.")

# tumour volumes falling outside the bounds (a Series) and the same values as a list
outliers = tumour_data[(tumour_data < lower_bound) | (tumour_data > upper_bound)]
outliers_value = outliers.tolist()

if not outliers_value:
    print(f"There are no outliers for {treatment}.")
elif len(outliers_value) == 1:
    print(f"There is one outlier for {treatment}. It is {outliers_value[0]}.")
elif len(outliers_value) == 2:
    print(f"There are two outliers for {treatment}. They are {outliers_value[0]} and {outliers_value[1]}.")
else:
    print(f"There are more than two outliers for {treatment}.")

# show the outlier rows (index label and tumour volume) as the cell output
outliers

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
# tumour_volume_data holds the data: a list containing one list of tumour volumes per treatment

# marker style for the outlier (flier) points: fuchsia-filled circles
flier_style = {"markerfacecolor": "fuchsia", "marker": "o"}

fig1, ax1 = plt.subplots()
ax1.boxplot(tumour_volume_data, flierprops=flier_style)

# label each box with its treatment name, then title the plot and the y-axis
ax1.set_xticklabels(treatments)
ax1.set_title('Final Tumor Volumes Across Four Regimens')
ax1.set_ylabel('Tumor Volume (mm3)')
plt.show()



## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

# rows for the Capomulin regimen, keeping only the columns needed for the plot
capomulin_data = clean_mouse_data.loc[
    clean_mouse_data["Drug Regimen"] == "Capomulin",
    ["Mouse ID", "Timepoint", "Tumor Volume (mm3)"],
]

# picking the mouse s185
s185_data_df = capomulin_data.loc[
    capomulin_data["Mouse ID"] == "s185",
    ["Timepoint", "Tumor Volume (mm3)"],
]

# Single series, so the legend is dropped and the tumour volume is named on the y-axis instead
graph = s185_data_df.plot(
    kind="line",
    x="Timepoint",
    y="Tumor Volume (mm3)",
    legend=False,
    title="Time Point VS. Tumour Volume (Mouse ID: s185)",
)
graph.set_xlabel("Timepoint")
graph.set_ylabel("Tumor Volume (mm3)")
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# rows for the Capomulin regimen, keeping the regimen, weight and tumour volume columns
capomulin_weight_df = clean_mouse_data.loc[
    clean_mouse_data["Drug Regimen"] == "Capomulin",
    ["Drug Regimen", "Weight (g)", "Tumor Volume (mm3)"],
]

# Average only the tumour volume within each weight. Averaging the whole frame would
# also try to average the text "Drug Regimen" column, which raises TypeError on
# pandas >= 2.0. as_index=False keeps "Weight (g)" as the first column and the
# averaged "Tumor Volume (mm3)" as the second.
# NOTE(review): this averages across all rows sharing a weight, not per mouse -
# confirm that per-weight averaging is the intended reading of the task.
group_by_weight_df = capomulin_weight_df.groupby("Weight (g)", as_index=False)["Tumor Volume (mm3)"].mean()

chart = group_by_weight_df.plot(kind="scatter", x="Weight (g)", y="Tumor Volume (mm3)", grid=True, figsize=(8,8), title="Mouse Weight Vs. Average Tumour Volume")
plt.show()

In [None]:
# The same scatter plot, drawn directly with matplotlib

# x values come from the first column of group_by_weight_df and y values from the second
weight, tumour_volume = group_by_weight_df.iloc[:, 0], group_by_weight_df.iloc[:, 1]

plt.scatter(weight, tumour_volume)
plt.title("Mouse Weight Vs. Average Tumour Volume")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
# (still based on group_by_weight_df)

# setting variables, selected by column label so the column order does not matter
weight = group_by_weight_df["Weight (g)"]
tumour_volume = group_by_weight_df["Tumor Volume (mm3)"]

# correlation: pearsonr returns the coefficient first
correlation = st.pearsonr(weight,tumour_volume)
print(f"The correlation between both factors is {round(correlation[0],2)}")

# calculating the regression with the scipy.stats module already imported as `st`
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(weight, tumour_volume)
regress_values = weight * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(weight, tumour_volume)
plt.plot(weight,regress_values,"r-")
plt.title("Mouse Weight Vs. Average Tumour Volume (With Regression Line)")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")

# Place the equation relative to the axes (top-left corner) rather than at fixed
# data coordinates, so it stays visible whatever the data range is.
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=12, color="red")
plt.show()