# Practice Notebook

## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two CSV inputs (relative paths)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Outer-join the two tables on their shared "Mouse ID" column
mouse_study = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")

# Preview the first rows of the combined table
mouse_study.head()

In [None]:
# Scratch check: non-null counts of every column, split by drug regimen
test = mouse_study.groupby(by=["Drug Regimen"])
test.count()

In [None]:
# Count how many distinct mice appear in the merged table.

# Every "Mouse ID" entry (one per measurement row) as an array
mouse_ids = mouse_study["Mouse ID"].to_numpy()

# Collapsing the array into a set leaves one entry per distinct ID
unique_mice = {*mouse_ids}

# The size of that set is the number of mice
number_of_unique_mice = len(unique_mice)

print(f"There are {number_of_unique_mice} mice in the study.")


In [None]:
# Finding the duplicate mice
# Seeing which Mouse ID numbers have duplicated Timepoint values.

# Rows whose (Mouse ID, Timepoint) pair already appeared earlier in the table
duplicate_rows = mouse_study[mouse_study.duplicated(['Mouse ID', 'Timepoint'])]

print("The duplicated rows based on two columns are:")
print(f"{duplicate_rows}")
print("")

# Read the ID by column name from the first duplicated row. The previous
# positional lookup (row 1, column 0) raised IndexError when only one row was
# duplicated and silently depended on "Mouse ID" being the first column.
# NOTE(review): this assumes a single mouse has duplicated data; if several
# IDs appear in duplicate_rows only the first one is used.
mouse_duplicate_id = duplicate_rows["Mouse ID"].iloc[0]

print(f"The ID number of the mouse with duplicated data is {mouse_duplicate_id}.")

In [None]:
# Build the cleaned table: keep every row except those of the duplicated mouse
clean_mouse_data = mouse_study[mouse_study["Mouse ID"].ne(mouse_duplicate_id)]

In [None]:
# Re-count the distinct mice in the cleaned table (the original note expects 248).

mouse_ids = clean_mouse_data["Mouse ID"].to_numpy()
unique_mice = {*mouse_ids}
number_of_unique_mice = len(unique_mice)
print(f"There are {number_of_unique_mice} mice in the study after removing the mouse with duplicate data.")

In [None]:
# Scratch check: show every row recorded for mouse "g989"
test = mouse_study[mouse_study["Mouse ID"].eq("g989")]
test

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.

# Rows whose (Mouse ID, Timepoint) pair already appeared earlier in the table
duplicate_rows = mouse_study[mouse_study.duplicated(['Mouse ID', 'Timepoint'])]

# The "Mouse ID" value of each duplicated row (one entry per duplicated row)
mouse_duplicate = duplicate_rows['Mouse ID']

print("Duplicate Rows based on 2 columns are:", duplicate_rows, sep='\n')

# Take the ID from the first duplicated row, by column name. The previous
# positional lookup (row 1, column 0) raised IndexError when only one row was
# duplicated and depended on "Mouse ID" being the first column.
# NOTE(review): assumes only one mouse has duplicated data.
mouse_duplicate_id = mouse_duplicate.iloc[0]

print(f"The ID number of the mouse with duplicated data is {mouse_duplicate_id}")

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# mouse_duplicate only holds the IDs of the duplicated rows, so it is used as a
# filter on the full table to show every measurement row of the affected mouse.
mouse_study.loc[mouse_study["Mouse ID"].isin(mouse_duplicate.unique())]


In [None]:
# Build the cleaned table by excluding every row of the duplicated mouse,
# then display it.
clean_mouse_data = mouse_study[mouse_study["Mouse ID"].ne(mouse_duplicate_id)]
clean_mouse_data

In [None]:
# Count the distinct mice left in the cleaned table.

mouse_ids = clean_mouse_data["Mouse ID"].to_numpy()
unique_mice = {*mouse_ids}
number_of_unique_mice = len(unique_mice)
print(f"There are {number_of_unique_mice} mice in the study after removing the mouse with duplicate data.")

## Summary Statistics

In [None]:
# option 1
# Summary statistics table (mean, median, mode, variance, standard deviation
# and SEM) of the tumor volume for each regimen.

# Most straightforward approach: compute each statistic as its own Series and
# assemble the Series into one table at the end.

# grouping the mouse data by regimen
grouped_mouse_data = clean_mouse_data.groupby(['Drug Regimen'])

# the tumour-volume column of every regimen group, shared by all statistics
tumour_by_regimen = grouped_mouse_data["Tumor Volume (mm3)"]

mean = tumour_by_regimen.mean()
median = tumour_by_regimen.median()

# mode, computed two ways: pandas' Series.mode, and the first (most frequent)
# entry reported by value_counts
mode = tumour_by_regimen.agg(pd.Series.mode)
mode2 = tumour_by_regimen.agg(lambda volumes: volumes.value_counts().index[0])

# NOTE(review): both mode approaches reportedly give 45 as the mode for every
# regimen; the original author was unsure whether that is correct. Presumably
# many rows share a common starting volume - verify against the data.

variance = tumour_by_regimen.var()
standard_deviation = tumour_by_regimen.std()
sem = tumour_by_regimen.sem()

# one column per statistic, indexed by regimen
summary_columns = {
    "Mean": mean,
    "Median": median,
    "Mode": mode,
    "Variance": variance,
    "Standard Deviation": standard_deviation,
    "SEM": sem,
}
summary_mouse_df = pd.DataFrame(summary_columns)
summary_mouse_df

In [None]:
# Option 2, simpler
# Summary statistics table of mean, median, mode, variance, standard deviation
# and SEM of the tumor volume for each regimen.

# This method produces everything in a single groupby aggregation.
import numpy as np  # kept here because later cells rely on np being in scope

# grouping the mouse data by regimen
grouped_mouse_data = clean_mouse_data.groupby(['Drug Regimen'])

# Aggregate the tumour-volume column and give the columns display names.
# The reductions are named as strings rather than passed as np.mean / np.median /
# np.var / np.std: pandas deprecates handing those NumPy callables to agg, and
# the string names keep pandas' own implementations (sample variance / standard
# deviation) regardless of pandas version.
summary = (
    grouped_mouse_data["Tumor Volume (mm3)"]
    .agg(["mean", "median", pd.Series.mode, "var", "std", "sem"])
    .rename(columns={"mean": "Mean", "median": "Median", "mode": "Mode", "var": "Variance", "std": "Standard Deviation", "sem": "SEM"})
)

# adding a second index level above the column names to indicate that the data is regarding tumour volume
summary.columns = pd.MultiIndex.from_product([['Tumour Volume:'], summary.columns])

summary

In [None]:
# another option = not as nice visually (the mode comes from scipy's st.mode)
import numpy as np  # kept here because later cells rely on np being in scope

# grouping the mouse data by regimen
grouped_mouse_data = clean_mouse_data.groupby(['Drug Regimen'])

# The pandas reductions are named as strings rather than passed as np.mean /
# np.median / np.var / np.std: pandas deprecates handing those NumPy callables
# to agg, and the string names keep pandas' own implementations.
# st.mode is still passed as a callable; its result object is stored as-is in
# the "Mode" column.
summary2 = (
    grouped_mouse_data["Tumor Volume (mm3)"]
    .agg(["mean", "median", st.mode, "var", "std", "sem"])
    .rename(columns={"mean": "Mean", "median": "Median", "mode": "Mode", "var": "Variance", "std": "Standard Deviation", "sem": "SEM"})
)

summary2

## Bar and Pie Charts

In [None]:
# Bar plot (pandas plotting) of the number of mice in each treatment.

# One row per mouse: only the first occurrence of every "Mouse ID" is kept
unique_mice_df = clean_mouse_data.drop_duplicates(subset=["Mouse ID"])

# Per-regimen counts of every column; counting rows of the one-row-per-mouse
# table gives the number of mice
grouped_regimen_df = unique_mice_df.groupby(["Drug Regimen"]).count()

# Only the "Mouse ID" counts are needed (indexed by drug regimen)
mice_per_treatment = grouped_regimen_df.loc[:, "Mouse ID"]

# Draw the bar chart straight from the Series and label the y axis
mice_per_treatment_chart = mice_per_treatment.plot.bar(
    title="Number of Mice Per Treatment",
    figsize=(8, 5),
)
mice_per_treatment_chart.set_ylabel("Number of Mice")
plt.tight_layout()



In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
# plt.bar takes the x positions and the bar heights as separate sequences;
# both are taken from grouped_regimen_df, which must already exist.

# x positions are 0..n-1; bar heights are the per-regimen "Mouse ID" counts
x_axis = np.arange(len(grouped_regimen_df))
y_axis = grouped_regimen_df["Mouse ID"]

# One tick per bar, labelled with the regimen names from the dataframe index
tick_locations = list(x_axis)
xticks = grouped_regimen_df.index

# Plotting the chart
plt.bar(x_axis, y_axis, align="center")
plt.xticks(tick_locations, xticks, rotation="vertical")

# Set x limits for neatness of x axis
plt.xlim(-0.75, len(x_axis) - 0.25)

# Setting a title and labels
plt.title("Number of Mice Per Treatment")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")

# tight_layout runs last so that the title and axis labels set above are
# taken into account when the layout is computed
plt.tight_layout()
plt.show()



In [None]:
# Pie plot (pandas plotting) of the split between female and male mice.

# unique_mice_df holds one row per mouse, so counting its rows per sex counts mice
grouped_by_sex_df = unique_mice_df.groupby(["Sex"]).count()

# Only the "Mouse ID" counts are needed (indexed by sex)
mice_per_sex = grouped_by_sex_df.loc[:, "Mouse ID"]

# Draw the pie with percentage labels and blank out the default y-axis label
mice_per_sex_chart = mice_per_sex.plot.pie(
    title="Distribution of Female Versus Male Mice",
    autopct="%1.1f%%",
)
mice_per_sex_chart.set_ylabel("")
plt.tight_layout()


In [None]:
# Pie plot (pyplot) of the split between female and male mice,
# reusing grouped_by_sex_df.

# Wedge labels come from the index, wedge sizes from the "Mouse ID" counts
labels = grouped_by_sex_df.index
values = grouped_by_sex_df.loc[:, "Mouse ID"]

distribution_of_sex_chart = plt.pie(values, labels=labels, autopct="%1.1f%%")
plt.title("Distribution of Female Versus Male Mice")
plt.tight_layout()
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Calculating the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Keep, for every mouse, the row recorded at its greatest Timepoint; that row
# holds the mouse's final tumour volume.
# The row is selected explicitly by maximum Timepoint instead of "last row per
# mouse", so the result does not depend on the table happening to be ordered by
# Timepoint. sort_index() restores the original row order of the table.
final_row_labels = clean_mouse_data.groupby("Mouse ID")["Timepoint"].idxmax()
last_timepoint_df = clean_mouse_data.loc[final_row_labels].sort_index()


In [None]:
# Display the tumour-volume column of the final-timepoint table
last_timepoint_df.loc[:, "Tumor Volume (mm3)"]

In [None]:
# Regimens of interest
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# One list of final tumour volumes per regimen, in the same order as
# `treatments`; filled by the loop below and used for plotting
tumour_volume_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.

for treatment in treatments:

    # Rows of the final-timepoint table that belong to this regimen
    treatment_data = last_timepoint_df.loc[(last_timepoint_df["Drug Regimen"] == treatment), :]

    # Final tumour volumes for this regimen, as a Series and as a plain list
    tumour_data = treatment_data["Tumor Volume (mm3)"]
    tumour_data_list = tumour_data.tolist()

    # Outlier bounds: 1.5 * IQR below the first / above the third quartile
    quartiles = tumour_data.quantile([.25, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Printing the upper and lower bounds (preceded by a blank separator line)
    print()
    print(f"Values below {lower_bound} could be outliers for {treatment}.")
    print(f"Values above {upper_bound} could be outliers for {treatment}.")

    # Values outside the bounds
    outliers = tumour_data[(tumour_data < lower_bound) | (tumour_data > upper_bound)]
    outliers_value = outliers.tolist()

    # Report the outliers. Any number of outliers is listed; previously the
    # values were omitted when there were more than two.
    if not outliers_value:
        print(f"There are no outliers for {treatment}.")
    elif len(outliers_value) == 1:
        print(f"There is one outlier for {treatment}. It is {outliers_value[0]}.")
    elif len(outliers_value) == 2:
        print(f"There are two outliers for {treatment}. They are {outliers_value[0]} and {outliers_value[1]}.")
    else:
        listed_values = ", ".join(str(value) for value in outliers_value)
        print(f"There are {len(outliers_value)} outliers for {treatment}. They are {listed_values}.")

    # Store this regimen's volumes for plotting
    tumour_volume_data.append(tumour_data_list)
   




In [None]:
# Box plot of the final tumour volume of each mouse for the four regimens of interest.

# Marker style for outlier points. NOTE: despite the variable's name, the
# marker is a fuchsia-filled circle.
green_diamond = {"markerfacecolor": "fuchsia", "marker": "o"}

fig1, ax1 = plt.subplots()

# One box per regimen, in the order of `treatments`
ax1.boxplot(tumour_volume_data, flierprops=green_diamond)
ax1.set_xticklabels(treatments)

ax1.set_title('Final Tumor Volumes')
ax1.set_ylabel('Tumor Volume (mm3)')
plt.show()


## Line and Scatter Plots

In [None]:
# Line plot of time point versus tumour volume for one mouse treated with Capomulin.

# Capomulin rows only, reduced to the columns needed here
capomulin_columns = ["Mouse ID", "Timepoint", "Tumor Volume (mm3)"]
capomulin_data = clean_mouse_data.loc[clean_mouse_data["Drug Regimen"].eq("Capomulin"), capomulin_columns]

# Measurements of mouse s185: time point and tumour volume
s185_df = capomulin_data.loc[capomulin_data["Mouse ID"].eq("s185"), ["Timepoint", "Tumor Volume (mm3)"]]

# Time point on the x axis; the remaining column (tumour volume) is drawn as the line
graph = s185_df.plot.line(x="Timepoint", title="Time Point VS. Tumour Volume (Mouse ID: s185)")



In [None]:
# Generating a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# Capomulin rows only, reduced to the relevant columns
capomulin_weight_df = clean_mouse_data.loc[(clean_mouse_data["Drug Regimen"] == "Capomulin"), ["Drug Regimen", "Weight (g)", "Tumor Volume (mm3)"]]

# Average tumour volume for each distinct weight.
# Only the two numeric columns are passed to mean(): the string column
# "Drug Regimen" cannot be averaged, and recent pandas versions raise a
# TypeError instead of silently dropping it. The result has exactly the columns
# "Weight (g)" and "Tumor Volume (mm3)", in that order.
group_by_weight_df = (
    capomulin_weight_df[["Weight (g)", "Tumor Volume (mm3)"]]
    .groupby("Weight (g)", as_index=False)
    .mean()
)

# Plotting weight (y axis) against average tumour volume (x axis)
chart = group_by_weight_df.plot(kind="scatter", y="Weight (g)", x="Tumor Volume (mm3)", grid=True, figsize=(8,8), title="Mouse Weight Vs. Average Tumour Volume")
plt.show()


In [None]:
# Same scatter plot drawn with matplotlib directly, for comparison.

# Column 0 of group_by_weight_df is the weight, column 1 the average tumour volume
weight = group_by_weight_df["Weight (g)"]
tumour_volume = group_by_weight_df["Tumor Volume (mm3)"]

# Tumour volume on the x axis, weight on the y axis
plt.scatter(tumour_volume, weight)
plt.xlabel("Tumor Volume (mm3)")
plt.ylabel("Weight (g)")
plt.title("Mouse Weight Vs. Average Tumour Volume")


## Correlation and Regression

In [None]:
# Calculating the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen

# Both variables come from group_by_weight_df and are selected by column name,
# so the code does not depend on the column order of that table
weight = group_by_weight_df["Weight (g)"]
tumour_volume = group_by_weight_df["Tumor Volume (mm3)"]

# Pearson correlation; element 0 of the result is the correlation coefficient
correlation = st.pearsonr(tumour_volume, weight)
print(f"The correlation between both factors is {round(correlation[0],2)}")

# Regression of weight (y) on tumour volume (x). linregress is reached through
# the scipy.stats alias imported at the top of the notebook, so no import is
# needed in the middle of this cell.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(tumour_volume, weight)
regress_values = tumour_volume * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Plotting the scatterplot and regression model
plt.scatter(tumour_volume, weight)
plt.title("Mouse Weight Vs. Average Tumour Volume (With Regression Line)")
plt.ylabel("Weight (g)")
plt.xlabel("Tumor Volume (mm3)")
plt.plot(tumour_volume, regress_values, "r-")
# The equation is placed at fixed data coordinates (x=40, y=16)
plt.annotate(line_eq, (40,16), fontsize=12, color="red")
plt.show()
