## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
%reload_ext nb_black

# Locations of the two input CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata and the per-measurement study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Attach the metadata to every study row; the right join keeps each
# row of study_results, matched on "Mouse ID"
df = mouse_metadata.merge(study_results, on="Mouse ID", how="right")

# Preview the combined table
df

In [None]:
# Number of distinct mice in the combined data (missing IDs are not counted)
len(df["Mouse ID"].dropna().unique())

In [None]:
# Find rows whose (Mouse ID, Timepoint) pair repeats an earlier row.
key_columns = ["Mouse ID", "Timepoint"]
duplicate_mice_df = df.loc[:, key_columns]

# keep="first" leaves the first occurrence unmarked, so only the repeats are listed
is_repeat = duplicate_mice_df.duplicated(keep="first")
duplicates = duplicate_mice_df.loc[is_repeat]
duplicates

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# The ID(s) are looked up from the data instead of being typed in by hand,
# so this cell always shows the mice that really have a repeated
# (Mouse ID, Timepoint) pair.
duplicate_ids = df.loc[
    df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
df.loc[df["Mouse ID"].isin(duplicate_ids)]

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Identify the mice with a repeated (Mouse ID, Timepoint) pair from the data
# itself; a hard-coded ID would silently drop the wrong mouse (and keep the
# duplicated rows) if it did not match the real duplicate.
duplicate_ids = df.loc[
    df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()

# Remove every row belonging to those mice, not just the repeated rows
df = df.loc[~df["Mouse ID"].isin(duplicate_ids)]

# Sanity check: this should display an empty table
df.loc[df["Mouse ID"].isin(duplicate_ids)]


In [None]:
# Number of distinct mice left in the clean DataFrame (missing IDs are not counted)
len(df["Mouse ID"].dropna().unique())

## Summary Statistics

In [None]:
# Summary statistics table of the tumor volume for each regimen:
# mean, median, variance, standard deviation, and SEM.
# A single groupby + agg call produces every statistic in one pass.
tumor_volume_stats = {"Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]}
df.groupby(["Drug Regimen"]).agg(tumor_volume_stats)



## Bar and Pie Charts

In [None]:
# Bar plot (pandas) of the number of distinct mice treated with each regimen.
mice_by_regimen = df.groupby(["Drug Regimen"])["Mouse ID"]
total_mice = mice_by_regimen.nunique()

# Draw through the pandas plotting accessor, then label the y axis
mice_bar_plot = total_mice.plot.bar(title="Total Mice for Each Treatment", color="r")
mice_bar_plot.set_ylabel("Number of Mice")
plt.show()

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
# Count the distinct mice per regimen; groupby returns the regimens sorted by name.
total_mice = df.groupby(["Drug Regimen"])["Mouse ID"].nunique()

# Take the x labels from the same Series as the bar heights. Using
# df["Drug Regimen"].unique() here would list regimens in order of first
# appearance, which does not match the sorted groupby order and would
# put the wrong label under each bar.
drug = total_mice.index.to_numpy()

# create bar chart
plt.title("Total Mice for Each Treatment")
plt.bar(drug, total_mice.to_numpy(), color="r")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")
plt.xticks(rotation=90)
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# One row per sex, holding the number of distinct mice of that sex.
mice_by_sex = df.groupby(["Sex"])["Mouse ID"].nunique().to_frame()

# subplots must be the boolean True: a DataFrame pie plot needs either `y` or
# subplots=True, and the string "True" is rejected by recent pandas versions.
mice_by_sex.plot(
    kind="pie",
    subplots=True,
    title="Gender Distribution of Mice",
    autopct="%1.1f%%",
    colors=["m", "r"],
    legend=False,
)
plt.show()

In [None]:
# Easy way to get the dataset for the pie chart:
# display the table of distinct-mouse counts per sex.
mice_by_sex

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Compute the counts from the data instead of hard-coding them, so the chart
# stays correct if the input files or the cleaning step change.
sex_counts = df.groupby(["Sex"])["Mouse ID"].nunique()
sex = sex_counts.tolist()
labels = sex_counts.index.tolist()  # same order as the counts
colors = ["tab:purple", "tab:red"]

# create pie chart
plt.title("Gender Distribution of Mice")
plt.pie(sex, labels=labels, colors=colors, autopct="%1.1f%%")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against time point for a single mouse (ID x401).
capomulin_mouse = df.loc[df["Mouse ID"] == "x401"]

# Series to plot: time on the x axis, tumor volume on the y axis
x401_timepoints = capomulin_mouse["Timepoint"]
x401_tumor_vol = capomulin_mouse["Tumor Volume (mm3)"]

# Draw the line first, then decorate the axes
plt.plot(
    x401_timepoints,
    x401_tumor_vol,
    color="r",
    label="Mouse ID x401",
    marker="x",
)
plt.title("Tumor Volume Across Time Points")
plt.xlabel("Time (Days)")
plt.ylabel("Tumor Volume (mm3)")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
capommulin_data_df = df[df["Drug Regimen"] == "Capomulin"]

# Average both columns per mouse in a single groupby so the two Series
# are guaranteed to share the same Mouse ID index.
per_mouse_means = capommulin_data_df.groupby(["Mouse ID"])[
    ["Weight (g)", "Tumor Volume (mm3)"]
].mean()
mouse_weight = per_mouse_means["Weight (g)"]
avg_tumor_vol = per_mouse_means["Tumor Volume (mm3)"]

plt.title("Capomulin Regimen: Average Tumor volume vs Mouse Weight")
plt.xlabel("Mouse Weight (g)")
# Axis label fixed: was misspelled "Tumore" and did not say the values are averages
plt.ylabel("Average Tumor Volume (mm3)")
plt.scatter(mouse_weight, avg_tumor_vol, color="b", label="Tumor Volume by Mouse Weight")
plt.grid()
plt.legend()
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
