# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapping is needed.
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame: study result rows joined to the
# mouse metadata on "Mouse ID" (inner join, which is also pandas' default).
merge_df = pd.merge(study_results, mouse_metadata, on="Mouse ID", how="inner")

# Display the data table for preview
merge_df

In [None]:
# Count the distinct mice in the merged data.
mouse_ids = merge_df["Mouse ID"]
unique_mice = pd.unique(mouse_ids)  # distinct IDs, in order of first appearance
total_mice = len(unique_mice)
total_mice

In [None]:
# Each (Mouse ID, Timepoint) pair should occur only once in the data.
# keep=False flags every row of a repeated pair, not only the later repeats.
is_repeated_pair = merge_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
mice_duplicates = merge_df[is_repeated_pair]

# Distinct mouse IDs that own at least one repeated (Mouse ID, Timepoint) pair
unique_duplicates = mice_duplicates["Mouse ID"].unique()
unique_duplicates

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Select by the IDs found in the duplicate check (unique_duplicates) rather than
# a hard-coded ID, so the cell stays correct if the input data changes.
mice_duplicates = merge_df.loc[merge_df["Mouse ID"].isin(unique_duplicates), :]
mice_duplicates

In [None]:
# Create a clean DataFrame by dropping every row of the duplicate mouse ID(s).
# The IDs come from the duplicate check (unique_duplicates) instead of being
# hard-coded, so the exclusion tracks whatever duplicates the data contains.
clean_df = merge_df.loc[~merge_df["Mouse ID"].isin(unique_duplicates)]
clean_df

In [None]:
# Count the distinct mice remaining in the clean DataFrame.
# dropna=False counts a missing ID (if any) as one value instead of skipping it.
total_unique = clean_df["Mouse ID"].nunique(dropna=False)
total_unique

## Summary Statistics

In [None]:
# Build a summary statistics table of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and standard error of the mean (SEM).

# Group the tumor volumes by regimen once and reuse the grouping for every statistic
tumor_by_regimen = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
variance = tumor_by_regimen.var()
stdv = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

# Assemble the per-regimen series into a single DataFrame, one column per statistic
summary_columns = {
    "Mean Tumor Volume": mean,
    "Median Tumor Volume": median,
    "Tumor Volume Variance": variance,
    "Tumor Volume Std. Dev.": stdv,
    "Tumor Volume Std. Err.": sem,
}
summary = pd.DataFrame(summary_columns)

summary

In [None]:
# Alternative way to produce the same summary statistics (mean, median, variance,
# standard deviation, and SEM of the tumor volume per regimen) with one agg() call.
# Only one of the two methods is required in the solution.

# Selecting with a list keeps a DataFrame, so the result has (column, statistic) headers
tumor_frame_by_regimen = clean_df.groupby(["Drug Regimen"])[["Tumor Volume (mm3)"]]
statistics = ["mean", "median", "var", "std", "sem"]

summary_aggregation = tumor_frame_by_regimen.agg(statistics)
summary_aggregation

## Bar and Pie Charts

In [None]:
# Bar plot (via pandas) of the total number of rows (Mouse ID/Timepoints) per drug regimen.

# value_counts() returns the row count per regimen, largest first
drugreg = clean_df["Drug Regimen"].value_counts()

pd_bar = drugreg.plot.bar(
    color="blue",
    xlabel="Drug Regimen",
    ylabel="# of Observed Mouse Timepoints",
)
pd_bar

In [None]:
# Bar plot (via pyplot) of the total number of rows (Mouse ID/Timepoints) per drug regimen.

# Regimen names go on the x axis, their row counts become the bar heights
x, y = drugreg.index.values, drugreg.values

plt.bar(x=x, height=y, color="blue", align="center")
plt.xlabel("Drug Regimen")
plt.ylabel("# of Observed Mouse Timepoints")
# Regimen names are long, so draw the tick labels vertically
plt.xticks(rotation="vertical")

In [None]:
# Pie plot (via pandas) of the distribution of female versus male mice.

# Row count per value of the "Sex" column in the clean data
gender = clean_df["Sex"].value_counts()

# autopct prints each wedge's share as a percentage with one decimal place
gender.plot.pie(autopct="%1.1f%%", ylabel="Sex")

In [None]:
# Pie plot (via pyplot) of the distribution of female versus male mice.

# Wedge labels come from the index of the counts, wedge sizes from the counts themselves
labels, count = gender.index.values, gender.values

# autopct prints each wedge's share as a percentage with one decimal place
plt.pie(x=count, labels=labels, autopct="%1.1f%%")
plt.ylabel("Sex")

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
last_timepoint_by_mouse = clean_df.groupby("Mouse ID")["Timepoint"].max()
max_tp = last_timepoint_by_mouse.reset_index()

# Left-join the clean data onto the (Mouse ID, last Timepoint) pairs so that each
# mouse keeps only the row holding its tumor volume at the last timepoint.
# NOTE(review): this rebinds merge_df, which earlier cells use for the full merged data.
merge_df = max_tp.merge(clean_df, on=["Mouse ID", "Timepoint"], how="left")
merge_df

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
treatment = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
tumor_volume_list = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drugs in treatment:
    # Locate the rows which contain mice on each drug and get the tumor volumes
    tumor_volume = merge_df.loc[merge_df["Drug Regimen"] == drugs, "Tumor Volume (mm3)"]

    # Keep this regimen's volumes for the box plot
    tumor_volume_list.append(tumor_volume)

    # Quartiles and the 1.5 * IQR fences for this regimen only
    quartiles = tumor_volume.quantile([.25, .5, .75])
    lowerq = quartiles[.25]
    upperq = quartiles[.75]
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Determine outliers using upper and lower bounds.
    # This must run inside the loop and filter this regimen's own volumes;
    # otherwise only the last regimen's bounds would be applied, and to every regimen's rows.
    outliers = tumor_volume.loc[(tumor_volume < lower_bound) | (tumor_volume > upper_bound)]
    print(f"{drugs}'s potential outliers: {outliers}")

In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.
fig1, ax1 = plt.subplots()
ax1.set_ylabel("Final Tumor Volume (mm3)")

# boxplot() requires the data to draw: one box per regimen, in the order of `treatment`
ax1.boxplot(tumor_volume_list)

# Label each box with its regimen name (the default tick labels are 1..N)
ax1.set_xticklabels(treatment)


## Line and Scatter Plots

In [None]:
# Line plot of tumor volume vs. time point for a single mouse treated with Capomulin

# Row masks for the regimen and for the one mouse being plotted
on_capomulin = clean_df["Drug Regimen"] == "Capomulin"
is_mouse_l509 = clean_df["Mouse ID"] == "l509"
capomulin_df = clean_df[on_capomulin & is_mouse_l509]

# Index by timepoint and keep only the tumor volume, so the x axis is the timepoint
reduced_capomulin = capomulin_df.set_index("Timepoint")[["Tumor Volume (mm3)"]]

reduced_capomulin.plot(
    kind="line",
    title="Capomulin treatment of mouse l509",
    xlabel="Timepoint (days)",
    ylabel="Tumor Volume (mm3)",
)


In [None]:
# Scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen

# All rows for mice on Capomulin
all_capomulin = clean_df[clean_df["Drug Regimen"] == "Capomulin"]

# Per-mouse mean of weight and tumor volume across that mouse's rows
capomulin_by_mouse = all_capomulin.groupby("Mouse ID")
avg_capomulin = capomulin_by_mouse[["Weight (g)", "Tumor Volume (mm3)"]].mean()

avg_capomulin.plot.scatter(x="Weight (g)", y="Tumor Volume (mm3)")

## Correlation and Regression

In [None]:
# Correlation coefficient and linear regression model for mouse weight
# and average observed tumor volume for the entire Capomulin regimen
x_values = avg_capomulin["Weight (g)"]
y_values = avg_capomulin["Tumor Volume (mm3)"]

# pearsonr returns (coefficient, p-value); only the coefficient is reported
correlation = st.pearsonr(x_values, y_values)
print(f"The correlation between mouse weight and the average tumor volume is {round(correlation[0], 2)}")

# Least-squares fit of average tumor volume against weight
slope, intercept, rvalue, pvalue, stderr = st.linregress(x_values, y_values)

# Fitted tumor volume at each observed weight
regression = slope * x_values + intercept

# Observed points with the fitted line drawn over them in red
plt.scatter(x_values, y_values)
plt.plot(x_values, regression, color="r")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")