## Observations and Insights

## Dependencies and starter code

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
import numpy as np

# Load the mouse metadata and the study results from the data folder
mouse_metadata = pd.read_csv("data/Mouse_metadata.csv")
study_results = pd.read_csv("data/Study_results.csv")

# Outer-join the two tables on "Mouse ID" so that IDs appearing in only
# one of the files are still kept in the combined dataset
mice_df = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")

# preview the first rows of the combined dataset
mice_df.head()

## Summary statistics

In [None]:
## Summary statistics table: mean, median, variance, standard deviation,
## and SEM of the tumor volume for each drug regimen

# one statistic per column, one drug regimen per row; every column is built
# from the tumor volumes grouped by regimen
summary_df = pd.DataFrame(
    {
        "Mean Tumor Volume (mm3)": mice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].mean(),
        "Median Tumor Volume (mm3)": mice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].median(),
        "Variance of Tumor Volume (mm3)": mice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].var(),
        "Standard Deviation for Tumor Volume (mm3)": mice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].std(),
        "SEM of Tumor Volume (mm3)": mice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].sem(),
    }
)
summary_df

## Bar plots

In [None]:
## Shared setup for the two bar plots (pandas and pyplot) showing the
## number of data points for each treatment regimen

# title and axis labels used by both plots
bar_title = "Number of Data Points per Drug Regimen"
bar_xlabel = "Drug Regimen"
bar_ylabel = "Number of Data Points"

# alphabetically sorted drug names, used as the x tick labels
drug_list = sorted(mice_df["Drug Regimen"].unique())

# one tick position per drug: 0, 1, ..., len(drug_list) - 1
tick_locations = np.arange(len(drug_list))

In [None]:
## Bar Plot using Pandas

# number of recorded (non-null) tumor volumes per drug regimen
drug_and_count = mice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].count()

# draw the bars through the pandas plotting accessor
drug_and_count.plot.bar(width=.8, legend=False)

# label each bar with its drug name, rotated 45 degrees
plt.xticks(tick_locations, drug_list, rotation=45)

# title and y-axis label
plt.ylabel(bar_ylabel)
plt.title(bar_title)

# display plot
plt.show()

In [None]:
# Bar Plot using PyPlot

# Count the recorded tumor volumes per regimen -- the same quantity the
# pandas bar plot shows -- and reindex by drug_list so that every bar height
# is explicitly paired with the drug name it is drawn under.
regimen_counts = (
    mice_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .count()
    .reindex(drug_list)
)

# create bar plot using pyplot
plt.bar(drug_list, regimen_counts)

# assign xticks and rotate 45 degrees
plt.xticks(tick_locations, drug_list, rotation=45)

# give plot a title and label
plt.title(bar_title)
plt.xlabel(bar_xlabel)
plt.ylabel(bar_ylabel)

# display plot
plt.show()

## Pie plots

In [None]:
# Shared text for the two pie plots (pandas and pyplot) showing the
# distribution of female versus male mice

# axis label and title reused by the pie plots
pie_ylabel = "Sex"
pie_title = "Distribution of Female versus Male Mice"

In [None]:
## Pie Plot using Pandas

# create series for gender counts
gender_s = mice_df["Sex"].value_counts()

# create the pie plot
gender_s.plot(kind="pie", autopct='%1.1f%%', shadow=True)

# assign title and label to plot
# The y-label is set explicitly: otherwise pandas labels the axis with the
# Series name, which depends on the pandas version, and this plot would not
# match the pyplot version of the same chart.
plt.title(pie_title)
plt.ylabel(pie_ylabel)

# show plot
plt.show()

In [None]:
## Pie Plot using PyPlot

# Take the wedge labels from the counts themselves. value_counts() orders
# its result by frequency, so a hard-coded ["Male", "Female"] list would
# mislabel the wedges whenever "Female" is the more frequent value.
gender_list = gender_s.index.tolist()

# create plot
plt.pie(gender_s, labels=gender_list, autopct='%1.1f%%', shadow=True)

# assign title and label to plot
plt.title(pie_title)
plt.ylabel(pie_ylabel)

# show plot
plt.show()

## Quartiles, outliers and boxplots

In [None]:
# Find the 4 "most promising" treatments
# "Most promising" means the LOWEST median final tumor volume.

# Rows recorded at timepoint 45, which this analysis treats as the final
# measurement. NOTE: mice with no row at timepoint 45 are not included.
final_size = mice_df.loc[mice_df["Timepoint"] == 45]

# calculate median final tumor size by drug regimen
median_size_df = pd.DataFrame(
    {
        "Median Final Tumor Volume (mm3)":
            final_size.groupby("Drug Regimen")["Tumor Volume (mm3)"].median()
    }
)

# find the 4 regimens with the smallest median final tumor volume
lowest_medians = median_size_df.nsmallest(4, "Median Final Tumor Volume (mm3)")

# alphabetically sorted names of the four most promising treatments
best_regimens_list = sorted(lowest_medians.index.tolist())

# find all final-timepoint rows that belong to these treatments
best_regimens = final_size.loc[final_size["Drug Regimen"].isin(best_regimens_list)]

# one Series of final tumor volumes per treatment, in best_regimens_list order
treatment1, treatment2, treatment3, treatment4 = (
    best_regimens.loc[best_regimens["Drug Regimen"] == regimen_name, "Tumor Volume (mm3)"]
    for regimen_name in best_regimens_list
)

# the same volumes as plain lists, derived from the Series above rather than
# repeating the selection
treatment1_list = treatment1.tolist()
treatment2_list = treatment2.tolist()
treatment3_list = treatment3.tolist()
treatment4_list = treatment4.tolist()

In [None]:
# report the IQR, the outlier bounds, and any outliers for one regimen
def iqr(treatment, volumes_list, index):
    """Print the IQR, outlier bounds, and outliers for one regimen.

    Parameters
    ----------
    treatment
        Object with a pandas-style ``quantile`` method (the callers pass a
        Series of final tumor volumes); the quartiles are computed from it.
    volumes_list
        Iterable of tumor volumes that are checked against the bounds.
    index
        Position in the module-level ``best_regimens_list`` of the regimen
        name shown in the printed report.

    Returns
    -------
    None
        The report is written to standard output. ``volumes_list`` is not
        modified; outliers are reported, not removed.
    """
    # calculate quartiles and IQR
    quartiles = treatment.quantile([.25, .75])
    lowerq = quartiles[.25]
    upperq = quartiles[.75]
    # named iqr_value so the local does not shadow this function's own name
    iqr_value = upperq - lowerq

    # calculate upper and lower boundaries (1.5 * IQR beyond the quartiles)
    upper_bound = upperq + (1.5 * iqr_value)
    lower_bound = lowerq - (1.5 * iqr_value)

    # collect every value that falls outside the boundaries
    outliers = [
        volume
        for volume in volumes_list
        if volume > upper_bound or volume < lower_bound
    ]

    # print out IQR, upper and lower bounds, and outliers
    print(f"The IQR for Final Tumor Volume for {best_regimens_list[index]} is: {round(iqr_value, 2)} mm3\n")
    print(f"Values below {round(lower_bound, 2)} mm3 could be outliers.")
    print(f"Values above {round(upper_bound, 2)} mm3 could be outliers.\n")
    print(f"The following outliers have been found:\n"
          f"{outliers}\n"
          f"---------------------------------------------------------")

In [None]:
# report the IQR, outlier bounds, and outliers for each of the 4 regimens;
# the position in this list is the regimen's position in best_regimens_list
regimen_inputs = [
    (treatment1, treatment1_list),
    (treatment2, treatment2_list),
    (treatment3, treatment3_list),
    (treatment4, treatment4_list),
]
for position, (volume_series, volume_values) in enumerate(regimen_inputs):
    iqr(volume_series, volume_values, position)

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
data = [treatment1, treatment2, treatment3, treatment4]
fig, ax = plt.subplots()
ax.set_title(f"Final Tumor Volumes for\n"
              f"{best_regimens_list[0]}, {best_regimens_list[1]}, {best_regimens_list[2]}, and {best_regimens_list[3]}")
ax.set_ylabel("Tumor Volume (mm3)")

# Add a horizontal grid to the plot, but make it very light in color
# so we can use it for reading data values but not be distracting @borrowed
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)

# Hide these grid behind plot objects @also borrowed
ax.set_axisbelow(True)

# Draw the boxes first, then name them: boxplot() adjusts the x tick
# locations and labels to its box positions, so labels assigned before the
# call are not reliably the ones shown. After the call there is one tick per
# box, in the same order as `data` / best_regimens_list.
ax.boxplot(data)
ax.set_xticklabels(best_regimens_list)
plt.show()

## Line and scatter plots

In [None]:
## Line plot of time point versus tumor volume for one mouse treated with Capomulin

# regimen and mouse to plot
regimen = "Capomulin"
mouse_id = "s185"

# narrow the data to the regimen first, then to the single mouse
regimen_df = mice_df[mice_df["Drug Regimen"] == regimen]
mouse_data = regimen_df[regimen_df["Mouse ID"] == mouse_id]

# keep only the two plotted columns
mouse_data = mouse_data.loc[:, ["Timepoint", "Tumor Volume (mm3)"]]

# index by timepoint so it becomes the x axis of the line plot
time_data = mouse_data.set_index("Timepoint")

time_data.plot(kind="line", legend=False)

plt.ylabel("Tumor Volume (mm3)")
plt.title(f"Timepoint versus Tumor Volume for\n"
          f"mouse: {mouse_id} treated with {regimen}")
plt.show()

In [None]:
## Scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# rows for the Capomulin regimen only
capomulin_df = mice_df[mice_df["Drug Regimen"] == "Capomulin"]

# per-mouse averages of tumor volume and weight, one row per Mouse ID
size_and_weight = (
    capomulin_df.groupby("Mouse ID")[["Tumor Volume (mm3)", "Weight (g)"]]
    .mean()
    .rename(
        columns={
            "Tumor Volume (mm3)": "Avg Tumor Volume (mm3)",
            "Weight (g)": "Avg Weight (g)",
        }
    )
)

# weight on the x axis, average tumor volume on the y axis
plt.scatter(
    size_and_weight["Avg Weight (g)"],
    size_and_weight["Avg Tumor Volume (mm3)"],
)

# axis labels and title
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Mouse Weight vs Avg Tumor Volume\n"
          "for the Capomulin Regimen")

# display plot
plt.show()

In [None]:
# Correlation coefficient and linear regression model for mouse weight
# versus average tumor volume for the Capomulin regimen

# x: average weight per mouse, y: average tumor volume per mouse
x_values = size_and_weight["Avg Weight (g)"]
y_values = size_and_weight["Avg Tumor Volume (mm3)"]

# fit the line; rvalue is the correlation coefficient reported by linregress
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
rsquared = rvalue ** 2

# fitted tumor volume at each observed weight, and the equation as text
regress_values = slope * x_values + intercept
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# observations, the fitted line in red, and the equation drawn on the plot
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (18, 37), fontsize=15, color="red")

plt.xlabel("Mouse Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title(
    f"Linear Regression Model\n"
    f"for Mouse Weight vs Average Tumor Volume\n"
    f"for the Capomulin Regimen")

print(f"The r-value is: {rvalue}\n"
      f"The r-squared is: {rsquared}")

plt.show()