# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame.
# The left join keeps every study-results row and attaches the matching mouse
# metadata; the join key is listed once because both tables are matched on the
# single "Mouse ID" column.
mouse_complete = pd.merge(study_results, mouse_metadata, how="left", on="Mouse ID")

# Display the data table for preview
mouse_complete.head()

In [None]:
# Number of distinct mice present in the merged data
mouse_ids = mouse_complete["Mouse ID"]
len(mouse_ids.unique())


In [None]:
# Every row should be uniquely identified by its (Mouse ID, Timepoint) pair.
# Flag each row whose pair occurs more than once (keep=False marks all copies,
# not just the later ones), then collect the IDs of the mice involved.
is_repeated = mouse_complete.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_rows = mouse_complete.loc[is_repeated]
odd_mouse_out = duplicate_rows["Mouse ID"].unique()

In [None]:
# Optional: pull every row recorded for the duplicated mouse ID(s)
belongs_to_duplicate = mouse_complete["Mouse ID"].isin(odd_mouse_out)
duplicate_data = mouse_complete.loc[belongs_to_duplicate]
duplicate_data


In [None]:
# Build the clean DataFrame: keep only the rows whose mouse is NOT one of the
# duplicated IDs, which removes every record of the duplicated mouse.
is_duplicate_mouse = mouse_complete["Mouse ID"].isin(odd_mouse_out)
clean_mouse_complete = mouse_complete.loc[~is_duplicate_mouse]
clean_mouse_complete.head()

In [None]:
# Number of distinct mice left after the duplicated mouse was dropped
clean_mouse_ids = clean_mouse_complete["Mouse ID"]
len(clean_mouse_ids.unique())

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.

# Group the tumor volumes by drug regimen once and reuse the grouping for every
# statistic instead of rebuilding the same groupby five times.
tumor_by_regimen = clean_mouse_complete.groupby(["Drug Regimen"])["Tumor Volume (mm3)"]

means = tumor_by_regimen.mean()
medians = tumor_by_regimen.median()
variances = tumor_by_regimen.var()
std_deviations = tumor_by_regimen.std()
sems = tumor_by_regimen.sem()

# Assemble the resulting series into a single summary DataFrame.
# Each series is indexed by drug regimen, so the rows line up per regimen.
summary_stats = pd.DataFrame({
    "Mean Tumor Volume": means,
    "Median Tumor Volume": medians,
    "Tumor Volume Variance": variances,
    "Tumor Volume Std. Dev.": std_deviations,
    "Tumor Volume Std. Err.": sems,
})

summary_stats


In [None]:
# A more advanced method to generate a summary statistics table of mean, median, variance,
# standard deviation, and SEM of the tumor volume for each regimen
# (only one method is required in the solution).

# Using the aggregation method, produce the same summary statistics in a single statement.
# The second statistic is the median (not the sum), so this table matches the
# statistics named above and the ones computed with the separate groupby calls.
agg_summary_stats = clean_mouse_complete.groupby(["Drug Regimen"]).agg(
    {"Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]}
)
agg_summary_stats

## Bar and Pie Charts

In [None]:
# Count the "Mouse ID" entries recorded at each timepoint (one column per timepoint)
clean_mouse_complete.pivot_table(values="Mouse ID", columns=["Timepoint"], aggfunc="count")

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using Pandas.

# value_counts() yields one count per regimen, ordered from most to fewest rows
regimen_counts = clean_mouse_complete["Drug Regimen"].value_counts()

pandasBar = regimen_counts.plot(kind="bar", figsize=(10, 6))
pandasBar.set_xlabel("Drug Regimen")
pandasBar.set_ylabel("# of observed Mouse Timepoints")

# plt.show must be called; referencing it without parentheses does nothing
plt.show()



In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.

# Draw the bars through the pyplot interface directly: regimen names on the
# x axis, row counts as bar heights.
plt.bar(regimen_counts.index, regimen_counts.values)

# Rotate the regimen names so the tick labels do not overlap
plt.xticks(rotation=90)
plt.xlabel("Drug Regimen")
plt.ylabel("# of observed Mouse Timepoints")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas

# Count the rows per sex once; the index supplies the wedge labels and the
# counts supply the wedge sizes.
sex_counts = clean_mouse_complete["Sex"].value_counts()
labels = sex_counts.index

# Turn the counts into a two-column frame: the sex label and its row count
sizes = sex_counts.reset_index()
sizes.columns = ["Label", "Sex"]

sizes.plot.pie(y="Sex", labels=labels, autopct="%1.1f%%", legend=False)


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Wedge sizes come from the "Sex" count column; each wedge shows its percentage
plt.pie(x=sizes["Sex"], labels=labels, autopct="%1.1f%%")

# Vertical "Sex" caption placed to the left of the pie
plt.text(x=-1.5, y=-.1, s='Sex', fontsize=10, color='black', rotation=90)
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:

# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Step 1: the last (greatest) timepoint recorded for each mouse
last_timepoints = clean_mouse_complete.groupby("Mouse ID")["Timepoint"].max()
grouped_data = last_timepoints.reset_index()

# Step 2: inner-join back onto the clean data on (Mouse ID, Timepoint) so that
# only each mouse's row at its last timepoint, with its tumor volume, remains
mouse_data_final = grouped_data.merge(
    clean_mouse_complete,
    how="inner",
    on=["Mouse ID", "Timepoint"],
)


In [None]:
# Treatments of interest (also reused later as plot labels)
treatment_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# One Series of final tumor volumes per treatment, in the same order as the
# treatment list (kept as a list for plotting)
tumor_volume_list = [
    mouse_data_final.loc[mouse_data_final["Drug Regimen"] == drug, "Tumor Volume (mm3)"]
    for drug in treatment_regimens
]

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug, volumes in zip(treatment_regimens, tumor_volume_list):
    # Quartiles and the interquartile range of this treatment's volumes
    quartiles = volumes.quantile([0.25, 0.5, 0.75])
    lower_quartile = quartiles[0.25]
    upper_quartile = quartiles[0.75]
    iqr = upper_quartile - lower_quartile

    # Bounds sit 1.5 * IQR beyond the lower and upper quartiles
    lower_bound = lower_quartile - (1.5 * iqr)
    upper_bound = upper_quartile + (1.5 * iqr)

    # Anything outside the bounds is a potential outlier
    is_outlier = (volumes < lower_bound) | (volumes > upper_bound)
    outliers = volumes.loc[is_outlier]

    print(f"{drug}'s potential outliers: {outliers}")
                                              

In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.
fig1, ax1 = plt.subplots()

# Title and axis labels
ax1.set_title('Distribution of Tumor Volume by Treatment Group')
ax1.set_xlabel("Treatment Group")
ax1.set_ylabel('Final Tumor Volume (mm3)')

# Outlier points are drawn as circles with a red fill so they stand out
outlier_style = {"markerfacecolor": 'red'}

# One un-notched box per treatment, labelled with the treatment names.
# NOTE(review): newer matplotlib releases rename `labels` to `tick_labels` —
# confirm the installed version before switching.
ax1.boxplot(
    tumor_volume_list,
    notch=False,
    sym="o",
    flierprops=outlier_style,
    labels=treatment_regimens,
)

plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin

# Keep only the rows recorded for mouse l509
filtered_data = clean_mouse_complete[clean_mouse_complete["Mouse ID"] == "l509"]

# Line plot for that single mouse; the column names are resolved against `data`
fig2, ax2 = plt.subplots()
ax2.plot("Timepoint", "Tumor Volume (mm3)", data=filtered_data)
ax2.set(
    xlabel="Timepoint (days)",
    ylabel="Tumor Volume (mm3)",
    title="Capomulin treatment of mouse l509",
)

# plt.show must be called; referencing it without parentheses does nothing
plt.show()

In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen

# Restrict the data to rows recorded under the Capomulin regimen
is_capomulin = clean_mouse_complete["Drug Regimen"] == "Capomulin"
capomulin_data = clean_mouse_complete[is_capomulin]

# Per-mouse averages of tumor volume and weight, indexed by Mouse ID
per_mouse = capomulin_data.groupby(["Mouse ID"])
capomulin_data_averages = per_mouse.agg(
    {"Tumor Volume (mm3)": "mean", "Weight (g)": "mean"}
)

# One point per mouse: weight on the x axis, average tumor volume on the y axis
capomulin_data_averages.plot.scatter(
    x="Weight (g)",
    y="Tumor Volume (mm3)",
    title="Mice Treated with Capomulin",
)


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model
# for mouse weight and average observed tumor volume for the entire Capomulin regimen

# Per-mouse weight (x) and per-mouse average tumor volume (y)
x_values = capomulin_data_averages["Weight (g)"]
y_values = capomulin_data_averages["Tumor Volume (mm3)"]

# Fit the linear regression; `rvalue` is the correlation coefficient of x and y
slope, intercept, rvalue, pvalue, stderr = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Report the correlation coefficient this cell is meant to calculate
print(f"The correlation between mouse weight and the average tumor volume is {rvalue:.2f}")

# Format the fitted line's equation for display on the plot
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# Scatter of the observations with the fitted regression line drawn over it in red
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (18, 36), fontsize=15, color="red")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")

# plt.show must be called; referencing it without parentheses does nothing
plt.show()