# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np 
from scipy.stats import linregress
from matplotlib.pyplot import figure


# Input CSV locations (relative paths, resolved against the current working directory)
mouse_metadata_path = "../Resources/Mouse_metadata.csv"
study_results_path = "../Resources/Study_results.csv"

# Load both tables into DataFrames
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the per-mouse metadata with the study measurements on the shared "Mouse ID" column
study_df = mouse_metadata.merge(study_results, on='Mouse ID')

# Preview the first rows of the combined table
study_df.head()

FileNotFoundError: [Errno 2] File ../Resources/Mouse_metadata.csv does not exist: '../Resources/Mouse_metadata.csv'

In [None]:
# Checking the number of mice.
# nunique() counts distinct Mouse IDs; value_counts() would instead list how
# many rows each mouse has, which is not the number of mice.
num_mice = study_df["Mouse ID"].nunique()
print(num_mice)

In [None]:
# Find every Mouse ID that has more than one row for the same
# (Mouse ID, Timepoint) pair.
is_repeated_row = study_df.duplicated(subset=["Mouse ID", "Timepoint"])
mouse_duplicate = study_df.loc[is_repeated_row, "Mouse ID"].unique()
print(mouse_duplicate)

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Filter on the IDs held in mouse_duplicate instead of a hard-coded ID, so this
# cell stays correct if the input data (and therefore the duplicate set) changes.
all_mouse_df = study_df.loc[study_df["Mouse ID"].isin(mouse_duplicate)]
all_mouse_df

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# `~` negates the boolean mask, so every row belonging to a mouse listed in
# mouse_duplicate is removed (not just the repeated rows).
clean_mice = study_df[~study_df['Mouse ID'].isin(mouse_duplicate)]
clean_mice

In [None]:
# Count the distinct mice that remain in the clean DataFrame.
# value_counts() yields one entry per distinct Mouse ID, so the number of
# entries is the number of mice.
clean_df = clean_mice["Mouse ID"].value_counts()
number_of_mice = clean_df.shape[0]
number_of_mice

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM.
drug_groups_df = clean_mice.groupby(['Drug Regimen'])

# Pick the tumor-volume column once and reuse it for every statistic.
tumor_volume_by_drug = drug_groups_df["Tumor Volume (mm3)"]

tumor_mean_vol = tumor_volume_by_drug.mean()
tumor_median_vol = tumor_volume_by_drug.median()
tumor_variance_vol = tumor_volume_by_drug.var()
tumor_std = tumor_volume_by_drug.std()
tumor_sem = tumor_volume_by_drug.sem()

# Combine the per-regimen series into one table, one column per statistic.
summary_columns = {
    "Tumor Volume Mean": tumor_mean_vol,
    "Tumor Volume Median": tumor_median_vol,
    "Tumor Volume Variance": tumor_variance_vol,
    "Standard Deviation": tumor_std,
    "Tumor SEM": tumor_sem,
}
summary_df = pd.DataFrame(summary_columns)

summary_df

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line.

drug_groups2 = clean_mice.groupby('Drug Regimen')
# Select the tumor-volume column BEFORE aggregating: aggregating the whole frame
# would also apply mean/median/var/std/sem to the non-numeric columns, which
# raises a TypeError on pandas >= 2.0 and is wasted work on older versions.
summary_df2 = drug_groups2["Tumor Volume (mm3)"].agg(['mean', 'median', 'var', 'std', 'sem'])
summary_df2

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.

# Counting "Mouse ID" rows per regimen gives the number of recorded rows
# (mouse/timepoint observations), not the number of distinct mice, so the axis
# label and title below describe timepoints rather than mice.
measurements_df = clean_mice.groupby("Drug Regimen")["Mouse ID"].count()

bar_plot_drug = measurements_df.plot.bar(
    figsize=(15, 10),
    color=["green", "red", "blue", "orange", "yellow", "purple",
           "pink", "teal", "grey", "black"],
    fontsize=14,
)
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Observed Mouse Timepoints")
plt.title("Number of Observed Mouse Timepoints per Treatment")
plt.ylim(0, 275)
plt.tight_layout()

plt.show()


In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.

# Rows per regimen = number of recorded mouse/timepoint observations.
timepoint_counts = clean_mice.groupby(["Drug Regimen"])["Mouse ID"].count()
mouse_list = timepoint_counts.tolist()

# Size the x axis from this cell's own data rather than from another cell's variable.
x_axis = np.arange(len(mouse_list))
fig1, ax1 = plt.subplots(figsize=(15, 10))
plt.bar(x_axis, mouse_list, color=["blue", "green", "red", "orange", "yellow", "purple",
                                "pink", "black", "grey", "teal"], alpha=1, align='center')

# Take the tick labels from the grouped index so they always match the bar
# order, instead of relying on a hand-typed list of regimen names.
tick_locations = list(x_axis)
plt.xticks(tick_locations, timepoint_counts.index, rotation=90)

plt.ylim(0, 275)
plt.title("Number of Observed Mouse Timepoints per Treatment")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Observed Mouse Timepoints")

# Call tight_layout only after the title and labels exist so they are included
# in the layout computation.
plt.tight_layout()
plt.show()

In [None]:
# Pie chart (via the pandas plotting API) of the female vs. male split of the mice.

# One group per (Mouse ID, Sex) pair, so each mouse contributes a single entry.
groupby_gender = clean_mice.groupby(["Mouse ID", "Sex"])
gender_df = groupby_gender.size().to_frame()

# Number of mice for each sex.
mouse_sex = gender_df.groupby(["Sex"]).count()
mouse_sex.columns = ["Total Count"]

# Share of each sex expressed as a percentage of all mice.
total_mice = mouse_sex["Total Count"].sum()
mouse_sex["Percentage of Sex"] = 100 * (mouse_sex["Total Count"] / total_mice)

print(mouse_sex)

colors = ["red", "blue"]
explode = (0.1, 0)
plot = mouse_sex.plot.pie(
    y='Total Count',
    figsize=(15, 10),
    colors=colors,
    startangle=140,
    explode=explode,
    shadow=True,
    autopct="%1.1f%%",
)
plt.title('Female vs. Male Mouse Population', fontsize=20)

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Derive the slice sizes from the data instead of hard-coding percentages:
# keep one row per (Mouse ID, Sex) pair so each mouse is counted once, then
# count mice per sex. sort_index() fixes the slice order alphabetically.
sex_counts = clean_mice[["Mouse ID", "Sex"]].drop_duplicates()["Sex"].value_counts().sort_index()

labels = sex_counts.index.tolist()

# Raw counts are sufficient: plt.pie normalizes them and autopct prints the percentages.
sizes = sex_counts.tolist()

# NOTE(review): colors and explode assume exactly two sex categories, as the
# original hard-coded labels did.
colors = ["red", "blue"]

explode = (0.1, 0)

fig1, ax1 = plt.subplots(figsize=(15, 10))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True, startangle=140)
plt.title('Male vs Female Mouse Population', fontsize=20)


## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse. The regimens of interest
# (Capomulin, Ramicane, Infubinol, and Ceftamin) are filtered later; this step
# keeps every mouse.

# Last (greatest) timepoint recorded for each mouse, with "Mouse ID" turned
# back into a regular column so it can be used as a merge key.
greatest_timepoint = clean_mice.groupby(["Mouse ID"])["Timepoint"].max().reset_index()

# Left-join back onto the cleaned data on (Mouse ID, Timepoint) to pull in the
# full row, including the tumor volume, at each mouse's last timepoint.
last_tum_vol = pd.merge(
    greatest_timepoint,
    clean_mice,
    how="left",
    on=["Mouse ID", "Timepoint"],
)

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in treatments:

    # Locate the rows which contain mice on this drug and get the tumor volumes
    final_tumor_max = last_tum_vol.loc[last_tum_vol["Drug Regimen"] == drug, "Tumor Volume (mm3)"]

    # Add the subset to the list used later for plotting
    tumor_vol_data.append(final_tumor_max)

    # Determine outliers using upper and lower bounds: anything more than
    # 1.5 * IQR below the first quartile or above the third quartile.
    quartiles = final_tumor_max.quantile([.25, .5, .75])
    quartiles_low = quartiles[0.25]
    quartiles_high = quartiles[0.75]
    iqr = quartiles_high - quartiles_low
    lower_bound = quartiles_low - (1.5 * iqr)
    upper_bound = quartiles_high + (1.5 * iqr)
    outliers = final_tumor_max.loc[(final_tumor_max < lower_bound) | (final_tumor_max > upper_bound)]

    # Label the output with the regimen; otherwise the four printed Series
    # cannot be told apart.
    print(f"{drug}'s potential outliers: {outliers}")
    

In [None]:
# Box plot of the distribution of the final tumor volume, one box per treatment group.

plt.boxplot(x=tumor_vol_data, labels=treatments)
plt.ylabel(ylabel="Final Tumor Volume (mm3)")
plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against timepoint for one Capomulin-treated mouse (f966).

# Restrict to the Capomulin regimen, then to the single mouse of interest.
is_capomulin = clean_mice["Drug Regimen"] == "Capomulin"
capomulin_treatment = clean_mice.loc[is_capomulin]

is_f966 = capomulin_treatment["Mouse ID"] == "f966"
mouse_f966 = capomulin_treatment.loc[is_f966]

timepoints = mouse_f966["Timepoint"]
tumor_volumes = mouse_f966["Tumor Volume (mm3)"]

plt.plot(timepoints, tumor_volumes)
plt.title("Capomulin treatment of mouse f966")
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

capomulin_treatment = clean_mice.loc[clean_mice["Drug Regimen"] == "Capomulin"]

# Average every numeric column per mouse. numeric_only=True is required because
# the frame also holds string columns (e.g. "Drug Regimen", "Sex"); without it
# pandas >= 2.0 raises a TypeError when it tries to average them.
capomulin_avg = capomulin_treatment.groupby(["Mouse ID"]).mean(numeric_only=True)

plt.scatter(capomulin_avg["Weight (g)"], capomulin_avg["Tumor Volume (mm3)"])
plt.xlabel("Weight")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Correlation coefficient and linear regression of average tumor volume on
# mouse weight for the Capomulin regimen.

weight = capomulin_avg["Weight (g)"]
avg_tumor_volume = capomulin_avg["Tumor Volume (mm3)"]

# Pearson r, rounded to two decimals for reporting.
correlation = round(st.pearsonr(weight, avg_tumor_volume)[0], 2)

# Least-squares fit; the result exposes the slope and intercept by name.
model = st.linregress(weight, avg_tumor_volume)
print(f"The correlation between mouse weight and the average tumor volume is: {correlation}")

# Fitted tumor volume at each observed weight.
y_values = weight * model.slope + model.intercept

# Observed points with the fitted regression line drawn over them.
plt.scatter(weight, avg_tumor_volume)
plt.plot(weight, y_values, color="red")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()

In [None]:
#fin