# Pymaceuticals Inc.
---

### Observations and Insights
- For mice on the Capomulin regimen, there is a strong positive correlation (r = 0.84) between mouse weight and average tumor volume: heavier mice tend to have larger tumors.
- After excluding the mouse with duplicate Mouse ID/Timepoint records, the total number of mice tested is 248.
- Ramicane and Capomulin were tested on the greatest number of mice.


In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
from scipy import stats
from scipy.stats import sem

# Locations of the two study CSV files.
mouse_metadata_path = "Data/Mouse_metadata.csv"
results_path = "Data/Study_results.csv"

# Load the per-mouse metadata and the study results into DataFrames.
mouse_metadata = pd.read_csv(mouse_metadata_path)
results = pd.read_csv(results_path)

# Left join: every results row is kept and gains the metadata columns
# of the mouse with the matching "Mouse ID".
combined_data = results.merge(mouse_metadata, how="left", on="Mouse ID")
combined_data.head()

In [None]:
# Count the distinct Mouse ID values in the merged (uncleaned) data.
number_of_mice = len(pd.unique(combined_data["Mouse ID"]))
number_of_mice


In [None]:
# Find the Mouse IDs that have more than one row for the same Timepoint.
is_repeat = combined_data.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mice = combined_data.loc[is_repeat, "Mouse ID"].unique()
duplicate_mice


In [None]:
# Build a clean DataFrame that excludes every row belonging to a mouse
# with duplicated Mouse ID/Timepoint records.

# IDs that occur more than once for the same timepoint in the raw results.
repeated_rows = results.duplicated(subset=['Mouse ID', 'Timepoint'])
drop_duplicate_id = results.loc[repeated_rows, 'Mouse ID'].unique()

# Remove those IDs from both source tables before joining them.
clean_data = results[~results['Mouse ID'].isin(drop_duplicate_id)]
clean_mouse_df = mouse_metadata[~mouse_metadata['Mouse ID'].isin(drop_duplicate_id)]

clean_combined_data = clean_data.merge(clean_mouse_df, on='Mouse ID')
clean_combined_data.head()

In [None]:
# Optional: show every row recorded for the duplicate mouse ID.
# Indexing by Mouse ID lets the rows be looked up by label.
combined_data_duplicate = combined_data.set_index('Mouse ID')
duplicate_mouse_ID = combined_data_duplicate.loc['g989']
duplicate_mouse_ID


In [None]:
# Count the distinct Mouse ID values that remain in the clean DataFrame.
number_of_mice_clean = len(pd.unique(clean_combined_data["Mouse ID"]))
number_of_mice_clean


## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard
# deviation, and SEM of the tumor volume for each drug regimen.
#
# The statistics are computed from clean_combined_data (not the raw merge)
# so the mouse with duplicated Mouse ID/Timepoint records is excluded.
tumor_by_regimen = clean_combined_data.groupby("Drug Regimen")["Tumor Volume (mm3)"]

# Assemble the per-regimen series into a single summary DataFrame;
# the dict keys become the column headers, in this order.
summary_stats = pd.DataFrame({
    "Mean Tumor Volume": tumor_by_regimen.mean(),
    "Median Tumor Volume": tumor_by_regimen.median(),
    "Tumor Volume Variance": tumor_by_regimen.var(),
    "Tumor Volume Std. Dev.": tumor_by_regimen.std(),
    "Tumor Volume Std. Err.": tumor_by_regimen.sem(),
})

summary_stats


In [None]:
# Generate the same summary statistics table (mean, median, variance,
# standard deviation, SEM of tumor volume per regimen) with a single
# aggregation call.
#
# The tumor-volume column is selected *before* aggregating: calling
# .agg() on the whole grouped frame would also try to average string
# columns such as Mouse ID and Sex, which raises TypeError on pandas >= 2.0.
# The cleaned data is used so the duplicate mouse is excluded.
groupby_summary_stats = clean_combined_data.groupby('Drug Regimen')
summary_stats_2 = groupby_summary_stats['Tumor Volume (mm3)'].agg(
    ['mean', 'median', 'var', 'std', 'sem']
)
summary_stats_2


## Bar and Pie Charts

In [None]:
# Bar chart, drawn with pandas plotting, of the row count per drug regimen.
drug_regimen = clean_combined_data.groupby('Drug Regimen')

# One-column frame holding the number of rows recorded under each regimen.
drug_regimen_count = drug_regimen['Drug Regimen'].count().to_frame()

drug_chart = drug_regimen_count.plot.bar(legend=False)

# NOTE(review): the plotted value is a row count per regimen, while the
# title says "unique mice" -- confirm which quantity is intended.
drug_chart.set(
    title='Number of Unique Mice Tested',
    xlabel="Drug Regimen",
    ylabel="Count",
)
plt.tight_layout()

#plt.savefig("figures/PandaBarChart.png")

plt.show()


In [None]:
# Bar chart of the per-regimen counts, drawn directly with pyplot.
x_axis = np.arange(len(drug_regimen_count))
tick_locations = list(x_axis)

regimen_counts = drug_regimen_count["Drug Regimen"]

plt.figure(figsize=(6, 4))
plt.bar(x_axis, regimen_counts, width=0.52, align="center", label='_nolegend_')
# Label each bar position with its regimen name.
plt.xticks(tick_locations, list(drug_regimen_count.index), rotation="vertical")

# Pad the axes slightly so the outer bars and the tallest bar are not clipped.
plt.xlim(-0.7, len(x_axis) - 0.3)
plt.ylim(0, max(regimen_counts) * 1.05)

plt.xlabel("Drug Regimen")
plt.ylabel("Number of Unique Mice Tested")

column_name = ["Drug Regimen"]
plt.legend(column_name, loc="best")

#plt.savefig("figures/PyplotBarChart.png")
plt.show()

In [None]:
# Count the rows recorded for each sex in the cleaned data; the resulting
# one-column table is the input for the female-versus-male pie plot.
group_by_gender = clean_combined_data.groupby('Sex')
gender_count = group_by_gender['Sex'].count().to_frame()
gender_count.head()

In [None]:
# Pie chart of the female/male split, drawn with pandas plotting.
gender_count.plot.pie(
    y='Sex',
    autopct='%1.1f%%',
    startangle=180,
    shadow=False,
    legend=False,
)
# Equal axis scaling keeps the pie circular.
plt.axis("equal")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
genders = list(gender_count.index.values)

counts_gender = gender_count['Sex']

# Set the font size BEFORE drawing: text objects read rcParams when they are
# created, so changing it after plt.pie() would leave this chart's labels
# at the previous size and only affect later figures.
plt.rcParams['font.size'] = 12
plt.pie(counts_gender, labels=genders, autopct="%1.1f%%", shadow=False, startangle=180)
plt.ylabel("Sex")
# Equal axis scaling keeps the pie circular.
plt.axis("equal")
#plt.savefig("figures/PyplotPieChart.png")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Split the cleaned data into one DataFrame per regimen of interest
# (Capomulin, Ramicane, Infubinol, Ceftamin) as the starting point for
# the final-tumor-volume calculations.
regimen_column = clean_combined_data['Drug Regimen']

capomulin = clean_combined_data[regimen_column == 'Capomulin']
ramicane = clean_combined_data[regimen_column == 'Ramicane']
infubinol = clean_combined_data[regimen_column == 'Infubinol']
ceftamin = clean_combined_data[regimen_column == 'Ceftamin']
capomulin.head()

In [None]:
# Last (greatest) timepoint recorded for each Capomulin mouse.
capomulin_last = capomulin.groupby('Mouse ID')['Timepoint'].max()
cap_vol = capomulin_last.to_frame()

# Join back onto the cleaned data to pick up the full row (including the
# tumor volume) at each mouse's final timepoint.
cap_last_merge = cap_vol.merge(
    clean_combined_data, on=['Mouse ID', 'Timepoint'], how='left'
)
cap_last_merge.head()

In [None]:
# Final tumor volume of each Capomulin mouse, taken from the last-timepoint
# merge (cap_last_merge). The previous name `capomulin_merge` was never
# defined and raised NameError.
cap_tumor = cap_last_merge['Tumor Volume (mm3)']

# Quartiles and interquartile range of the final tumor volumes.
quartiles = cap_tumor.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

# Values more than 1.5 * IQR outside the quartiles are potential outliers.
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
print(f"Capomulin potential outliers could be values below {lower_bound} and above {upper_bound} could be outliers.")


In [None]:
# Final tumor volume for each Ramicane mouse: find each mouse's last
# timepoint, then look up the matching row in the cleaned data.
ram_last = ramicane.groupby('Mouse ID')['Timepoint'].max()
ram_vol = ram_last.to_frame()
ram_last_merge = ram_vol.merge(
    clean_combined_data, on=['Mouse ID', 'Timepoint'], how='left'
)

ram_tumor = ram_last_merge['Tumor Volume (mm3)']

# Quartiles and interquartile range of the final tumor volumes.
quartiles = ram_tumor.quantile([.25, .5, .75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq

# Values more than 1.5 * IQR outside the quartiles are potential outliers.
lower_bound = lowerq - 1.5 * iqr
upper_bound = upperq + 1.5 * iqr
print(f"Ramicane potential outliers could be values below {lower_bound} and above {upper_bound} could be outliers.")


In [None]:
# Final tumor volume for each Infubinol mouse: find each mouse's last
# timepoint, then look up the matching row in the cleaned data.
inf_last = infubinol.groupby('Mouse ID')['Timepoint'].max()
inf_vol = inf_last.to_frame()
inf_last_merge = inf_vol.merge(
    clean_combined_data, on=['Mouse ID', 'Timepoint'], how='left'
)

inf_tumor = inf_last_merge['Tumor Volume (mm3)']

# Quartiles and interquartile range of the final tumor volumes.
quartiles = inf_tumor.quantile([.25, .5, .75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq

# Values more than 1.5 * IQR outside the quartiles are potential outliers.
lower_bound = lowerq - 1.5 * iqr
upper_bound = upperq + 1.5 * iqr
print(f"Infubinol potential outliers could be values below {lower_bound} and above {upper_bound} could be outliers.")


In [None]:
# Final tumor volume for each Ceftamin mouse: find each mouse's last
# timepoint, then look up the matching row in the cleaned data.
ceft_last = ceftamin.groupby('Mouse ID')['Timepoint'].max()
ceft_vol = ceft_last.to_frame()
ceft_last_merge = ceft_vol.merge(
    clean_combined_data, on=['Mouse ID', 'Timepoint'], how='left'
)

ceft_tumor = ceft_last_merge['Tumor Volume (mm3)']

# Quartiles and interquartile range of the final tumor volumes.
quartiles = ceft_tumor.quantile([.25, .5, .75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq

# Values more than 1.5 * IQR outside the quartiles are potential outliers.
lower_bound = lowerq - 1.5 * iqr
upper_bound = upperq + 1.5 * iqr
print(f"Ceftamin potential outliers could be values below {lower_bound} and above {upper_bound} could be outliers.")


In [None]:
# Box plot comparing the final tumor volumes across the four regimens
# of interest, one box per regimen.
tumors_to_plot = [cap_tumor, ram_tumor, inf_tumor, ceft_tumor]
regimen_names = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

fig1, ax1 = plt.subplots()
ax1.set(
    title='Tumors',
    ylabel='Final Tumor Volume (mm3)',
    xlabel='Drug Regimen',
)

ax1.boxplot(tumors_to_plot, labels=regimen_names)

# Write the figure to a file named 'boxplot' before showing it.
plt.savefig('boxplot')
plt.show()


## Line and Scatter Plots

In [None]:
# Mean and standard error of tumor volume at each timepoint across the
# Capomulin-treated mice (rounded to 3 decimals).
#
# Rows are taken from clean_combined_data so this subset agrees with the
# cleaned `capomulin` frame used elsewhere in the notebook. String aggregator
# names are used instead of np.mean / st.sem callables: passing np.mean to
# .agg() is deprecated in recent pandas.
capomulin_data = clean_combined_data.loc[clean_combined_data['Drug Regimen'] == 'Capomulin']
cap_time_tumor = capomulin_data.groupby('Timepoint').agg(
    Mean_Tumor_Volume=('Tumor Volume (mm3)', 'mean'),
    SEM_Tumor_Volume=('Tumor Volume (mm3)', 'sem'),
).round(3)

# View the groupby dataframe
cap_time_tumor.head(10)

In [None]:
# Pull out the Capomulin rows for one example mouse (ID b128).
is_b128 = capomulin['Mouse ID'] == 'b128'
single_mouse = capomulin[is_b128]
single_mouse.head()

In [None]:
# Line plot of tumor volume versus timepoint for the single Capomulin
# mouse selected in `single_mouse`.
x_axis = single_mouse["Timepoint"]
tumor_size = single_mouse["Tumor Volume (mm3)"]

# Title fixed: "treatmeant" -> "treatment", and the mouse ID is written
# as it appears in the data ('b128').
plt.title('Capomulin treatment of mouse b128')
# markersize only matters when a marker style is set; none is, so the
# series is drawn as a plain line.
plt.plot(x_axis, tumor_size, linewidth=2, markersize=12)
plt.xlabel('Timepoint (Days)')
plt.ylabel('Tumor Volume (mm3)')

# Write the figure to a file named 'linechart' before showing it.
plt.savefig('linechart')
plt.show()

In [None]:
# Per-mouse weight and mean tumor volume for the Capomulin rows, using
# named aggregation (rounded to 3 decimals).
#
# The aggregators are given as the string 'mean' rather than the np.mean
# callable: recent pandas deprecates passing np.mean to .agg() and
# recommends the string to keep the same result.
cap_mouse_id = capomulin_data.groupby('Mouse ID').agg(
    Mouse_weight=('Weight (g)', 'mean'),
    Tumor_vol_mean=('Tumor Volume (mm3)', 'mean'),
).round(3)
cap_mouse_id.head(30)

In [None]:
# Scatter plot of each Capomulin mouse's weight against its mean tumor volume.
weights = cap_mouse_id['Mouse_weight']
mean_volumes = cap_mouse_id['Tumor_vol_mean']

# Marker area is driven by the mean tumor volume as well (s=...).
plt.scatter(
    weights,
    mean_volumes,
    s=mean_volumes,
    marker='o',
    facecolors='red',
    edgecolors='black',
    alpha=.75,
)

# Title and axis labels for the chart.
plt.title("Mouse weight vs. Avg. Tumor Volume")
plt.xlabel("Mouse weight (g)")
plt.ylabel("Tumor Volume (mm3)")
# Save the figure
#plt.savefig("figures/ScatterWeightTumorVol.png")

plt.show()

## Correlation and Regression

In [None]:
# Pearson correlation between mouse weight and average tumor volume
# for the Capomulin regimen.
correlation = st.pearsonr(
    cap_mouse_id['Mouse_weight'],
    cap_mouse_id['Tumor_vol_mean'],
)
# The first element of the result is the correlation coefficient.
pearson_r = correlation[0]
print(f"The correlation between both factors is {round(pearson_r,2)}")


In [None]:
# Linear regression of average tumor volume on mouse weight for the
# Capomulin regimen, plotted over the scatter of the same data.
x_values = cap_mouse_id['Mouse_weight']
y_values = cap_mouse_id['Tumor_vol_mean']

# Fit once via the imported `stats` module; the bare name `linregress`
# was never imported and raised NameError. The result unpacks into
# (slope, intercept, rvalue, pvalue, stderr).
res = stats.linregress(x_values, y_values)
slope, intercept, rvalue, pvalue, stderr = res
regress_values = x_values * slope + intercept

line_eq = f'y = {round(slope, 2)}x + {round(intercept, 2)}'
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (17, 37), fontsize=15, color="black")
plt.title("Mouse weight vs. Avg. Tumor Volume")
plt.xlabel("Mouse weight (g)")
plt.ylabel("Tumor Volume (mm3)")
# rvalue is the correlation coefficient r; square it to report r-squared.
print(f"The r-squared is: {rvalue**2}")
print(f"The equation of the regression line is: {line_eq}")

# Save the figure
#plt.savefig("figures/ScatterWeightTumorVolRegression.png")

plt.show()