## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as stats


In [2]:
# Relative paths of the two CSV inputs (mouse metadata and study results).
mouse_metadata_path, study_results_path = (
    "data/Mouse_metadata.csv",
    "data/Study_results.csv",
)

# Load each CSV into its own DataFrame, in the same order as the paths above.
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)


In [3]:
mouse_metadata

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16
...,...,...,...,...,...
244,z314,Stelasyn,Female,21,28
245,z435,Propriva,Female,12,26
246,z581,Infubinol,Female,24,25
247,z795,Naftisol,Female,13,29


In [4]:
# Load the study results CSV from study_results_path into a DataFrame.
study_results = pd.read_csv(filepath_or_buffer=study_results_path)


In [5]:
study_results

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.000000,0
1,f932,0,45.000000,0
2,g107,0,45.000000,0
3,a457,0,45.000000,0
4,c819,0,45.000000,0
...,...,...,...,...
1888,r944,45,41.581521,2
1889,u364,45,31.023923,3
1890,p438,45,61.433892,1
1891,x773,45,58.634971,4


In [None]:
# Join the per-mouse metadata with the study results on the shared
# "Mouse ID" column. The outer join keeps IDs found in only one of the tables.
merged_metadata_studyResult_df = mouse_metadata.merge(
    study_results,
    how="outer",
    on="Mouse ID",
)

# Preview the combined table
merged_metadata_studyResult_df

In [None]:
# Count the distinct Mouse ID values in the metadata table
# (dropna=False so a missing ID would still be counted, as unique() does).
number_Mice = mouse_metadata["Mouse ID"].nunique(dropna=False)
number_Mice

In [None]:
# Get the duplicate measurements: rows whose (Mouse ID, Timepoint) pair
# occurs more than once in the study results.
#
# The check must use both columns. Every mouse legitimately has one row per
# timepoint, so testing "Mouse ID" alone would flag every row after a mouse's
# first. keep=False marks every copy of a duplicated pair, not only the
# later ones, so all conflicting rows are shown.
DuplicatedMice = study_results[
    study_results.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
]
DuplicatedMice

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
#
# A mouse is treated as a duplicate when the same (Mouse ID, Timepoint) pair
# appears more than once. First collect those IDs, then show every row of the
# merged table that belongs to them (including their non-conflicting rows).
duplicate_mouse_ids = merged_metadata_studyResult_df.loc[
    merged_metadata_studyResult_df.duplicated(
        subset=["Mouse ID", "Timepoint"], keep=False
    ),
    "Mouse ID",
].unique()

merged_metadata_studyResult_df[
    merged_metadata_studyResult_df["Mouse ID"].isin(duplicate_mouse_ids)
]

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
#
# Only mice with conflicting records are removed, i.e. mice for which the
# same (Mouse ID, Timepoint) pair occurs more than once. De-duplicating on
# "Mouse ID" alone would keep a single row per mouse and discard every later
# timepoint, leaving nothing to analyse over time.
mice_with_duplicate_records = merged_metadata_studyResult_df.loc[
    merged_metadata_studyResult_df.duplicated(
        subset=["Mouse ID", "Timepoint"], keep=False
    ),
    "Mouse ID",
].unique()

# .copy() makes the result an independent DataFrame rather than a view-like
# slice of the merged table.
cleanDataFrame = merged_metadata_studyResult_df[
    ~merged_metadata_studyResult_df["Mouse ID"].isin(mice_with_duplicate_records)
].copy()
cleanDataFrame

In [None]:
# Count the distinct Mouse ID values remaining in the clean DataFrame
# (dropna=False so a missing ID would still be counted, as unique() does).
MouseCleen_Number = cleanDataFrame["Mouse ID"].nunique(dropna=False)
MouseCleen_Number


## Summary Statistics

In [None]:
# Summary statistics of tumor volume for each drug regimen:
# mean, median, variance, standard deviation and SEM, each computed as its own
# Series and then assembled into a single summary DataFrame.

# Group once and reuse the grouped tumor-volume column for every statistic.
tumor_by_regimen = cleanDataFrame.groupby('Drug Regimen')['Tumor Volume (mm3)']

Regimen_mean = tumor_by_regimen.mean()
Regimen_median = tumor_by_regimen.median()
Regimen_variance = tumor_by_regimen.var()
Regimen_stdv = tumor_by_regimen.std()
Regimen_sem = tumor_by_regimen.sem()

# One column per statistic, one row per regimen.
summarystatisticsdataframe = pd.DataFrame(
    {
        "Mean": Regimen_mean,
        "Median": Regimen_median,
        "Variance": Regimen_variance,
        "Standard Deviation": Regimen_stdv,
        "SEM": Regimen_sem,
    }
)

summarystatisticsdataframe





In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line.
# agg() is given exactly the five statistics required; describe() would
# instead return count/min/max/quartiles and omit variance and SEM.
cleanDataFrame.groupby("Drug Regimen")["Tumor Volume (mm3)"].agg(["mean", "median", "var", "std", "sem"])


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.

# Count the rows recorded under each drug regimen.
merged_clean_df= cleanDataFrame[["Drug Regimen", "Mouse ID"]].groupby("Drug Regimen")
number_mouse = pd.DataFrame(merged_clean_df["Mouse ID"].count())

# Use DataFrame.plot() in order to create a bar chart of the data
number_mouse.plot(kind="bar", figsize=(10,5))

# Set a title and axis labels for the chart
plt.title("Number of Measurement taken on each drug regimen")
plt.xlabel("Drug Regimen")
plt.ylabel(" Points")

# Adjust the layout BEFORE showing: once plt.show() has rendered the figure,
# a later tight_layout() call can no longer affect it.
plt.tight_layout()

# Display chart
plt.show()


In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.

# Regimen names (bar labels) and the matching per-regimen counts (bar heights).
drug_regimen = number_mouse.index.tolist()
mouse_count = number_mouse["Mouse ID"].tolist()

# One bar position, and one tick, per regimen.
x_axis = np.arange(len(mouse_count))
tick_locations = list(x_axis)

plt.figure(figsize=(10,3))
plt.bar(x_axis, mouse_count, color="blue", alpha=0.9, align="center")
plt.xticks(tick_locations, drug_regimen)

plt.title("Total Number of measurement taken per drug regimen")
plt.xlabel("Drug regimens")
plt.ylabel("Mouse count")

# Set the limits of the x axis from the number of bars, so the frame fits
# however many regimens are present.
plt.xlim(-.60, len(x_axis) - 0.4)
# Set the limits of the y axis from the data (10% headroom) so tall bars are
# never clipped by a fixed upper bound. default=1 covers an empty table.
plt.ylim(0, max(mouse_count, default=1) * 1.1)
plt.show()

# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Group the (Mouse ID, Sex) columns of the metadata by sex and count the
# Mouse ID entries in each group.
groupBygender = mouse_metadata.loc[:, ["Mouse ID", "Sex"]].groupby("Sex")
gender_count = groupBygender.count()["Mouse ID"]
gender_count


In [None]:
# Pie chart of the per-sex counts drawn through the pandas plotting accessor.
explode = [0, 0]  # no wedge is pulled away from the centre
gender_count.plot.pie(
    title="female versus male",
    explode=explode,
    legend=True,
    autopct='%1.1f%%',
    startangle=110,
    shadow=False,
    fontsize=13,
    figsize=(5, 5),
)

plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Count mice per sex. Labels are taken from the grouped index so each wedge
# is always labelled with the category it was counted for, instead of relying
# on a hand-written list matching the groupby sort order.
sex_counts = mouse_metadata.groupby("Sex")["Mouse ID"].count()
gender_count = sex_counts.tolist()
labels = sex_counts.index.tolist()
colors = ["r", "g"]
# One zero offset per wedge (no wedge is pulled out); sized from the data so
# its length always matches the number of wedges.
explode = tuple(0 for _ in gender_count)

plt.pie(gender_count ,explode=explode, labels=labels, colors=colors,autopct="%1.1f%%", shadow=True, startangle=110)

# Equal aspect ratio so the pie is drawn as a circle.
plt.axis("equal")
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# First step: the last (greatest) timepoint recorded for each mouse.
grouped_df = study_results.groupby("Mouse ID")
last_timepoint_per_mouse = grouped_df['Timepoint'].max()

# Same values in two shapes: one keeps the original "Timepoint" column name,
# the other names it "Timepoint Max" so it can sit next to "Timepoint" after a merge.
max_timepoint = last_timepoint_per_mouse.reset_index()
max_timepoint_df = last_timepoint_per_mouse.rename("Timepoint Max").reset_index()

    

In [None]:
# Attach each mouse's "Timepoint Max" to every row of the clean DataFrame
# (left join on "Mouse ID", so all rows of cleanDataFrame are kept).
merged_max = cleanDataFrame.merge(
    max_timepoint_df,
    how="left",
    on="Mouse ID",
)
merged_max

In [None]:
# Put treatments into a list for the loop below (and later for plot labels)
drug_list = ['Ceftamin', 'Ramicane','Infubinol','Capomulin']

# Evenly spaced values 0, 5, ..., 65. Not referenced by the rest of this cell;
# kept so any other cell relying on the name keeps working.
tumor_vol = np.arange(0, 70, 5)

# Locate the rows which contain mice on each drug of interest
drug_sorted = merged_max[merged_max['Drug Regimen'].isin(drug_list)]

# Keep only each mouse's final measurement: the row whose Timepoint equals
# that mouse's greatest recorded timepoint.
tumor_max = drug_sorted.loc[drug_sorted["Timepoint"] == drug_sorted["Timepoint Max"]]

# Final tumor volumes per regimen
Ceftamin = tumor_max[tumor_max['Drug Regimen']== "Ceftamin"]['Tumor Volume (mm3)']
Ramicane = tumor_max[tumor_max['Drug Regimen']== "Ramicane"]['Tumor Volume (mm3)']
Infubinol = tumor_max[tumor_max['Drug Regimen']== "Infubinol"]['Tumor Volume (mm3)']
Capomulin = tumor_max[tumor_max['Drug Regimen']== "Capomulin"]['Tumor Volume (mm3)']

# Same order as drug_list, so the two can be zipped here and used together
# as data/labels for plotting.
drug_subset = [Ceftamin,Ramicane,Infubinol,Capomulin]

# Calculate the IQR and quantitatively determine if there are any potential outliers.
# This is done separately for each regimen: pooling all four regimens into
# one distribution would hide values that are unusual within their own regimen.
# After the loop, quartiles/lowerq/upperq/iqr/lower_bound/upper_bound hold the
# values of the last regimen in drug_list.
for drug_name, final_volumes in zip(drug_list, drug_subset):
    quartiles = final_volumes.quantile([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq-lowerq

    # Conventional 1.5 * IQR fences around the quartiles.
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    outliers = final_volumes[(final_volumes < lower_bound) | (final_volumes > upper_bound)]

    print(f"--- {drug_name} ---")
    print(f"The lower quartile of final tumor volume is: {lowerq}")
    print(f"The upper quartile of final tumor volume is: {upperq}")
    print(f"The interquartile range of final tumor volume is: {iqr}")
    print(f"The median of final tumor volume is: {quartiles[0.5]}")
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")
    print(f"Potential outliers: {outliers.tolist()}")


In [None]:
# Box plot of the final tumor volume of each mouse, one box per regimen of interest.
fig1, ax1 = plt.subplots()

# One box per entry of drug_subset, labelled with the matching name in drug_list.
ax1.boxplot(drug_subset, labels=drug_list)
ax1.set(title='Drug Regimens', ylabel='Final Tumor Volume (mm3)')
plt.show()



## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

# All rows for the Capomulin regimen (also reused by later cells).
capomulin_df = merged_max[(merged_max['Drug Regimen']== "Capomulin")]

# Tumor volume of a single Capomulin mouse (ID 'g288'), indexed by timepoint.
# sort_index() guarantees the line is drawn in time order regardless of row order.
mouseid_capomulin = (
    capomulin_df[capomulin_df['Mouse ID']=='g288'][['Timepoint','Tumor Volume (mm3)']]
    .set_index('Timepoint')
    .sort_index()
)

# A thin line with point markers keeps the individual measurements visible;
# a very thick stroke would hide the shape of the curve.
line_ax = mouseid_capomulin.plot(
    figsize=(10, 8),
    linewidth=2,
    marker='o',
    color='red',
    title="Capomulin treatment of mouse g288",
)
line_ax.set_xlabel("Timepoint")
line_ax.set_ylabel("Tumor Volume (mm3)")
plt.show()

In [None]:
# Scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen.

# Mean tumor volume per (Mouse ID, Weight) pair, returned as flat columns.
average_tumor = (
    capomulin_df
    .groupby(['Mouse ID', 'Weight (g)'], as_index=False)['Tumor Volume (mm3)']
    .mean()
)

average_tumor.plot.scatter(
    x="Weight (g)",
    y="Tumor Volume (mm3)",
    grid=True,
    figsize=(8,8),
    title=" Mouse Weight Vs. Average Tumor Volume",
)

plt.show()

## Correlation and Regression

In [None]:
# Pearson correlation between mouse weight and average tumor volume
# for the Capomulin regimen.

mouse_weight = average_tumor['Weight (g)']
avg_tumor = average_tumor['Tumor Volume (mm3)']

# pearsonr returns (coefficient, p-value); only the coefficient is reported.
correlation = stats.pearsonr(mouse_weight, avg_tumor)
pearson_r = round(correlation[0], 2)
print(f"The correlation between mouse weight and average tumor volume for the Capomulin regimen is {pearson_r}")



In [None]:
# Linear regression of average tumor volume on mouse weight (Capomulin regimen).
# Weight is the predictor (x) and average tumor volume the response (y),
# matching the axes of the weight-vs-tumor scatter plot.
x_values = mouse_weight
y_values = avg_tumor

(slope, intercept, rvalue, pvalue, stderr) = stats.linregress(x_values, y_values)

# Fitted tumor volume at each observed weight.
regress_values = x_values * slope + intercept

# Build the equation text with an explicit sign so a negative intercept
# reads "- 1.23" rather than "+ -1.23".
intercept_sign = "+" if intercept >= 0 else "-"
line_eq = f"y = {round(slope, 2)}x {intercept_sign} {abs(round(intercept, 2))}"

plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"g")
# Place the equation in axes-fraction coordinates (top-left corner) so it is
# always inside the plot area, whatever the data range is.
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=12, color="red")
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()
