# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# The join key is "Mouse ID"; it is named once here (it was previously listed
# twice in `on`). how="right" keeps every row of study_results.
mouse_data_complete = pd.merge(
    mouse_metadata,
    study_results,
    how="right",
    on="Mouse ID",
)

# Display the data table for preview
mouse_data_complete

In [None]:
# Count the distinct mice present in the combined data (missing IDs ignored).
unique_mouse_ids = mouse_data_complete['Mouse ID'].dropna().unique()
mouse_count = len(unique_mouse_ids)
print(mouse_count)

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# Rows are only true duplicates when BOTH Mouse ID and Timepoint repeat;
# checking Mouse ID alone would flag every repeat measurement of every mouse.
mouse_count_duplicates = mouse_data_complete[
    mouse_data_complete.duplicated(subset=['Mouse ID', 'Timepoint'])
]

# Show the ID(s) of the mice that own those duplicated rows.
mouse_count_duplicates['Mouse ID'].unique()

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Step 1: find which mice have a repeated (Mouse ID, Timepoint) pair.
duplicated_pairs = mouse_data_complete.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
duplicate_mouse_ids = mouse_data_complete.loc[duplicated_pairs, 'Mouse ID'].unique()

# Step 2: pull EVERY row for those mice, not just the conflicting rows.
mouse_ID_duplicates = mouse_data_complete[mouse_data_complete['Mouse ID'].isin(duplicate_mouse_ids)]

mouse_ID_duplicates

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# drop_duplicates(keep='last') would keep the mouse and one row from each
# conflicting pair; instead every row of an affected mouse is removed.
conflicting_rows = mouse_data_complete.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
mice_to_drop = mouse_data_complete.loc[conflicting_rows, 'Mouse ID'].unique()
clean_mouse_data = mouse_data_complete[~mouse_data_complete['Mouse ID'].isin(mice_to_drop)]
clean_mouse_data.head()

In [None]:
# Checking the number of mice in the clean DataFrame.
# nunique() counts distinct mice; DataFrame.count() would instead report the
# number of non-null rows in every column.
clean_mouse_data['Mouse ID'].nunique()

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation and standard error of the mean.
# The data is grouped once and each statistic is taken from that grouping,
# then the resulting series are assembled into a single summary DataFrame.
tumor_by_regimen = clean_mouse_data.groupby('Drug Regimen')['Tumor Volume (mm3)']

mean_T_vol = tumor_by_regimen.mean()
medi_T_vol = tumor_by_regimen.median()
var_vol_var = tumor_by_regimen.var()
T_vol_st_dev = tumor_by_regimen.std()
T_vol_st_err = tumor_by_regimen.sem()

summary_columns = {
    "Mean Tumor Volume": mean_T_vol,
    "Median Tumor Volume": medi_T_vol,
    "Tumor Volume Variance": var_vol_var,
    "Tumor Volume Std. Dev.": T_vol_st_dev,
    "Tumor Volume Std. Err.": T_vol_st_err,
}
summary_stats_df = pd.DataFrame(summary_columns)

# Display the summary statistics
summary_stats_df

In [None]:
# Same summary statistics as above, produced with a single agg() call.
stat_names = ['mean', 'median', 'var', 'std', 'sem']
regimen_groups = clean_mouse_data.groupby(['Drug Regimen'])
summary_aggregation_df = regimen_groups[['Tumor Volume (mm3)']].agg(stat_names)
summary_aggregation_df

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
# value_counts() yields one entry per regimen: the number of rows
# (mouse/timepoint observations) recorded for it.
total_timepoints = clean_mouse_data['Drug Regimen'].value_counts()

# Draw the chart through the pandas plotting API and label the returned Axes.
pandas_bar_ax = total_timepoints.plot(kind='bar')
pandas_bar_ax.set_xlabel('Drug Regimen')
pandas_bar_ax.set_ylabel('# of Observed Mouse Timepoints')
plt.show()

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
# Uses plt.bar directly (not the pandas wrapper) on the per-regimen counts.
bar_plot = plt.bar(total_timepoints.index, total_timepoints.values)
plt.xlabel('Drug Regimen')
# The bars count timepoint observations (rows), not individual mice.
plt.ylabel('# of Observed Mouse Timepoints')
# Regimen names overlap when drawn horizontally.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pie chart of the sex distribution, drawn through the pandas plotting API.
# pie_plot holds the number of rows recorded for each value of 'Sex'.
pie_plot = clean_mouse_data['Sex'].value_counts()
pie_plot.plot(kind='pie', autopct="%1.1f%%")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Labels and wedge sizes come from the data rather than hard-coded
# percentages, so the chart stays correct if the cleaned data changes.
sex_counts = clean_mouse_data['Sex'].value_counts()
labels = sex_counts.index.tolist()
numbers = sex_counts.tolist()

plot = plt.pie(numbers, labels=labels, autopct="%1.1f%%")
# plt.ylabel must be CALLED; assigning to it (plt.ylabel = 'Sex') replaces the
# pyplot function with a string and breaks every later plt.ylabel(...) call.
plt.ylabel('Sex')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Create a grouped Series that holds the last (greatest) timepoint for each mouse.
max_timepoints = clean_mouse_data.groupby('Mouse ID')['Timepoint'].agg('max')

# Merge the last timepoints with the cleaned DataFrame.
# Joining on BOTH 'Mouse ID' and 'Timepoint' keeps only each mouse's row at
# its final timepoint; joining on 'Mouse ID' alone returns every row and
# splits Timepoint into Timepoint_x / Timepoint_y.
merged_df = pd.merge(
    max_timepoints.reset_index(),
    clean_mouse_data,
    on=['Mouse ID', 'Timepoint'],
    how='left',
)

# View the merged DataFrame
merged_df.head()

In [None]:
# Regimens of interest, and the rows of the cleaned data that belong to them.
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
in_selected_regimens = clean_mouse_data['Drug Regimen'].isin(treatments)
df_filtered = clean_mouse_data.loc[in_selected_regimens]

In [None]:
# Final tumor volume per mouse, collected per treatment.
# 1. Last (greatest) timepoint recorded for each mouse in the selected regimens.
last_timepoint = df_filtered.groupby('Mouse ID')['Timepoint'].max()

# 2. Look up the full row at that (Mouse ID, Timepoint) pair.
merged_df = pd.merge(last_timepoint, df_filtered, on=['Mouse ID', 'Timepoint'], how='left')

# 3. One list of final volumes per treatment, in the order of `treatments`.
tumor_vol_data = []
for treatment in treatments:
    is_treatment = merged_df['Drug Regimen'] == treatment
    final_volumes = merged_df.loc[is_treatment, 'Tumor Volume (mm3)']
    tumor_vol_data.append(final_volumes.tolist())

In [None]:
# Assemble the per-treatment final tumor volumes into one DataFrame,
# one column per regimen (positions follow the order of `treatments`).
capomulin_final, ramicane_final, infubinol_final, ceftamin_final = tumor_vol_data[:4]
tumor_vol_df = pd.DataFrame({
    'Capomulin': capomulin_final,
    'Ramicane': ramicane_final,
    'Infubinol': infubinol_final,
    'Ceftamin': ceftamin_final,
})

In [None]:
# Final tumor volumes for each treatment, as individual Series.
capomulin_volumes = tumor_vol_df['Capomulin']
ramicane_volumes = tumor_vol_df['Ramicane']
infubinol_volumes = tumor_vol_df['Infubinol']
ceftamin_volumes = tumor_vol_df['Ceftamin']

In [None]:
# Single-column DataFrame for each treatment (list selector keeps 2-D shape).
capomulin_subset = tumor_vol_df.loc[:, ['Capomulin']]
ramicane_subset = tumor_vol_df.loc[:, ['Ramicane']]
infubinol_subset = tumor_vol_df.loc[:, ['Infubinol']]
ceftamin_subset = tumor_vol_df.loc[:, ['Ceftamin']]

In [None]:
# First (Q1) and third (Q3) quartile of the final tumor volume per treatment.
# Both quantiles are requested in one call and then picked out by label.
capomulin_quartiles = capomulin_subset.quantile([0.25, 0.75])
capomulin_q1, capomulin_q3 = capomulin_quartiles.loc[0.25], capomulin_quartiles.loc[0.75]

ramicane_quartiles = ramicane_subset.quantile([0.25, 0.75])
ramicane_q1, ramicane_q3 = ramicane_quartiles.loc[0.25], ramicane_quartiles.loc[0.75]

infubinol_quartiles = infubinol_subset.quantile([0.25, 0.75])
infubinol_q1, infubinol_q3 = infubinol_quartiles.loc[0.25], infubinol_quartiles.loc[0.75]

ceftamin_quartiles = ceftamin_subset.quantile([0.25, 0.75])
ceftamin_q1, ceftamin_q3 = ceftamin_quartiles.loc[0.25], ceftamin_quartiles.loc[0.75]

In [None]:
# Interquartile range (Q3 - Q1) for each treatment.
capomulin_iqr = capomulin_q3.sub(capomulin_q1)
ramicane_iqr = ramicane_q3.sub(ramicane_q1)
infubinol_iqr = infubinol_q3.sub(infubinol_q1)
ceftamin_iqr = ceftamin_q3.sub(ceftamin_q1)

In [None]:
# Outlier fences per treatment: values below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR fall outside the bounds.
capomulin_lower_bound = capomulin_q1 - capomulin_iqr * 1.5
capomulin_upper_bound = capomulin_q3 + capomulin_iqr * 1.5

ramicane_lower_bound = ramicane_q1 - ramicane_iqr * 1.5
ramicane_upper_bound = ramicane_q3 + ramicane_iqr * 1.5

infubinol_lower_bound = infubinol_q1 - infubinol_iqr * 1.5
infubinol_upper_bound = infubinol_q3 + infubinol_iqr * 1.5

ceftamin_lower_bound = ceftamin_q1 - ceftamin_iqr * 1.5
ceftamin_upper_bound = ceftamin_q3 + ceftamin_iqr * 1.5

In [None]:
# Create a boxplot for each treatment.
# dropna() guards against missing values, which matplotlib would otherwise
# render as an empty box for that treatment.
plt.boxplot([tumor_vol_df[treatment].dropna() for treatment in treatments])

# Set the x-axis labels to the treatments (boxplot positions start at 1)
plt.xticks(range(1, len(treatments) + 1), treatments)

# Add a y-axis label. plt.ylabel must be called; assigning to it would
# replace the pyplot function with a string and draw no label.
plt.ylabel('Final Tumor Volume')

# Show the plot
plt.show()


## Line and Scatter Plots

In [None]:
# Select the Capomulin rows, then the measurements of one mouse on that
# regimen, as input for the tumor volume vs. time point line plot.
is_capomulin = clean_mouse_data['Drug Regimen'] == 'Capomulin'
capomulin_df = clean_mouse_data.loc[is_capomulin]

mouse_id = "b128"
is_chosen_mouse = capomulin_df['Mouse ID'] == mouse_id
mouse_data = capomulin_df.loc[is_chosen_mouse]
mouse_data


In [None]:
# Line plot of tumor volume against time point for the chosen mouse.
timepoints = mouse_data['Timepoint']
volumes = mouse_data['Tumor Volume (mm3)']
plt.plot(timepoints, volumes)

# Axis labels: x is the time point in days, y is the tumor volume.
plt.xlabel('Timepoint (days)')
plt.ylabel('Tumor Volume (mm3)')

# Render the figure
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
# One row per mouse: mean tumor volume and mean weight across its timepoints.
mouse_group = capomulin_df.groupby('Mouse ID').agg({'Tumor Volume (mm3)': 'mean', 'Weight (g)': 'mean'})

# Create a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
plt.scatter(mouse_group['Weight (g)'], mouse_group['Tumor Volume (mm3)'])

# Labels and title are set by CALLING the pyplot functions. Assigning to them
# (plt.xlabel = '...') replaces the function with a string, draws nothing and
# breaks later calls in the same session.
plt.xlabel('Weight (g)')
plt.ylabel('Tumor Volume (mm3)')
plt.title('Average Tumor Volume vs. Mouse Weight for Capomulin Regimen')

# Show the plot
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
weights = mouse_group['Weight (g)']
avg_volumes = mouse_group['Tumor Volume (mm3)']

# linregress lives in scipy.stats, imported in this notebook as `st`;
# the bare name `linregress` is never imported and raises NameError.
slope, intercept, r_value, p_value, std_err = st.linregress(weights, avg_volumes)

# r_value is the correlation coefficient of the fit.
print(f"The correlation between mouse weight and the average tumor volume is {r_value:.2f}")

# Create a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
plt.scatter(weights, avg_volumes)

# Calculate the predicted tumor volume values for the regression line
regression_values = weights * slope + intercept

# Plot the regression line
plt.plot(weights, regression_values, color='red')

# Labels and title are set by CALLING the pyplot functions; assigning to
# them would overwrite the functions and leave the figure unlabelled.
plt.xlabel('Weight (g)')
plt.ylabel('Tumor Volume (mm3)')
plt.title('Average Tumor Volume vs. Mouse Weight for Capomulin Regimen')

# Show the plot
plt.show()