# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
%matplotlib widget

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame.
# Left join keeps every row of study_results and attaches the matching
# mouse_metadata columns; "Mouse ID" is the one key the two tables share,
# so it is passed once rather than as a repeated list entry.
combined_data = pd.merge(study_results, mouse_metadata, how="left", on="Mouse ID")

# Display the data table for preview
combined_data.head(5)

In [None]:
# Checking the number of mice.
# Collect the distinct Mouse ID values from the merged table, then report
# how many there are.
unique_mice_id = pd.unique(combined_data['Mouse ID'])
mice_count = unique_mice_id.size
mice_count

In [None]:
# Our data should be uniquely identified by Mouse ID and Timepoint.
# Find the Mouse IDs that have more than one row for the same Timepoint.
# duplicated() flags the second and later occurrence of each (Mouse ID, Timepoint) pair.
is_repeated_row = combined_data.duplicated(["Mouse ID", "Timepoint"])
# Select the ID column in the same .loc call (no chained indexing) and
# collapse the flagged rows to the distinct IDs involved.
combined_data.loc[is_repeated_row, 'Mouse ID'].unique()


In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Boolean mask marking every row that belongs to mouse "g989".
is_g989 = combined_data['Mouse ID'].eq("g989")
combined_data[is_g989]

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Derive the offending IDs from the data instead of hard-coding one: any
# Mouse ID with more than one row for the same Timepoint is considered
# unreliable.
duplicate_mouse_ids = combined_data.loc[
    combined_data.duplicated(["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
# Drop every row belonging to those mice (not only the repeated rows).
clean_data = combined_data.loc[~combined_data["Mouse ID"].isin(duplicate_mouse_ids)]

In [None]:
# Checking the number of mice in the clean DataFrame.
# One row per distinct Mouse ID remains after de-duplicating the column,
# so the row count is the number of mice.
count = clean_data['Mouse ID'].drop_duplicates().shape[0]
count

## Summary Statistics

In [None]:
# Summary statistics table: mean, median, variance, standard deviation, and
# SEM of the tumor volume for each drug regimen.

# Group once by regimen and select the tumor-volume column once; each
# statistic below is a separate aggregation over that same selection.
drug_regimen_groupby = clean_data.groupby(['Drug Regimen'])
tumor_volume_by_regimen = drug_regimen_groupby['Tumor Volume (mm3)']

drug_regimen_groupby_mean = tumor_volume_by_regimen.mean()
drug_regimen_groupby_median = tumor_volume_by_regimen.median()
drug_regimen_groupby_var = tumor_volume_by_regimen.var()
drug_regimen_groupby_std = tumor_volume_by_regimen.std()
drug_regimen_groupby_sem = tumor_volume_by_regimen.sem()

# Assemble the five per-regimen series into one table, one column per
# statistic; the series share the regimen index so they align row by row.
summary_columns = {
    "Mean Tumor Volume": drug_regimen_groupby_mean,
    "Median Tumor Volume": drug_regimen_groupby_median,
    "Tumor Volume Variance": drug_regimen_groupby_var,
    "Tumor Volume Std. Dev.": drug_regimen_groupby_std,
    "Tumor Volume Std. Err.": drug_regimen_groupby_sem,
}
summary_df = pd.DataFrame(summary_columns)
summary_df

In [None]:
# Alternative summary table built with a single aggregation call: the same
# mean, median, variance, standard deviation, and SEM of the tumor volume
# for each regimen (only one method is required in the solution).
# reference the following site: https://stackoverflow.com/questions/55388610/how-to-calculate-aggregated-summary-statistics-in-pandas-dataframe

# Statistics to compute, in the column order they should appear.
tumor_volume_stats = ['mean', 'median', 'var', 'std', 'sem']
regimen_groups = clean_data.groupby(['Drug Regimen'])
summary_df_two = regimen_groups.agg({'Tumor Volume (mm3)': tumor_volume_stats})
summary_df_two

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using Pandas.
# Count the non-null Timepoint rows per regimen, largest first. The result
# keeps a single 'Timepoint' column indexed by regimen because the pyplot
# bar chart cell reads that column by position.
pandas_bar_chart_info = (
    clean_data[['Drug Regimen', 'Timepoint']]
    .groupby('Drug Regimen')
    .count()
    .sort_values(by='Timepoint', ascending=False)
)
# The bars are row counts (one row per mouse per timepoint), so the title
# describes observations rather than a number of mice.
pandas_bar_chart_info.plot(
    kind="bar",
    figsize=(6, 3.5),
    ylabel="# of Observed Mouse Timepoints",
    legend=False,
    title="Number of Observed Mouse Timepoints per Drug Regimen",
)
# Adjust the layout before showing the figure so the adjustment is part of
# what gets displayed.
plt.tight_layout()
plt.show()


In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
# Per-regimen row counts: the first (only) column of the table built in the
# Pandas bar chart cell, indexed by drug regimen.
timepoint_count = pandas_bar_chart_info.iloc[:, 0]
x_axis = np.arange(len(timepoint_count))

# Open a fresh figure: the notebook uses an interactive backend, where a
# figure from an earlier cell can still be current and would be drawn over.
plt.figure(figsize=(6, 3.5))
plt.bar(x_axis, timepoint_count, color='blue', align="center")
# Label each bar with its regimen name instead of its integer position.
plt.xticks(x_axis, timepoint_count.index, rotation="vertical")
plt.xlabel("Drug Regimen")
plt.ylabel("# of Observed Mouse Timepoints")
plt.title("Number of Observed Mouse Timepoints per Drug Regimen")
plt.tight_layout()
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas

# NOTE: value_counts() counts rows of clean_data (one per mouse per
# timepoint), not distinct mice.
gender_data = clean_data["Sex"].value_counts()

# Open a fresh figure so the pie is not drawn onto a figure left open by an
# earlier cell (the notebook uses an interactive backend).
plt.figure()
gender_data.plot.pie(autopct="%1.1f%%", legend=False)
plt.title('Percentage of Male vs. Female Mouse')
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# NOTE: value_counts() counts rows of clean_data (one per mouse per
# timepoint), not distinct mice.
pie_sizes = clean_data["Sex"].value_counts()

# Take the labels from the counted values themselves. value_counts() orders
# its result by frequency, so a hard-coded ['Male', 'Female'] list would
# mislabel the slices whenever the other category is the larger one.
pie_labels = pie_sizes.index.tolist()

# Open a fresh figure so the pie is not drawn onto a figure left open by an
# earlier cell (the notebook uses an interactive backend).
plt.figure()
plt.pie(pie_sizes, labels=pie_labels,
        autopct="%1.1f%%")
plt.title('Percentage of Male vs. Female Mouse', verticalalignment="center_baseline", loc="left")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
# One subset of clean_data per regimen.
capomulin_mouse = clean_data.loc[clean_data['Drug Regimen'] == "Capomulin"]
ramicane_mouse = clean_data.loc[clean_data['Drug Regimen'] == "Ramicane"]
infubinol_mouse = clean_data.loc[clean_data['Drug Regimen'] == "Infubinol"]
ceftamin_mouse = clean_data.loc[clean_data['Drug Regimen'] == "Ceftamin"]

# Start by getting the last (greatest) timepoint for each mouse.
# Select the Timepoint column before aggregating so max() runs on that one
# column only, instead of on every column with the rest thrown away.
# Each result is a Series indexed by Mouse ID.
capomulin_mouse_max_times = capomulin_mouse.groupby(["Mouse ID"])['Timepoint'].max()
ramicane_mouse_max_times = ramicane_mouse.groupby(["Mouse ID"])['Timepoint'].max()
infubinol_mouse_max_times = infubinol_mouse.groupby(["Mouse ID"])['Timepoint'].max()
ceftamin_mouse_max_times = ceftamin_mouse.groupby(["Mouse ID"])['Timepoint'].max()


# Merge this group df with the original DataFrame to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes

    
    # add subset 

    
    # Determine outliers using upper and lower bounds


In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin

# Rows for mouse "b128" (the mouse named in the plot title), taken from the
# cleaned data like the other cells and ordered by timepoint so the line is
# drawn left to right.
single_mice_capomulin = clean_data.loc[clean_data['Mouse ID'] == "b128"].sort_values("Timepoint")

# Read both axes from the named columns. This keeps each tumor volume paired
# with its own timepoint and does not assume a fixed number of measurements
# or a particular column position.
x_axis = single_mice_capomulin["Timepoint"].to_numpy()
tumor_vol = single_mice_capomulin["Tumor Volume (mm3)"].tolist()

# Open a fresh figure so the line is not drawn onto a figure left open by an
# earlier cell (the notebook uses an interactive backend).
plt.figure()
plt.plot(x_axis, tumor_vol, color="blue")
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume (mm3)")
plt.title("Capomulin treatment on mouse b128")
plt.show()

In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen
mouse_treated_with_capomulin = clean_data.loc[clean_data['Drug Regimen'] == "Capomulin"]

# Average every numeric column per mouse. numeric_only=True is required
# because the frame also holds text columns (e.g. 'Drug Regimen', 'Sex'),
# and recent pandas versions raise a TypeError when asked to average them.
mouse_treated_with_capomulin_mean = mouse_treated_with_capomulin.groupby(['Mouse ID']).mean(numeric_only=True)

# Keep only the two columns being plotted.
columns_wanted = mouse_treated_with_capomulin_mean[['Tumor Volume (mm3)', 'Weight (g)']]

# Turn a pandas dataframe into a list --> https://www.geeksforgeeks.org/get-a-list-of-a-specified-column-of-a-pandas-dataframe/#
tumor_vol_list = columns_wanted["Tumor Volume (mm3)"].tolist()
mice_weight_list = columns_wanted["Weight (g)"].tolist()

# Open a fresh figure so the points are not drawn onto a figure left open by
# an earlier cell (the notebook uses an interactive backend).
plt.figure()
plt.scatter(mice_weight_list, tumor_vol_list, marker="o", facecolors="blue", edgecolors="black",
            alpha=0.75)
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.title('Weight of each Mouse vs Tumor Volume for Mouse Treated with Capomulin')
plt.show()


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model 
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
