# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# %matplotlib notebook
%matplotlib ipympl

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two CSV inputs
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse data and the study results, in that order
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join both tables on "Mouse ID"; the outer join keeps IDs that appear
# in only one of the two inputs
combined_data = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")

# Leave the merged table as the cell's last expression so it is displayed
combined_data

In [None]:
# Count how many distinct Mouse ID values the merged data contains.
mouse_ids = combined_data["Mouse ID"]
unique_number_mice = mouse_ids.nunique()
unique_number_mice

In [None]:
# Each (Mouse ID, Timepoint) pair is expected to occur only once.
# Flag every repeated occurrence of a pair, then collect the distinct
# Mouse IDs those flagged rows belong to.
duplicate_row_mask = combined_data.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mice = combined_data.loc[duplicate_row_mask, "Mouse ID"].unique()
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Select every row whose Mouse ID is one of the duplicated IDs, not only the
# repeated (Mouse ID, Timepoint) rows, so the mouse's full record is shown.
all_ID_duplicates = combined_data[combined_data["Mouse ID"].isin(duplicate_mice)]
all_ID_duplicates

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Remove every row that belongs to a duplicated Mouse ID and keep all rows
# (all timepoints) of the remaining mice. drop_duplicates(subset=["Mouse ID"])
# would instead keep only the first row of each mouse.
cleaned_data = combined_data[~combined_data["Mouse ID"].isin(duplicate_mice)]
cleaned_data

In [None]:
# Count the distinct Mouse ID values left in the clean DataFrame.
clean_mouse_ids = cleaned_data["Mouse ID"]
mice_count = clean_mouse_ids.nunique()
mice_count

## Summary Statistics

In [None]:
# Summary statistics table: mean, median, variance, standard deviation and
# SEM of the tumor volume for each drug regimen.
# NOTE(review): the statistics are computed from combined_data, not
# cleaned_data - confirm that this is intended.

# Group the tumor volumes by regimen once and reuse the grouping below
tumor_by_regimen = combined_data.groupby("Drug Regimen")["Tumor Volume (mm3)"]

regimen_mean = tumor_by_regimen.mean()
regimen_median = tumor_by_regimen.median()
regimen_var = tumor_by_regimen.var()
regimen_std = tumor_by_regimen.std()
regimen_sem = tumor_by_regimen.sem()

# One column per statistic, one row per drug regimen
summary_columns = {
    "Tumor Volume Mean": regimen_mean,
    "Tumor Volume Median": regimen_median,
    "Tumor Volume Variance": regimen_var,
    "Tumor Volume Std. Dev.": regimen_std,
    "Tumor Volume Std. Error": regimen_sem,
}
summary_statistics_regimen = pd.DataFrame(summary_columns)
summary_statistics_regimen

In [None]:
# Alternative way to build the summary statistics table (mean, median,
# variance, standard deviation and SEM of the tumor volume per regimen):
# a single aggregation call instead of one call per statistic.
tumor_volume_by_regimen = combined_data.groupby("Drug Regimen")["Tumor Volume (mm3)"]
aggregated_statistics_summary = tumor_volume_by_regimen.agg(
    mean="mean",
    median="median",
    var="var",
    std="std",
    sem="sem",
).reset_index()  # turn "Drug Regimen" from the index back into a column
aggregated_statistics_summary

In [None]:
import numpy as np

## Bar and Pie Charts

In [None]:
# Bar plot, drawn with Pandas, of the total number of rows
# (Mouse ID/Timepoints) recorded for each drug regimen.

# Rows per regimen, largest count first
rows_per_regimen = combined_data.groupby("Drug Regimen").size()
mouse_drug_regimen = rows_per_regimen.sort_values(ascending=False)

pandas_plot = mouse_drug_regimen.plot.bar(
    facecolor="blue",
    figsize=(6, 6),
    width=0.5,
    title="Total Number of Mice per Drug Regimen",
    xlabel="Drug Regimen",
    ylabel="# of Observed Mouse Timepoints",
)
# Adjust the layout so the tick labels fit inside the figure
plt.tight_layout()

In [None]:
# Display the row count per drug regimen (sorted in descending order above)
mouse_drug_regimen

In [None]:
# Bar heights and x positions for the pyplot bar chart of the total number of
# rows (Mouse ID/Timepoints) for each drug regimen.
# Take the counts from the per-regimen series computed from the data instead
# of typing them in by hand, so they cannot drift from the dataset.
timepoints = mouse_drug_regimen.tolist()
x_axis = np.arange(len(timepoints))

# Show the values that will be plotted
print(timepoints)
print(x_axis)

In [None]:
%matplotlib ipympl
# Draw on a fresh figure so the bars never land on an earlier plot
plt.figure(figsize=(6, 6))
plt.bar(x_axis, timepoints, color="b", align="center")

# Label each bar with its drug regimen; the names come from the same series
# (same descending order) that the bar heights were taken from
plt.xticks(x_axis, mouse_drug_regimen.index, rotation=90)
plt.xlabel("Drug Regimen")
plt.ylabel("# of Observed Mouse Timepoints")
plt.tight_layout()
plt.show()

In [None]:
# Get the unique mice with their gender
# Keep one row per Mouse ID before counting, so each mouse contributes once
# rather than once per recorded timepoint.
unique_mice = combined_data.drop_duplicates(subset="Mouse ID")
sex_totals = unique_mice["Sex"].value_counts()
print(sex_totals)

In [None]:
# Generate a pie chart, using Pandas, showing the distribution of unique female versus male mice used in the study
import pandas as pd
import matplotlib.pyplot as plt

# Count each mouse once (one row per Mouse ID) and derive the slice sizes
# from the data instead of hard-coding per-row totals
unique_mice_by_sex = (
    combined_data.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()
)

# Labels and sizes for pie sections
pandas_pie_chart = pd.DataFrame({
    "Sex": unique_mice_by_sex.index,
    "Count": unique_mice_by_sex.values,
})

# Make the pie chart; the "Sex" index supplies the slice labels
pandas_pie_chart.set_index("Sex").plot.pie(
    y="Count",
    title="Sex Distribution among Mice",
    autopct="%1.1f%%",
    colors=["blue", "orange"],
    legend=False,
)
plt.show()


In [None]:
# Generate a pie chart, using pyplot, showing the distribution of unique female versus male mice used in the study

# Get the unique mice with their gender: keep one row per Mouse ID so every
# mouse is counted once, then count the mice of each sex
unique_mice_by_sex = (
    combined_data.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()
)
labels = unique_mice_by_sex.index.tolist()
count = unique_mice_by_sex.tolist()

# Make the pie chart on its own figure
plt.figure()
colors = ["blue", "orange"]
plt.title("Sex Distribution among Mice")
plt.ylabel("Count")
plt.pie(count, labels=labels, colors=colors, autopct="%1.1f%%")
plt.show()


In [None]:
import pandas as pd

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse, used below for four of the treatment
# regimens: Capomulin, Ramicane, Infubinol, and Ceftamin.

# Step 1: last (greatest) timepoint recorded for each mouse, as a two-column
# frame of "Mouse ID" and "Timepoint"
last_timepoint = combined_data.groupby("Mouse ID")["Timepoint"].max().reset_index()

# Step 2: left-join back onto the full data on (Mouse ID, Timepoint) to pull
# in the remaining columns, including the tumor volume, at that timepoint
final_tumor_volume = pd.merge(
    last_timepoint,
    combined_data,
    on=["Mouse ID", "Timepoint"],
    how="left",
)
final_tumor_volume

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Put treatments into a list for for loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Final tumor volumes per treatment, in the same order as `treatments`
# (for plotting)
total_tumor_volume = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in treatments:

    # Locate the rows which contain mice on each drug and get the tumor volumes
    tumor_volumes = final_tumor_volume.loc[
        final_tumor_volume["Drug Regimen"] == drug, "Tumor Volume (mm3)"
    ]

    # Collect the volumes; the list was previously created but never filled
    total_tumor_volume.append(tumor_volumes)

    # Determine outliers using upper and lower bounds (1.5 * IQR rule)
    quartiles = tumor_volumes.quantile([0.25, 0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Values strictly outside [lower_bound, upper_bound] are potential outliers
    outliers = tumor_volumes[(tumor_volumes < lower_bound) | (tumor_volumes > upper_bound)]
    print(f"{drug}'s potential outliers: {outliers}")


In [None]:
# Box plot of the final tumor volume distribution, one box per treatment group.
import pandas as pd
import matplotlib.pyplot as plt

# One series of final tumor volumes per treatment, in `treatments` order
boxplot = []
for treatment in treatments:
    on_treatment = final_tumor_volume["Drug Regimen"] == treatment
    boxplot.append(final_tumor_volume.loc[on_treatment, "Tumor Volume (mm3)"])

fig1, ax1 = plt.subplots()
ax1.set_title("Tumor Volume Distribution per Drug Regimen")
ax1.set_ylabel("Final Tumor Volume (mm3)")

# Draw outlier points as red circles
outlier_style = {"markerfacecolor": "red", "marker": "o"}
ax1.boxplot(boxplot, labels=treatments, widths=0.5, flierprops=outlier_style)
plt.show()

## Line and Scatter Plots

In [None]:
# All rows recorded under the Capomulin regimen
on_capomulin = combined_data["Drug Regimen"] == "Capomulin"
capomulin = combined_data[on_capomulin]

# Rows of a single Capomulin mouse (ID "x401")
is_x401 = capomulin["Mouse ID"] == "x401"
x401 = capomulin[is_x401]

print("Columns in x401:", x401.columns)

In [None]:
# Set line graph parameters
x_axis = x401["Timepoint"]
tumor_volume = x401["Tumor Volume (mm3)"]

# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
fig1, ax1 = plt.subplots(figsize=(6, 6))

# Plot the series once; a second, default-styled plot of the same data would
# only draw over this styled line
ax1.plot(x_axis, tumor_volume, linewidth=2, color="blue", label="mm3")
ax1.set_xlabel("Timepoints (days)")
ax1.set_ylabel("Tumor Volume (mm3)")
ax1.set_title("Capomulin Treatment of Mouse x401")

plt.show()

In [None]:
# Scatter plot of mouse weight vs. the average observed tumor volume for the
# entire Capomulin regimen.

# Per-mouse mean of tumor volume and weight
per_mouse = capomulin.groupby("Mouse ID")
average_tumor_volume = per_mouse[["Tumor Volume (mm3)", "Weight (g)"]].mean()

fig1, ax1 = plt.subplots(figsize=(6, 6))
ax1.scatter(
    average_tumor_volume["Weight (g)"],
    average_tumor_volume["Tumor Volume (mm3)"],
    s=90,
    color="blue",
)
ax1.set_title("Mouse Weight vs Tumor Volume Average")
ax1.set_xlabel("Weight (g)")
ax1.set_ylabel("Average Tumor Volume (mm3)")

plt.show()


## Correlation and Regression

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import linregress
import scipy.stats as st

# Mouse weight and average observed tumor volume for the entire Capomulin regimen
x_values = average_tumor_volume["Weight (g)"]
y_values = average_tumor_volume["Tumor Volume (mm3)"]

# Calculate the correlation coefficient and a linear regression model
print(f"The correlation coefficient between weight and average tumor volume is {round(st.pearsonr(x_values, y_values)[0],2)}")

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

# Fitted tumor volume at each observed weight, and the equation text for the plot
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Add the linear regression equation and line to plot
fig, ax = plt.subplots()
plt.scatter(x_values, y_values, s=90, color="blue")
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (20, 38), fontsize=15, color="red")
plt.title("Mouse Weight vs Tumor Volume Average")
plt.xlabel("Weight (g)")
# Closing parenthesis added; the label previously read "(mm3"
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()