In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import linregress

# Load the datasets (paths are relative to the notebook's working directory)
mouse_metadata_path = '../matplotlib-challenge/Mouse_metadata.csv'  # Assuming the file names
study_results_path = '../matplotlib-challenge/Study_results.csv'

# Read the CSV files
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Attach each mouse's metadata to its study measurements via the shared "Mouse ID" key
merged_data = pd.merge(study_results, mouse_metadata, on="Mouse ID")

# Number of distinct mice before any cleaning; printed so the check is actually visible
unique_mice_count = merged_data["Mouse ID"].nunique()
print(f"Unique mice before cleaning: {unique_mice_count}")

# Rows that share a (Mouse ID, Timepoint) pair with another row.
# keep=False flags EVERY row of a conflicting pair (the default keep='first'
# would hide the first occurrence), and sorting places the conflicting
# measurements next to each other for comparison.
duplicate_rows = merged_data[
    merged_data.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
].sort_values(["Mouse ID", "Timepoint"])

# Display duplicate rows if found
duplicate_rows

In [None]:
# Exclude every record of any mouse that appears in duplicate_rows.
# Note: this drops the whole mouse (all of its timepoints), not only the
# repeated (Mouse ID, Timepoint) rows.
flagged_mouse_ids = duplicate_rows["Mouse ID"]
belongs_to_flagged_mouse = merged_data["Mouse ID"].isin(flagged_mouse_ids)
cleaned_data = merged_data[~belongs_to_flagged_mouse]

# Count the distinct mice that remain after the exclusion
cleaned_unique_mice_count = cleaned_data["Mouse ID"].nunique()

cleaned_unique_mice_count

In [None]:
# Per-regimen summary of tumor volume: mean, median, variance, standard
# deviation and standard error of the mean, one row per "Drug Regimen".
summary_stats = cleaned_data.groupby("Drug Regimen").agg(
    mean_tumor_vol=("Tumor Volume (mm3)", "mean"),
    median_tumor_vol=("Tumor Volume (mm3)", "median"),
    tumor_var=("Tumor Volume (mm3)", "var"),
    tumor_std=("Tumor Volume (mm3)", "std"),
    tumor_sem=("Tumor Volume (mm3)", "sem")
)

# End the cell with the bare expression so the notebook renders the table with
# its rich DataFrame display instead of plain printed text.
summary_stats


In [None]:
# Number of recorded rows (one per mouse/timepoint observation) for each drug
# regimen; value_counts orders the regimens from most to fewest rows.
timepoint_counts = cleaned_data["Drug Regimen"].value_counts()

# Bar chart with Pandas (same figure size as the Matplotlib version below so
# the two charts are directly comparable)
timepoint_counts.plot(kind='bar', figsize=(10, 6), title="Number of Timepoints for Each Drug Regimen (Pandas)", xlabel="Drug Regimen", ylabel="Number of Timepoints", rot=45)
# Render the Pandas figure before a new figure is opened for the second chart
plt.show()

# Now create the same bar chart using Matplotlib
plt.figure(figsize=(10,6))
plt.bar(timepoint_counts.index, timepoint_counts.values)
plt.title("Number of Timepoints for Each Drug Regimen (Matplotlib)")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Timepoints")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Distribution of male vs. female MICE: cleaned_data holds one row per
# mouse/timepoint observation, so counting "Sex" directly would weight each
# mouse by how many timepoints it has. Keep a single row per mouse first so
# every mouse is counted exactly once.
# NOTE(review): assumes "Sex" is constant across a mouse's rows — confirm.
gender_counts = cleaned_data.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()

# Pie chart with Pandas (startangle matches the Matplotlib version so the two
# charts are identical)
gender_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, title="Distribution of Male vs Female Mice (Pandas)", ylabel='')
# Render the Pandas figure before a new figure is opened for the second chart
plt.show()

# Now create the same pie chart using Matplotlib
plt.figure(figsize=(6,6))
plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90)
plt.title("Distribution of Male vs Female Mice (Matplotlib)")
plt.show()


In [None]:
# The four regimens examined in detail
promising_drugs = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Greatest timepoint recorded for each mouse, as a two-column frame
# (Mouse ID, Timepoint)
max_timepoints = cleaned_data.groupby("Mouse ID", as_index=False)["Timepoint"].max()

# Joining back on (Mouse ID, Timepoint) keeps only each mouse's row at its
# greatest timepoint, i.e. its final tumor volume measurement
final_tumor_volume = max_timepoints.merge(cleaned_data, on=["Mouse ID", "Timepoint"])

# Restrict to the four regimens of interest
in_promising_regimen = final_tumor_volume["Drug Regimen"].isin(promising_drugs)
final_tumor_volume_filtered = final_tumor_volume[in_promising_regimen]

# Final tumor volumes per regimen, keyed by drug name in promising_drugs order
tumor_volumes_by_drug = {
    drug: final_tumor_volume_filtered.loc[
        final_tumor_volume_filtered["Drug Regimen"] == drug, "Tumor Volume (mm3)"
    ]
    for drug in promising_drugs
}

# Flag potential outliers per regimen: values lying more than 1.5 * IQR
# below the first quartile or above the third quartile
outliers = {}
for drug, volumes in tumor_volumes_by_drug.items():
    q1 = volumes.quantile(0.25)
    q3 = volumes.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    too_low = volumes < lower_bound
    too_high = volumes > upper_bound
    outliers[drug] = volumes[too_low | too_high]

# Display the outliers
outliers


In [None]:
# Box plot of the final tumor volume for each regimen in promising_drugs
plt.figure(figsize=(10,6))

# One box per regimen, in promising_drugs order; fliers (points beyond the
# whiskers) are drawn as large red circles so potential outliers stand out
plt.boxplot(
    [tumor_volumes_by_drug[drug] for drug in promising_drugs],
    flierprops={'marker': 'o', 'markerfacecolor': 'red', 'markersize': 12, 'linestyle': 'none'},
)

# Name the boxes through the x ticks instead of boxplot's `labels` keyword:
# that keyword is deprecated since Matplotlib 3.9 (renamed `tick_labels`),
# whereas xticks works on old and new versions alike. Boxes are placed at
# x = 1..N by default.
plt.xticks(list(range(1, len(promising_drugs) + 1)), promising_drugs)

# Add title and labels
plt.title('Final Tumor Volume Distribution for Promising Drug Regimens')
plt.ylabel('Final Tumor Volume (mm3)')
plt.xlabel('Drug Regimen')

plt.show()


In [None]:
# Mouse whose tumor volume is tracked over time; defined once so the data
# filter and the plot title cannot drift apart
selected_mouse_id = "b742"

# All Capomulin records, reused by both plots in this cell
capomulin_data = cleaned_data[cleaned_data["Drug Regimen"] == "Capomulin"]

# Records of the selected mouse, ordered by timepoint so the line is drawn
# chronologically regardless of the row order in cleaned_data
capomulin_mouse_data = capomulin_data[capomulin_data["Mouse ID"] == selected_mouse_id].sort_values("Timepoint")
if capomulin_mouse_data.empty:
    # Fail loudly (same exception type as a missing groupby group) rather than
    # silently drawing an empty chart
    raise KeyError(f"Mouse {selected_mouse_id!r} has no Capomulin records")

# Create a line plot of tumor volume vs timepoint for this mouse
plt.figure(figsize=(8,6))
plt.plot(capomulin_mouse_data["Timepoint"], capomulin_mouse_data["Tumor Volume (mm3)"], marker='o')
plt.title(f"Tumor Volume vs. Time for Mouse {selected_mouse_id} (Capomulin)")
plt.xlabel("Timepoint (Days)")
plt.ylabel("Tumor Volume (mm3)")
plt.grid(True)
plt.show()

# Per-mouse averages for the Capomulin regimen: mean tumor volume and mean
# of the recorded weights.
# NOTE(review): weight is presumably constant per mouse, making its mean just
# that weight — confirm against the metadata.
capomulin_grouped = capomulin_data.groupby("Mouse ID").agg(
    avg_tumor_vol=("Tumor Volume (mm3)", "mean"),
    weight=("Weight (g)", "mean")
)

# Scatter plot of weight vs average tumor volume
plt.figure(figsize=(8,6))
plt.scatter(capomulin_grouped["weight"], capomulin_grouped["avg_tumor_vol"], marker='o')
plt.title("Mouse Weight vs. Average Tumor Volume (Capomulin)")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.grid(True)
plt.show()


In [None]:
# Correlation between mouse weight and average tumor volume for the Capomulin
# mice (Series.corr computes the Pearson coefficient by default)
correlation = capomulin_grouped["weight"].corr(capomulin_grouped["avg_tumor_vol"])

# Least-squares fit of average tumor volume (y) against weight (x)
slope, intercept, r_value, p_value, std_err = linregress(capomulin_grouped["weight"], capomulin_grouped["avg_tumor_vol"])

# Create a scatter plot again with the regression line
plt.figure(figsize=(8,6))
plt.scatter(capomulin_grouped["weight"], capomulin_grouped["avg_tumor_vol"], marker='o')

# Fitted tumor volume at each observed weight, drawn as the regression line
regression_values = slope * capomulin_grouped["weight"] + intercept
plt.plot(capomulin_grouped["weight"], regression_values, color="red")

# Add title and labels
plt.title(f"Mouse Weight vs. Average Tumor Volume (Capomulin)\nCorrelation: {correlation:.2f}")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.grid(True)
plt.show()

# Display the correlation and p-value
correlation, p_value
