In [178]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset

merged_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")
merged_df = pd.DataFrame(merged_df)
# Display the data table for preview

merged_df.head()


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [179]:
# Checking the number of mice.

number_of_mice = merged_df["Mouse ID"].nunique()
print(number_of_mice)



249


In [180]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

duplicate_mouse = merged_df[merged_df.duplicated()]

duplicate_mouse = pd.DataFrame(duplicate_mouse["Mouse ID"])

duplicate_mouse

Unnamed: 0,Mouse ID
909,g989


In [181]:
#checking to see how many instances of dupes there are = 5

dupe_mouse_df = merged_df.set_index("Mouse ID")

dupe_mouse_df.loc["g989", "Timepoint"]

Mouse ID
g989     0
g989     0
g989     5
g989     5
g989    10
g989    10
g989    15
g989    15
g989    20
g989    20
g989    25
g989    30
g989    35
Name: Timepoint, dtype: int64

In [182]:
# Optional: Get all the data for the duplicate mouse ID. 

dupe_mouse_df.loc["g989"]

Unnamed: 0_level_0,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
g989,Propriva,Female,21,26,0,45.0,0
g989,Propriva,Female,21,26,0,45.0,0
g989,Propriva,Female,21,26,5,48.786801,0
g989,Propriva,Female,21,26,5,47.570392,0
g989,Propriva,Female,21,26,10,51.745156,0
g989,Propriva,Female,21,26,10,49.880528,0
g989,Propriva,Female,21,26,15,51.325852,1
g989,Propriva,Female,21,26,15,53.44202,0
g989,Propriva,Female,21,26,20,55.326122,1
g989,Propriva,Female,21,26,20,54.65765,1


In [185]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Can see the count has now dropped from 1893 to 1888 (5 values dropped)

merged_df_duplicates = merged_df.drop_duplicates(subset=['Mouse ID', 'Timepoint'])

clean_merged_df = pd.DataFrame(merged_df_duplicates)

clean_merged_df.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,1888.0,1888.0,1888.0,1888.0,1888.0
mean,12.792903,25.661547,19.597458,50.449276,1.023835
std,7.186737,3.926776,14.084762,8.904565,1.138507
min,1.0,15.0,0.0,22.050126,0.0
25%,7.0,25.0,5.0,45.0,0.0
50%,13.0,27.0,20.0,48.951421,1.0
75%,20.0,29.0,30.0,56.324075,2.0
max,24.0,30.0,45.0,78.567014,4.0


In [187]:
# Checking the number of mice in the clean DataFrame.

number_of_mice = clean_merged_df["Mouse ID"].nunique()
print(number_of_mice)

249
