In [1]:
import pandas as pd
import numpy as np

In [3]:
# 1: Load the input dataset
# NOTE(review): this step is labelled "raw", but the file read here is
# cleaned_data.csv -- confirm which pipeline stage this file represents.
from pathlib import Path

# Relative location of the input CSV (resolved against the working directory)
data_path = Path('../data/cleaned_data.csv')

# Read the file into the working DataFrame that the following steps operate on
df = pd.read_csv(filepath_or_buffer=data_path)

In [5]:
# 2: Convert date columns to datetime
# Values that cannot be parsed become NaT (errors='coerce'). Because that
# conversion is silent, each column reports how many non-null inputs were lost,
# so parsing problems are visible instead of quietly turning into missing data.
# NOTE(review): recent pandas versions infer a single format per column, so
# genuinely mixed formats may be coerced to NaT -- confirm against the installed
# pandas and consider an explicit `format=` if the reported counts are high.
date_cols = ['cdc_case_earliest_dt', 'cdc_report_dt', 'onset_dt', 'pos_spec_dt']
for col in date_cols:
    parsed = pd.to_datetime(df[col], errors='coerce')
    # Non-null before minus non-null after = values destroyed by coercion
    n_coerced = int(df[col].notna().sum() - parsed.notna().sum())
    if n_coerced:
        print(f"{col}: {n_coerced} non-null value(s) could not be parsed and were set to NaT")
    df[col] = parsed

In [6]:
# 3: Normalize categorical columns
# 'sex' is trimmed and capitalised first; known misspellings are then mapped to
# their canonical label (extend the dict below as new typos are discovered).
sex_typo_fixes = {'Femal': 'Female', 'Mle': 'Male'}
sex_tidy = df['sex'].str.strip().str.capitalize()
df['sex'] = sex_tidy.replace(sex_typo_fixes)

In [7]:
# 4: Clean binary variables: 'Yes'/'No'/'Missing' → 1/0/np.nan
# Any label that is not a key of `mapping` also becomes NaN (Series.map semantics).
binary_cols = ['hosp_yn', 'icu_yn', 'death_yn', 'medcond_yn']
mapping = {'Yes': 1, 'No': 0, 'Missing': np.nan}
for col in binary_cols:
    # Re-running this cell must not destroy data: a numeric column has already
    # been encoded, and mapping it again would turn every 1/0 into NaN.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    # Trim whitespace and unify case so variants such as ' yes' or 'NO' still
    # match the keys of `mapping`.
    labels = df[col].str.strip().str.capitalize()
    df[col] = labels.map(mapping)

In [8]:
# 5: Create feature - days from case to report date (critical metric)
# Subtract the earliest case date from the report date, then keep the day
# component of the resulting timedelta.
report_lag = df['cdc_report_dt'].sub(df['cdc_case_earliest_dt'])
df['days_to_report'] = report_lag.dt.days

In [9]:
# 6: Handle missing values strategically
# Impute gaps in days_to_report with the median of the observed values
days = df['days_to_report']
median_days = days.median()
df['days_to_report'] = days.fillna(median_days)

In [None]:
# 7: Save cleaned, processed data for next stage
processed_path = Path('../data/processed_data.csv')

# Write the DataFrame to CSV, leaving the index out of the file
df.to_csv(path_or_buf=processed_path, index=False)

# Report the absolute location of the file that was just written
saved_to = processed_path.resolve()
print("Data cleaning complete. Processed data saved at:", saved_to)
