In [4]:
# --- Split Dataset for Modeling (Updated) ---

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import os

# 1. Read the cleaned CSV into a DataFrame and report its dimensions
print("Loading cleaned dataset...")
cleaned_csv_path = "../../data/processed/cleaned_framingham.csv"
df = pd.read_csv(cleaned_csv_path)
print(f"Dataset shape: {df.shape}\n")

# 2. Pull the label column out as y; every remaining column is a feature in X
target_column = "TenYearCHD"
y = df[target_column]
X = df.drop(columns=[target_column])

# 3. Stratified split into Train and Test (80%/20%)
# stratify=y keeps the class proportions of the target the same in both
# splits; random_state pins the shuffle so the split is reproducible.
print("Splitting into train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Depending on the pandas / scikit-learn versions in use, the frames returned
# above may still be flagged as derived from X, in which case the in-place
# column assignments performed by the later imputation and scaling steps can
# emit SettingWithCopyWarning. Taking explicit copies makes both splits
# independent objects that are safe to modify; the values are unchanged.
X_train = X_train.copy()
X_test = X_test.copy()

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}\n")

# 4. Column groups: each group gets its own imputation strategy below
continuous_features = [
    'age', 'cigsPerDay', 'totChol', 'sysBP',
    'diaBP', 'BMI', 'heartRate', 'glucose',
]
categorical_features = [
    'male', 'education', 'currentSmoker', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes',
]

# 5. Impute missing values. Each imputer learns its fill values from the
#    training split only and is then applied to both splits, so no statistic
#    computed from the test rows is used.
print("Imputing missing values...")

# Continuous columns: fill gaps with the per-column median of the training data.
median_imputer = SimpleImputer(strategy='median')
median_imputer.fit(X_train[continuous_features])
for split_frame in (X_train, X_test):
    split_frame[continuous_features] = median_imputer.transform(
        split_frame[continuous_features]
    )

# Categorical columns: fill gaps with the most frequent training value.
mode_imputer = SimpleImputer(strategy='most_frequent')
mode_imputer.fit(X_train[categorical_features])
for split_frame in (X_train, X_test):
    split_frame[categorical_features] = mode_imputer.transform(
        split_frame[categorical_features]
    )

# 6. Standardize the continuous columns. The scaler is fitted on the training
#    split only and then applied unchanged to both splits.
print("Scaling continuous features...")
scaler = StandardScaler()
scaler.fit(X_train[continuous_features])
for split_frame in (X_train, X_test):
    split_frame[continuous_features] = scaler.transform(
        split_frame[continuous_features]
    )

print("Scaling complete.\n")

# 7. Make sure the output directory exists (no error if it already does)
splits_dir = "../../data/splits"
os.makedirs(splits_dir, exist_ok=True)

# 8. Write every split to its own .npy file, named after the variable it holds.
#    Only the raw array values are stored; column labels are not saved.
print("Saving split datasets...")
arrays_to_save = {
    "X_train": X_train.values,
    "y_train": y_train.values,
    "X_test": X_test.values,
    "y_test": y_test.values,
}
for split_name, split_array in arrays_to_save.items():
    np.save(f"{splits_dir}/{split_name}.npy", split_array)

print("All splits saved successfully!")


Loading cleaned dataset...
Dataset shape: (4240, 16)

Splitting into train/test...
Train shape: (3392, 15)
Test shape: (848, 15)

Imputing missing values...
Scaling continuous features...
Scaling complete.

Saving split datasets...
All splits saved successfully!
