In [3]:
# --- Split Dataset for Modeling ---

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# 1. Read the cleaned Framingham CSV into a DataFrame and report its size
cleaned_csv_path = "../data/processed/cleaned_framingham.csv"
print("Loading cleaned dataset...")
df = pd.read_csv(cleaned_csv_path)
print(f"Dataset shape: {df.shape}\n")

# 2. Pull out the target column; every remaining column is used as a feature
target_column = "TenYearCHD"
y = df[target_column]
X = df.drop(columns=[target_column])

# 3. Stratified 80%/20% train/test partition (no separate validation split is
#    made here). Stratifying on y keeps the class proportions of the target
#    similar in both parts; the fixed random_state makes the split repeatable.
print("Splitting into train/test...")
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
split_parts = train_test_split(X, y, **split_options)
X_train, X_test, y_train, y_test = split_parts

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}\n")

# 4. Columns treated as continuous; only these are standardized below,
#    all other columns are carried over unchanged.
continuous_features = [
    'age', 'cigsPerDay', 'totChol', 'sysBP',
    'diaBP', 'BMI', 'heartRate', 'glucose',
]

# 5. Standardize the continuous columns. The scaler's statistics are learned
#    from the training rows only and then applied to both splits, so nothing
#    from the test set influences the fitted scaler.
print("Scaling continuous features...")
scaler = StandardScaler()
scaler.fit(X_train[continuous_features])

# Work on copies so the unscaled X_train / X_test stay available.
X_train_scaled = X_train.copy()
X_train_scaled[continuous_features] = scaler.transform(X_train[continuous_features])

X_test_scaled = X_test.copy()
X_test_scaled[continuous_features] = scaler.transform(X_test[continuous_features])

print("Scaling complete.\n")

# 6. Make sure the output directory is there (no error if it already exists)
os.makedirs("../data/splits", exist_ok=True)

# 7. Write each split to its own .npy file. Only the underlying NumPy values
#    are stored, so column names and row index are not part of the files.
print("Saving split datasets...")
split_outputs = (
    ("../data/splits/X_train.npy", X_train_scaled),
    ("../data/splits/y_train.npy", y_train),
    ("../data/splits/X_test.npy", X_test_scaled),
    ("../data/splits/y_test.npy", y_test),
)
for npy_path, split_data in split_outputs:
    np.save(npy_path, split_data.values)

print("All splits saved successfully!")


Loading cleaned dataset...
Dataset shape: (4240, 16)

Splitting into train/test...
Train shape: (3392, 15)
Test shape: (848, 15)

Scaling continuous features...
Scaling complete.

Saving split datasets...
All splits saved successfully!
