In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load Dataset
We load the stroke dataset and, for the sake of simplicity, drop every row where BMI is missing or smoking status is "Unknown". The `id` column is also removed.

In [None]:
# Read the CSV, keep only rows that have a BMI value and a smoking status
# other than "Unknown", then discard the `id` column.
stroke_df = (
    pd.read_csv("../data/healthcare-dataset-stroke-data.csv")
    .loc[lambda frame: frame["bmi"].notna() & (frame["smoking_status"] != "Unknown")]
    .drop(columns=["id"])
)
stroke_df.head()

In [None]:
# Columns inspected by their set of distinct values.
discrete_columns = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
    "hypertension",
    "heart_disease",
]
# Columns inspected by their observed range.
continuous_columns = ["age", "avg_glucose_level", "bmi"]

for column in discrete_columns:
    distinct_values = stroke_df[column].unique()
    print(f"{column}: {distinct_values}")

for column in continuous_columns:
    series = stroke_df[column]
    print(f"{column}: min {series.min()}, max {series.max()}")

# Split into Training and Test Sets

In [None]:
# Target is the `stroke` column; every remaining column is a predictor.
y = stroke_df["stroke"]
X = stroke_df.drop(columns=["stroke"])

# Hold out 20% of the rows for testing. `stratify=y` asks train_test_split to
# keep the target's class proportions the same in both partitions, and the
# fixed random_state makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Logistic Regression

In [None]:
# Column routing for the two preprocessing branches. `hypertension` and
# `heart_disease` are listed with the numeric columns, so they are
# standardized rather than one-hot encoded.
categorical_features = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
numeric_features = ["age", "avg_glucose_level", "bmi", "hypertension", "heart_disease"]

# Numeric branch: standardize each column.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical branch: one-hot encode; handle_unknown="ignore" means a category
# not seen during fit does not raise at transform time.
categorical_transformer = Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Apply each branch to its own column list and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# End-to-end model: preprocessing followed by logistic regression.
log_reg_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42)),
])
log_reg_pipe.fit(X_train, y_train)

In [None]:
# Hard class predictions are scored with accuracy; the probability in column 1
# of predict_proba (presumably the stroke == 1 class — confirm via
# log_reg_pipe.classes_) is scored with AUROC.
y_pred = log_reg_pipe.predict(X_test)
y_proba = log_reg_pipe.predict_proba(X_test)[:, 1]

# (label, metric function, model output) — each metric is computed and printed in turn.
metric_table = (
    ("Accuracy:", accuracy_score, y_pred),
    ("AUROC:", roc_auc_score, y_proba),
)
for label, metric_fn, model_output in metric_table:
    print(label, metric_fn(y_test, model_output))

# Random Forest

In [None]:
# Random-forest pipeline using the same preprocessing recipe as the logistic
# regression model.
#
# The preprocessor is cloned rather than passed directly: Pipeline.fit fits its
# steps in place, so sharing the very same ColumnTransformer object with
# log_reg_pipe would mean that fitting rf_pipe silently refits the transformer
# inside log_reg_pipe as well. clone() yields an unfitted copy with identical
# parameters, keeping the two pipelines independent.
rf_pipe = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", RandomForestClassifier(
        random_state=42,  # repeatable forest construction
        n_jobs=-1         # use all available cores
    ))
])

rf_pipe.fit(X_train, y_train)

In [None]:
# Score the random forest on the held-out set. Note that y_pred / y_proba are
# rebound here and now hold the random-forest outputs.
y_pred = rf_pipe.predict(X_test)
y_proba = rf_pipe.predict_proba(X_test)[:, 1]  # column 1: presumably the stroke == 1 class — confirm

# (label, metric function, model output) — each metric is computed and printed in turn.
metric_table = (
    ("Accuracy:", accuracy_score, y_pred),
    ("AUROC:", roc_auc_score, y_proba),
)
for label, metric_fn, model_output in metric_table:
    print(label, metric_fn(y_test, model_output))

# Save Model

In [None]:
# Persist the fitted logistic-regression pipeline (preprocessing + classifier).
# The target directory is created first so the cell also works when
# ../models does not exist yet; joblib.dump does not create it.
model_path = "../models/log_reg_model.joblib"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(log_reg_pipe, model_path)