In [6]:
import pandas as pd
from sklearn.compose import make_column_selector as selector
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency


from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler
import statsmodels.api as sm
import pandas as pd


In [7]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df.drop(['id'], axis=1, inplace=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [8]:

# Automatically select columns
numerical_selector = selector(dtype_include=['int64', 'float64'])
categorical_selector = selector(dtype_include=['object', 'category', 'bool'])

# Get initial lists
numerical_cols = numerical_selector(df)
categorical_cols = categorical_selector(df)

# Identify binary numeric columns (with only two unique values)
binary_numerical_cols = [col for col in numerical_cols 
                         if df[col].nunique(dropna=False) == 2]

# Move them from numerical to categorical
numerical_cols = [col for col in numerical_cols if col not in binary_numerical_cols]
categorical_cols = categorical_cols + binary_numerical_cols

# Results
print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

Numerical columns: ['age', 'avg_glucose_level', 'bmi']
Categorical columns: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'hypertension', 'heart_disease', 'stroke']


## Data Splitting

In [9]:
# Assuming df is your DataFrame
target_col = 'stroke'

numerical_cols = ['age', 'avg_glucose_level', 'bmi']
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                    'work_type', 'Residence_type', 'smoking_status']

# Step 1: Split data
X = df.drop(columns=[target_col])
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


## Pipeline

Numeric features (age, avg_glucose_level, bmi) were scaled and transformed; missing bmi values were imputed using iterative median-based imputation.

Binary features (hypertension, heart_disease) were kept as-is (0/1).

Multi-category categorical features (gender, ever_married, work_type, Residence_type, smoking_status) were one-hot encoded with drop='first' to avoid the dummy variable trap.

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
import pandas as pd

# Define columns
num_cols = ['age', 'avg_glucose_level', 'bmi']
binary_cols = ['hypertension', 'heart_disease']  # already 0/1
cat_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Numeric preprocessing
num_preprocessor = ColumnTransformer([
    ("age", StandardScaler(), ["age"]),
    ("glucose", Pipeline([
        ("yj", PowerTransformer(method="yeo-johnson")),
        ("scaler", StandardScaler())
    ]), ["avg_glucose_level"]),
    ("bmi", Pipeline([
        ("imputer", IterativeImputer(random_state=0, initial_strategy="median")),
        ("yj", PowerTransformer(method="yeo-johnson")),
        ("scaler", StandardScaler())
    ]), ["bmi"])
], remainder='passthrough')  # will keep binary columns for now

# Categorical preprocessing (multi-category only)
cat_preprocessor = ColumnTransformer([
    ("ohe", OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), cat_cols)
], remainder='passthrough')  # keep numeric & binary columns

# Combine numeric + categorical into a full preprocessing pipeline
full_preprocessor = ColumnTransformer([
    ("num", num_preprocessor, num_cols),      # numeric preprocessing
    ("cat", cat_preprocessor, cat_cols)       # multi-category encoding
], remainder='passthrough')                  # any remaining columns (binary) passed through

# Fit and transform training data
X_train_processed = full_preprocessor.fit_transform(X_train)

# Transform test data
X_test_processed = full_preprocessor.transform(X_test)

# Convert to DataFrame with proper column names
X_train_processed = pd.DataFrame(
    X_train_processed,
    columns=full_preprocessor.get_feature_names_out(),
    index=X_train.index
)

X_test_processed = pd.DataFrame(
    X_test_processed,
    columns=full_preprocessor.get_feature_names_out(),
    index=X_test.index
)


In [20]:
X_train_processed.head()

Unnamed: 0,num__age__age,num__glucose__avg_glucose_level,num__bmi__bmi,cat__ohe__gender_Male,cat__ohe__gender_Other,cat__ohe__ever_married_Yes,cat__ohe__work_type_Never_worked,cat__ohe__work_type_Private,cat__ohe__work_type_Self-employed,cat__ohe__work_type_children,cat__ohe__Residence_type_Urban,cat__ohe__smoking_status_formerly smoked,cat__ohe__smoking_status_never smoked,cat__ohe__smoking_status_smokes,remainder__hypertension,remainder__heart_disease
2226,0.389044,0.448391,-0.905049,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3927,0.833687,-0.163836,1.014078,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3358,1.67851,0.091026,0.208864,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4152,0.522437,-0.849609,0.021884,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4866,-0.277921,0.337261,0.993039,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Feature Selection

In [21]:


from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Compute Mutual Information on preprocessed data
mi_scores = mutual_info_classif(X_train_processed, y_train, discrete_features='auto', random_state=42)

# Convert to DataFrame for readability
mi_df = pd.DataFrame({
    'Feature': X_train_processed.columns,
    'MI Score': mi_scores
}).sort_values(by='MI Score', ascending=False)

print("Mutual Information Scores:\n", mi_df)


Mutual Information Scores:
                                      Feature  MI Score
0                              num__age__age  0.034544
2                              num__bmi__bmi  0.010906
14                   remainder__hypertension  0.009884
9               cat__ohe__work_type_children  0.008989
1            num__glucose__avg_glucose_level  0.005888
12     cat__ohe__smoking_status_never smoked  0.005842
8          cat__ohe__work_type_Self-employed  0.005718
5                 cat__ohe__ever_married_Yes  0.005249
15                  remainder__heart_disease  0.003416
11  cat__ohe__smoking_status_formerly smoked  0.001055
4                     cat__ohe__gender_Other  0.000000
3                      cat__ohe__gender_Male  0.000000
7                cat__ohe__work_type_Private  0.000000
6           cat__ohe__work_type_Never_worked  0.000000
10            cat__ohe__Residence_type_Urban  0.000000
13           cat__ohe__smoking_status_smokes  0.000000


In [22]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Select top 10 features based on Mutual Information
selector = SelectKBest(score_func=mutual_info_classif, k=10)
X_train_selected = selector.fit_transform(X_train_processed, y_train)
X_test_selected = selector.transform(X_test_processed)

# Get the selected feature names
selected_features = X_train_processed.columns[selector.get_support()]
print("Selected Features:\n", selected_features)


Selected Features:
 Index(['num__age__age', 'num__glucose__avg_glucose_level', 'num__bmi__bmi',
       'cat__ohe__ever_married_Yes', 'cat__ohe__work_type_Self-employed',
       'cat__ohe__work_type_children', 'cat__ohe__Residence_type_Urban',
       'cat__ohe__smoking_status_formerly smoked', 'remainder__hypertension',
       'remainder__heart_disease'],
      dtype='object')


#### observation:

Mutual Information analysis revealed that numeric features (age, bmi, avg_glucose_level) are the most informative, while binary features (hypertension, heart_disease) and a few categorical one-hot features (ever_married_Yes, work_type_Self-employed, work_type_children, Residence_type_Urban, smoking_status_formerly smoked) contribute moderately to predicting the target. Using SelectKBest, the top 10 features were selected, capturing the majority of predictive information and reducing dimensionality, thus providing a compact and effective set of features for model building.

## Handling Class Imbalance with SMOTE

The stroke dataset is imbalanced, so we applied SMOTE on the training data to generate synthetic minority class samples. This balances the classes, helping the model learn patterns for both classes effectively, while keeping the test set untouched for realistic evaluation.

“SMOTE improves the model’s ability to detect stroke cases and reduces bias toward the majority class.”

In [23]:
# 1. Import SMOTE
from imblearn.over_sampling import SMOTE

# 2. Initialize SMOTE
smote = SMOTE(random_state=42)

# 3. Apply SMOTE only on the training set
X_train_bal, y_train_bal = smote.fit_resample(X_train_selected, y_train)

# 4. Optional: check the new class distribution
import pandas as pd
print(pd.Series(y_train_bal).value_counts())


stroke
0    3403
1    3403
Name: count, dtype: int64


## BAseline MOdel

In [24]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score, accuracy_score

# Initialize dummy classifier (stratified)
dummy_clf = DummyClassifier(strategy='stratified', random_state=42)

# Define scoring metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score)
}

# Cross-validation on balanced training set
cv_results = cross_validate(dummy_clf, X_train_bal, y_train_bal, cv=5, scoring=scoring)

# Print average scores
print("CV Accuracy:", cv_results['test_accuracy'].mean())
print("CV F1 Score:", cv_results['test_f1'].mean())
print("CV Precision:", cv_results['test_precision'].mean())
print("CV Recall:", cv_results['test_recall'].mean())


CV Accuracy: 0.5000002157867423
CV F1 Score: 0.49946482735960906
CV Precision: 0.4999874826235847
CV Recall: 0.4989686447266132



<b>Performance Metrics Selection and Baseline</b>:



For stroke prediction, we focus on Recall and F1 Score as primary metrics, with Precision monitored as an additional metric:

Recall: Measures how many actual stroke cases are correctly identified. High recall is critical to avoid missing strokes.

F1 Score: Balances recall and precision, providing an overall measure of model performance.

Precision: Helps monitor false alarms but is secondary to recall.

Baseline (DummyClassifier) Performance:

Accuracy: 0.50

F1 Score: 0.50

Precision: 0.50

Recall: 0.50

“These baseline results represent a naive classifier. The main predictive models are expected to improve upon these metrics, achieving higher recall and F1 Score while maintaining reasonable precision, ensuring reliable and practical stroke prediction.”