https://youtu.be/-8s9KuNo5SA

https://www.kaggle.com/code/robikscube/cross-validation-visualized-youtube-tutorial/notebook

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import (
    train_test_split,
    TimeSeriesSplit,
    KFold,
    StratifiedGroupKFold,
    GroupKFold,
    StratifiedKFold
)

In [5]:
def get_prep_data():
    data = pd.read_csv(
        "healthcare-dataset-stroke-data.csv"
    )
    data["ever_married"] = (
        data["ever_married"].replace("Yes", True).replace("No", False)
    )
    data["gender"] = data["gender"].astype("category")
    data["smoking_status"] = data["smoking_status"].astype("category")
    data["Residence_type"] = data["Residence_type"].astype("category")
    data["work_type"] = data["work_type"].astype("category")
    data["doctor"] = np.random.randint(0, 8, size=len(data))
    holdout_ids = data.sample(n=500, random_state=529).index

    train = (
        data.loc[~data.index.isin(holdout_ids)]
        .sample(frac=1, random_state=529)
        .sort_values("doctor")
        .reset_index(drop=True)
    )
    holdout = (
        data.loc[data.index.isin(holdout_ids)]
        .sample(frac=1, random_state=529)
        .sort_values("doctor")
        .reset_index(drop=True)
    )

    return train, holdout


train, holdout = get_prep_data()

In [6]:
train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,doctor
0,37320,Female,77.0,0,0,True,Private,Rural,80.85,19.4,Unknown,0,0
1,59729,Male,53.0,0,0,True,Private,Urban,211.03,34.2,formerly smoked,0,0
2,42041,Female,38.0,0,0,True,Private,Rural,217.55,,smokes,0,0
3,54805,Female,27.0,0,0,False,Self-employed,Urban,73.65,24.8,Unknown,0,0
4,59054,Male,17.0,0,0,False,Private,Rural,77.79,23.6,Unknown,0,0


In [8]:
train.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke', 'doctor'],
      dtype='object')

In [9]:
def get_X_y(train):
    FEATURES = [
        "gender",
        "age",
        "hypertension",
        "heart_disease",
        "ever_married",
        "work_type",
        "Residence_type",
        "avg_glucose_level",
        "bmi",
        "smoking_status",
    ]

    GROUPS = "doctor"

    TARGET = "stroke"

    X = train[FEATURES]
    y = train[TARGET]
    groups = train[GROUPS]
    return X, y, groups

In [23]:
X, y, groups = get_X_y(train)
clf = lgb.LGBMClassifier(n_estimators=100)
clf.fit(X,y)

pred = clf.predict(X)
pred_proba = clf.predict_proba(X)[:, 1]

In [24]:
pred_proba

array([4.52795827e-02, 1.72018573e-03, 1.19125736e-02, ...,
       1.73443478e-02, 4.96537653e-02, 4.09043401e-05])

In [25]:
acc_score = accuracy_score(y, pred)
auc_score = roc_auc_score(y, pred_proba)
print(f'{acc_score}  {auc_score}')

0.9911062906724512  0.9997403662063107


In [26]:
x_holdout, y_holdout, group_holdout = get_X_y(holdout)

pred = clf.predict(x_holdout)
pred_proba = clf.predict_proba(x_holdout)[:, 1]
acc_score = accuracy_score(y_holdout, pred)
auc_score = roc_auc_score(y_holdout, pred_proba)
print(f'{acc_score}  {auc_score}')

0.938  0.7801850048685491


Validation

In [28]:
X, y, groups = get_X_y(train)

x_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.1)
clf = lgb.LGBMClassifier(n_estimators=100, max_depth=3)
clf.fit(X,y)
pred = clf.predict(X_val)
pred_proba = clf.predict_proba(X_val)[:, 1]
acc_score = accuracy_score(y_val, pred)
auc_score = roc_auc_score(y_val, pred_proba)
print(f'{acc_score}  {auc_score}')

0.9479392624728851  0.9595728451563692


Cross Validation

In [29]:
sgk = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=529)
X, y, groups = get_X_y(train)

for train_idx, val_idx in sgk.split(X, y, groups):
    X_tr = X.loc[train_idx]
    y_tr = y.loc[train_idx]

    X_val = X.loc[val_idx]
    y_val = y.loc[val_idx]

    clf = lgb.LGBMClassifier(n_estimators=100)
    clf.fit(X,y)
    pred = clf.predict(X_val)
    pred_proba = clf.predict_proba(X_val)[:, 1]
    acc_score = accuracy_score(y_val, pred)
    auc_score = roc_auc_score(y_val, pred_proba)
    print(f'{acc_score}  {auc_score}')


0.9885764499121266  0.9999819615058535
0.9914383561643836  0.9991815476190476
0.990990990990991  1.0
0.9922945205479452  0.999477252307441
0.992274678111588  0.9996648613866694
