In [30]:
import logging
from typing import Dict, Tuple

import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import RandomizedSearchCV



def split_data(data: pd.DataFrame) -> Tuple:
    """
    Splits data into train and test splits.
    Args:
        data: DataFrame containing the features and target to split
        parameters: Dictionary of parameters to split the data using sklearn.train_test_split
    Returns:
        Tuple containing the splited data.
    """

    X = data.drop('stroke', axis=1)
    y = data.stroke

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size = 0.3,
        random_state = 42,
        stratify=y
    )

    return X_train, X_test, y_train, y_test





def train_model(X_train: pd.DataFrame, y_train: pd.Series) -> RandomizedSearchCV:
    """
    Use Randomized Search to tune the Gradient Boost hyperparameters and train model

    Args:
        X_train, y_train : training data
    Return:
        RandomizedSearchCV with the model parameters.
    """


    #oversample data
    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    params = {
    'learning_rate': [0.05,0.01,0.0001],
    'num_leaves': [90,140,200],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'max_depth' : [3,4,5,6,7,8],
    'random_state' : [42], 
    'colsample_bytree' : [0.5,0.6,0.7,0.8,1.0],
    'subsample' : [0.5,0.6,0.7,0.8,1.0],
    'min_split_gain' : [0.01],
    'min_data_in_leaf':[10],
    'metric':['auc']
    }
    clf = lgb.LGBMClassifier()
    RSCV = RandomizedSearchCV(clf,params,verbose=3,cv=10,n_jobs = -1,n_iter=10)
    RSCV.fit(X_train_res,y_train_res)
    return RSCV



def evaluate_model(classifier: RandomizedSearchCV, X_test: pd.DataFrame, y_test: pd.Series) -> None:
    """
    Calculates and logs the f1 score and recall.
    Args:
        classifier: Trained LGBMClassifier
        X_test, y_test: Test_data for evaluation
    """

    y_pred = classifier.predict(X_test)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print ('f1_score: ', f1)

In [31]:
df = pd.read_csv('../data/03_primary/preprocessed_stroke_data.csv')
X = df.drop('stroke', axis=1)
y = df.stroke

In [32]:
from imblearn.over_sampling import SMOTE

sm = SMOTE()
X_res, y_res = sm.fit_resample(X, y)

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.2, random_state = 42)

In [35]:
params = {
        'learning_rate': [0.05, 0.01, 0.0001],
        'n_estimators': [90, 140, 200],
        'min_samples_split': [1, 2, 3, 4],
        'max_depth' : [1, 2, 3, 4],
        'warm_start': [False, True]
    }

clf = GradientBoostingClassifier()
RSVC = RandomizedSearchCV(
        clf,
        params,
        verbose=3,
        cv=10,
        n_jobs=-1,
        n_iter=10,
        scoring = 'recall'
)

RSVC.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


 0.86070266 0.87123987 0.87123987 0.86789994]


RandomizedSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
                   param_distributions={'learning_rate': [0.05, 0.01, 0.0001],
                                        'max_depth': [1, 2, 3, 4],
                                        'min_samples_split': [1, 2, 3, 4],
                                        'n_estimators': [90, 140, 200],
                                        'warm_start': [False, True]},
                   scoring='recall', verbose=3)

In [36]:
y_pred=rsvc.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.91      0.95      0.93       975
           1       0.94      0.90      0.92       970

    accuracy                           0.93      1945
   macro avg       0.93      0.93      0.93      1945
weighted avg       0.93      0.93      0.93      1945

