# myopia_lgb.ipynb

In [1]:
# Initial imports

# ignore metric did not predict warning
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display, HTML

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mean, mode
from pathlib import Path
from imblearn.pipeline import Pipeline
# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold, GridSearchCV, cross_val_score, cross_validate, validation_curve
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE  
from binary_classifier import model_compare

In [18]:
df = pd.read_csv('../eda/reduced_filtered_df.csv')

In [19]:
df.head(2)

Unnamed: 0,ACD,LT,VCD,SPORTHR,DADMY,delta_spheq,total_positive_screen,MYOPIC
0,3.702,3.392,15.29,4,1,1.358,8,0
1,3.462,3.514,15.52,14,0,1.929,10,0


In [20]:
X=df.drop('MYOPIC', axis=1)
y=df['MYOPIC']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=200, stratify=y)

In [22]:
over = SMOTE()
X_train, y_train = over.fit_resample(X_train, y_train)
print(len(X))

372


## Test the XGBoost with Params
-https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/

In [6]:
# # instll LGBM if not already in environment
#!pip install lightgbm

In [23]:
import lightgbm as lgb


In [24]:
params = {'n_estimators': 5,
 'max_leaves': 4,
 'min_child_weight': 3.5806456485030385,
 'learning_rate': 0.4359086627863047,
 'subsample': 1.0,
 'colsample_bylevel': 0.9122163035553499,
 'colsample_bytree': 0.9599028390069959,
 'reg_alpha': 0.11831815110092216,
 'reg_lambda': 0.07223617575057466,
 'objective': 'binary'}

In [25]:
train_data = lgb.Dataset(X_train, label=y_train)
validation_data = lgb.Dataset(X_test, label=y_test)

In [26]:
params['metric'] = ['auc', 'binary_logloss']

In [27]:
num_round = 10
score = lgb.cv(params, train_data, num_round, nfold=5)

In [28]:
np.mean(score['auc-mean'])

0.8903687875150059

In [29]:
num_round = 10
bst = lgb.train(params, train_data, num_round, valid_sets=[validation_data])

[1]	valid_0's auc: 0.751094	valid_0's binary_logloss: 0.534235
[2]	valid_0's auc: 0.793114	valid_0's binary_logloss: 0.493769
[3]	valid_0's auc: 0.791946	valid_0's binary_logloss: 0.471761
[4]	valid_0's auc: 0.826233	valid_0's binary_logloss: 0.446513
[5]	valid_0's auc: 0.82419	valid_0's binary_logloss: 0.447176


In [30]:
ypred = bst.predict(X_test)

In [31]:
yhat = [ 1 if i >0.5 else 0 for i in ypred]

In [32]:
from sklearn.metrics import classification_report

print(classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.94      0.79      0.86       149
           1       0.33      0.70      0.45        23

    accuracy                           0.77       172
   macro avg       0.64      0.74      0.65       172
weighted avg       0.86      0.77      0.80       172



## LGB using Scikit-Learn Wrapper

In [33]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

In [34]:
params = {'n_estimators': 5,
 'max_leaves': 4,
 'min_child_weight': 3.5806456485030385,
 'learning_rate': 0.4359086627863047,
 'subsample': 1.0,
 'colsample_bylevel': 0.9122163035553499,
 'colsample_bytree': 0.9599028390069959,
 'reg_alpha': 0.11831815110092216,
 'reg_lambda': 0.07223617575057466
 }

In [35]:
model = LGBMClassifier()
model.set_params(**params)

In [36]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [37]:
n_scores = cross_validate(model, X_train, y_train, scoring=['precision', 'roc_auc'], cv=cv, n_jobs=-1, error_score='raise')

In [38]:
# np.mean(n_scores)
print(f"Precision mean score: {np.mean(n_scores['test_precision'])}")
print(f"ROC AUC mean score: {np.mean(n_scores['test_roc_auc'])}")

Precision mean score: 0.8311319666322023
ROC AUC mean score: 0.9293765218505701


In [53]:
model.fit(X_train, y_train)
yhat = model.predict(X_test)

In [54]:
from sklearn.metrics import classification_report

print(classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93        81
           1       0.55      0.50      0.52        12

    accuracy                           0.88        93
   macro avg       0.74      0.72      0.73        93
weighted avg       0.88      0.88      0.88        93



## LGB with Pipes

In [55]:
labels=['not_myopic', 'myopic']

model = LGBMClassifier()
model.set_params(**params)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# create process steps
pipes = [
    ("scaler", StandardScaler()),
    ("lgb", model)
]

summary_dict = model_compare('SVC Unbalanced Data', X_train, y_train, X_test, y_test, labels, pipes, cv);

Unnamed: 0,scenario,type,pipe1,pipe2,pipe3,pipe4,pipe5,cv,f1-score,recall,precision,ROC_AUC,Precision_Recall_AUC,accuracy
0,SVC Unbalanced Data,cross_validation,StandardScaler(),LGBMClassifier(colsample_bylevel=0.91221630355...,,,,"RepeatedStratifiedKFold(n_repeats=3, n_splits=...",0.393704,0.347222,0.502222,0.83734,0.43,0.88157


In [56]:
pd.DataFrame(summary_dict)

Unnamed: 0,scenario,type,pipe1,pipe2,pipe3,pipe4,pipe5,precision,recall,f1-score,number_samples,accuracy,cv,ROC_AUC,Precision_Recall_AUC
0,SVC Unbalanced Data,threshold_train_0,StandardScaler(),LGBMClassifier(colsample_bylevel=0.91221630355...,,,,0.924901,0.966942,0.945455,242.0,0.903226,,,
1,SVC Unbalanced Data,threshold_train_1,StandardScaler(),LGBMClassifier(colsample_bylevel=0.91221630355...,,,,0.692308,0.486486,0.571429,37.0,0.903226,,,
2,SVC Unbalanced Data,threshold_test_0,StandardScaler(),LGBMClassifier(colsample_bylevel=0.91221630355...,,,,0.926829,0.938272,0.932515,81.0,0.88172,,,
3,SVC Unbalanced Data,threshold_test_1,StandardScaler(),LGBMClassifier(colsample_bylevel=0.91221630355...,,,,0.545455,0.5,0.521739,12.0,0.88172,,,
4,SVC Unbalanced Data,cross_validation,StandardScaler(),LGBMClassifier(colsample_bylevel=0.91221630355...,,,,0.502222,0.347222,0.393704,,0.88157,"RepeatedStratifiedKFold(n_repeats=3, n_splits=...",0.83734,0.43
