# myopia_xgboost.ipynb

In [1]:
# Initial imports

# Silence only scikit-learn's UndefinedMetricWarning (emitted when a metric
# such as precision is ill-defined because a class received no predictions).
# A blanket "ignore" would also hide unrelated warnings from other libraries,
# so every other warning category is left visible.
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
from IPython.display import display, HTML

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mean, mode
from pathlib import Path
from imblearn.pipeline import Pipeline
# from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, StratifiedKFold, GridSearchCV, cross_val_score, cross_validate, validation_curve
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE  
from binary_classifier import model_compare

In [3]:
# Load the reduced/filtered feature table. The path is relative to the
# notebook's working directory (presumably written by the EDA step — confirm).
data_path = Path('../eda/reduced_filtered_df.csv')
df = pd.read_csv(data_path)

In [4]:
# Preview the first two rows to sanity-check the loaded columns.
df.head(n=2)

Unnamed: 0,ACD,LT,VCD,SPORTHR,DADMY,delta_spheq,total_positive_screen,MYOPIC
0,3.702,3.392,15.29,4,1,1.358,8,0
1,3.462,3.514,15.52,14,0,1.929,10,0


In [5]:
# Separate the target column from the feature matrix; `df` itself is left
# unmodified (drop returns a new frame).
target_col = 'MYOPIC'
y = df[target_col]
X = df.drop(columns=[target_col])

In [6]:
# Hold out a test set using scikit-learn's default split size, stratified on
# the target so both partitions keep the same class ratio.
# random_state is pinned so that Restart & Run All reproduces the same split
# (and therefore the same metrics) instead of a fresh random one each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

## Test XGBoost with fixed hyperparameters

In [7]:
from xgboost import XGBClassifier


In [8]:
# Hard-coded XGBoost hyperparameters passed to the classifier below.
# NOTE(review): the provenance of these values (e.g. an earlier tuning run)
# is not recorded in this notebook — confirm and document the source.
params = dict(
    n_estimators=7,
    max_leaves=6,
    min_child_weight=5.8614537300277965,
    learning_rate=0.11677765280016518,
    subsample=0.8895588746662894,
    colsample_bylevel=0.7905358317292889,
    colsample_bytree=0.9544060556215052,
    reg_alpha=0.0015245843735931766,
    reg_lambda=0.5536296597037936,
)

In [9]:
# Build the classifier directly from the hyperparameter dict; the trailing
# bare expression keeps the estimator repr as the cell's output.
clf = XGBClassifier(**params)
clf


In [11]:
# Pipeline steps as (name, estimator) pairs: standardise the features, then
# fit the XGBoost classifier configured above.
pipes = [("scaler", StandardScaler()), ("xgboost", clf)]

# Display names for the target classes
# (presumably ordered as class 0, class 1 — confirm against model_compare).
labels = ['not_myopic', 'myopic']

# Stratified 5-fold cross-validation (default settings, i.e. no shuffling).
cv = StratifiedKFold(n_splits=5)

# model_compare comes from the project-local binary_classifier module and is
# not visible here; its return value is kept for tabulation in the next cell.
summary_dict = model_compare(
    'XGBoost', X_train, y_train, X_test, y_test, labels, pipes, cv
)

Unnamed: 0,scenario,type,pipe1,pipe2,pipe3,pipe4,pipe5,cv,f1-score,recall,precision,ROC_AUC,Precision_Recall_AUC,accuracy
0,XGBoost,cross_validation,StandardScaler(),"XGBClassifier(base_score=None, booster=None,\n...",,,,"StratifiedKFold(n_splits=5, random_state=None,...",0.0,0.0,0.0,0.836681,0.35,0.867403


In [12]:
# Tabulate the per-class train/test rows and the cross-validation row
# returned by model_compare.
# NOTE(review): in the saved output the class-1 (myopic) rows have precision,
# recall and f1-score of 0.0 while class 0 has recall 1.0, i.e. the model
# predicted "not myopic" for every sample — the accuracy figure only reflects
# the class imbalance.
pd.DataFrame(data=summary_dict)

Unnamed: 0,scenario,type,pipe1,pipe2,pipe3,pipe4,pipe5,precision,recall,f1-score,number_samples,accuracy,cv,ROC_AUC,Precision_Recall_AUC
0,XGBoost,threshold_train_0,StandardScaler(),"XGBClassifier(base_score=None, booster=None,\n...",,,,0.867384,1.0,0.928983,242.0,0.867384,,,
1,XGBoost,threshold_train_1,StandardScaler(),"XGBClassifier(base_score=None, booster=None,\n...",,,,0.0,0.0,0.0,37.0,0.867384,,,
2,XGBoost,threshold_test_0,StandardScaler(),"XGBClassifier(base_score=None, booster=None,\n...",,,,0.870968,1.0,0.931034,81.0,0.870968,,,
3,XGBoost,threshold_test_1,StandardScaler(),"XGBClassifier(base_score=None, booster=None,\n...",,,,0.0,0.0,0.0,12.0,0.870968,,,
4,XGBoost,cross_validation,StandardScaler(),"XGBClassifier(base_score=None, booster=None,\n...",,,,0.0,0.0,0.0,,0.867403,"StratifiedKFold(n_splits=5, random_state=None,...",0.836681,0.35
