In [None]:
# XGBOOST


from xgboost import XGBClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

#from sklearn.linear_model import LogisticRegression
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
import seaborn as sns

import warnings
# Silence every warning category so cell output stays compact.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings(action="ignore")

# Seed passed to both the train/test split and the classifier.
RSEED = 42

In [None]:
# Read the cleaned dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/cleaned_data.csv')

data.head(n=5)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Separate the label column from the predictors.

y = data['state']               # target
X = data.drop(columns='state')  # every remaining column is used as a feature

In [None]:
# Hold out 30% of the rows as a test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RSEED,
    stratify=y,
)

In [None]:
# Build the XGBoost classifier with fixed hyper-parameters and train it
# on the training split.

xgb = XGBClassifier(
    # ensemble size and tree shape
    n_estimators=300,
    max_depth=13,
    learning_rate=0.1,
    # row / column sampling
    subsample=0.5,
    colsample_bytree=0.85,
    # execution settings
    n_jobs=-1,
    random_state=RSEED,
)
xgb.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features with the fitted model.
y_pred = xgb.predict(X_test)

In [None]:
# Model evaluation on the held-out test set.
# The confusion matrix is computed once and reused for the printout and the plot.
cm = confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(cm)

# scikit-learn puts true labels on the rows and predicted labels on the columns;
# label the axes so the heatmap can be read without knowing that convention.
ax = sns.heatmap(cm, annot=True, fmt='g', cmap='Blues')
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix (test set)');

In [None]:
# Work on an independent copy of the test features for error analysis.
# pd.DataFrame(X_test) does not request a copy, so (depending on the pandas
# version) the diagnostic columns added in later cells could end up in X_test
# itself and leak the labels into the feature frame.
X_test_analysis = X_test.copy()

In [None]:
# Display the true labels of the test split.
y_test

In [None]:
# Element-wise flag: True where the prediction equals the true label.
# The comparison is vectorised, so it is evaluated once; the previous
# comprehension rebuilt the same result len(y_test) times (quadratic cost).
# It stays wrapped in a one-element list because the following cell reads corr_pred[0].
corr_pred = [y_test == y_pred]

In [None]:
# Turn the comparison result into a NumPy array.
# The isinstance guard keeps this cell idempotent: on a re-run corr_pred is
# already an array, and indexing it with [0] would collapse it to a single scalar.
corr_pred = np.array(corr_pred[0] if isinstance(corr_pred, list) else corr_pred)
corr_pred

In [None]:
# Sanity check: number of correctness flags (compare with len(y_test)).
len(corr_pred)

In [None]:
# Sanity check: number of test labels (compare with len(corr_pred)).
len(y_test)

In [None]:
# Attach the per-row correctness flag; corr_pred is a NumPy array, so values
# are assigned by row position.
X_test_analysis["corr_pred"] = corr_pred

In [None]:
# Attach the true label of each test row.
X_test_analysis['true_res'] = y_test

In [None]:
# Display the analysis frame with the columns added so far.
X_test_analysis

#### Define an array containing the TP, TN, FP, FN label of each test sample for error analysis

In [None]:
# Label every test sample as TP / TN / FP / FN, treating class 1 as positive.
result_category = np.empty(len(y_test), dtype=object)

mtp = (corr_pred == True)   # prediction matches the true label
mfp = (corr_pred == False)  # prediction differs from the true label
mtt = (y_test == 1)         # true label is the positive class
mft = (y_test == 0)         # true label is the negative class

# A correct prediction carries the sign of the true label; a wrong prediction
# carries the opposite sign. Hence:
#   correct & truly positive -> TP      correct & truly negative -> TN
#   wrong   & truly negative -> FP      wrong   & truly positive -> FN
# (FP and TN were previously swapped.)
mTP = np.logical_and(mtp, mtt)
mTN = np.logical_and(mtp, mft)
mFP = np.logical_and(mfp, mft)
mFN = np.logical_and(mfp, mtt)

result_category[mTP] = 'TP'
result_category[mTN] = 'TN'
result_category[mFP] = 'FP'
result_category[mFN] = 'FN'


In [None]:
# Display the per-sample category labels.
result_category

In [None]:
# Attach the category label of each test row; result_category is a NumPy array,
# so values are assigned by row position.
X_test_analysis['result_category'] = result_category

In [None]:
# Pairwise feature plots, coloured by whether the prediction was correct.
sns.pairplot(data=X_test_analysis, hue='corr_pred')

In [None]:
# Pairwise feature plots, coloured by result category; the helper columns
# 'corr_pred' and 'true_res' are left out of the plotted frame.
sns.pairplot(
    data=X_test_analysis.drop(columns=['corr_pred', 'true_res']),
    hue='result_category',
)