In [None]:
import gc
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import seaborn as sns
import xgboost as xgb
# NOTE(review): `sklearn.externals.joblib` is only available in older scikit-learn
# releases; newer environments need the standalone `joblib` package — confirm the
# installed version before upgrading.
from sklearn.externals import joblib 
from sklearn.metrics import roc_auc_score, f1_score, cohen_kappa_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings('ignore')

In [None]:
# Load the dataset and split it by START_DATETIME: rows before SPLIT_DATE form the
# training set, rows on/after it form the held-out test set.
SPLIT_DATE = '2017-01-01'
TARGET_COL = 'LOSS_RATE_IND'

dataset = pd.read_csv("dataset.csv")
# NOTE(review): START_DATETIME is compared exactly as read from the CSV (no date
# parsing here); this presumably relies on ISO-formatted strings — confirm.
train_df = dataset[dataset['START_DATETIME'] < SPLIT_DATE]
test_df = dataset[dataset['START_DATETIME'] >= SPLIT_DATE]
del dataset

# The timestamp is only used for splitting; it is not a model feature.
train_df = train_df.drop('START_DATETIME', axis=1)
test_df = test_df.drop('START_DATETIME', axis=1)

# Targets are read from the split frames themselves.
y_train = train_df[TARGET_COL]
y_test = test_df[TARGET_COL]

# Feature columns: everything except the target and the LOSS_RATE column.
train_cols = [c for c in train_df.columns if c not in ("LOSS_RATE", TARGET_COL)]

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)

# Last expression of the cell: class balance of the training target.
y_train.value_counts()

In [None]:
# Ratio of class-0 to class-1 rows in the training target.
n_class0 = len(y_train[y_train == 0])
n_class1 = len(y_train[y_train == 1])
weight_ratio = float(n_class0) / float(n_class1)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print('weight_ratio: %.6f' % weight_ratio)

# Per-row weights. The array must be float: an integer array would silently
# truncate the fractional part of weight_ratio on assignment.
w_array = np.ones(y_train.shape[0], dtype=float)
w_array[(y_train == 1).values] = weight_ratio
# NOTE(review): whenever class 0 outnumbers class 1, weight_ratio > 1 and this
# weight is negative — verify that 1 - weight_ratio is really the intended value.
w_array[(y_train == 0).values] = 1 - weight_ratio

# Restrict the test frame to the model's feature columns.
test_df = test_df[train_cols]
print(test_df.shape)
print(train_df.shape)

In [None]:
# Stratified K-fold training of XGBoost boosters.
#   dev_result : out-of-fold predictions for every training row
#   pred_te    : test-set predictions averaged over the folds
# NOTE(review): `params` is not defined in any cell visible here; it must already
# exist in the kernel when this cell runs.
dev_result = np.zeros(train_df.shape[0])
pred_te = np.zeros(test_df.shape[0])

feature_importance_df = pd.DataFrame()  # created here but not filled by this cell

# Create the output directory up front so a missing folder cannot fail the dump
# after a fold has already been trained.
os.makedirs('Model', exist_ok=True)

# Loop-invariant inputs: build them once instead of once per fold.
train_features = train_df[train_cols]
dtest = xgb.DMatrix(test_df)

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_df, y_train)):

    trn_x, trn_y = train_features.iloc[trn_idx], y_train.iloc[trn_idx]
    val_x, val_y = train_features.iloc[val_idx], y_train.iloc[val_idx]

    dtrain = xgb.DMatrix(trn_x, trn_y, feature_names=trn_x.columns)
    dval = xgb.DMatrix(val_x, val_y, feature_names=val_x.columns)

    # Up to 5000 rounds; stops once the "Val" metric has not improved for 50 rounds.
    clf = xgb.train(params=params, dtrain=dtrain, num_boost_round=5000, evals=[(dtrain, "Train"), (dval, "Val")],
        verbose_eval= 100, early_stopping_rounds=50)

    # One pickled booster per fold (files are numbered from 1).
    joblib.dump(clf, 'Model/clf_xgb_'+str(n_fold+1)+'.pkl')

    # NOTE(review): depending on the installed xgboost version, Booster.predict may
    # score with all trained trees rather than only those up to the best
    # early-stopping round — confirm and restrict the iteration range if needed.
    dev_result[val_idx] = clf.predict(dval)
    pred_te += clf.predict(dtest) / folds.n_splits

In [None]:
# Cross-validation metrics computed from the out-of-fold predictions in dev_result.

# Hard 0/1 labels obtained by rounding the predicted scores; computed once.
oof_labels = np.round(dev_result)

# AUC of each validation fold, so the reported "std" is the fold-to-fold spread of
# the AUC rather than the spread of the raw predictions. `folds` was created with a
# fixed integer random_state, so splitting again yields the same partitions.
fold_aucs = [
    roc_auc_score(y_train.iloc[val_idx], dev_result[val_idx])
    for _, val_idx in folds.split(train_df, y_train)
]

print('\nCV AUC score %.6f & std %.6f' % (roc_auc_score(y_train, dev_result), np.std(fold_aucs)))
print('CV Precision score %.6f' % (precision_score(y_train, oof_labels)))
print('CV Recall score %.6f' % (recall_score(y_train, oof_labels)))
print('CV F1 score %.6f' % (f1_score(y_train, oof_labels)))
print('CV Kappa score %.6f' % (cohen_kappa_score(y_train, oof_labels)))

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the test-set predictions (pred_te) against the test labels (y_test).
fpr, tpr, thresholds = roc_curve(y_test, pred_te)
roc_auc = auc(fpr, tpr)  # area under the ROC curve

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print('Test AUC %.6f' % roc_auc)

def Find_Optimal_Cutoff(target, predicted):
    """Return the ROC threshold where ``tpr`` is closest to ``1 - fpr``.

    Parameters
    ----------
    target : array-like
        True labels, passed unchanged to ``roc_curve``.
    predicted : array-like
        Predicted scores, passed unchanged to ``roc_curve``.

    Returns
    -------
    list
        One-element list containing the threshold that minimises
        ``|tpr - (1 - fpr)|`` over the points returned by ``roc_curve``.
    """
    fpr, tpr, threshold = roc_curve(target, predicted)
    # tpr - (1 - fpr) crosses zero where the true-positive rate equals the
    # true-negative rate; pick the ROC point closest to that crossing.
    balance_gap = np.abs(np.asarray(tpr) - (1 - np.asarray(fpr)))
    # Positional lookup with plain NumPy; avoids the `.ix` indexer, which pandas
    # 1.0 and later no longer provide.
    best_pos = int(np.argmin(balance_gap))

    return [float(threshold[best_pos])]

# Last expression of the cell, so the returned one-element threshold list is displayed.
# NOTE(review): the cutoff is derived from the test labels themselves — confirm this
# is intended before using it to report test performance.
Find_Optimal_Cutoff(target=y_test, predicted=pred_te)

In [None]:
# Decision threshold applied to the test-set scores.
# NOTE(review): hard-coded value, presumably copied from the Find_Optimal_Cutoff
# output of an earlier run — confirm it still matches the current model.
CUTOFF = 0.03281814092770219
pred_y = np.where(pred_te >= CUTOFF, 1, 0)

# Printed explicitly: bare expressions in the middle of a cell are not displayed.
print('Test AUC of thresholded predictions %.6f' % roc_auc_score(y_test, pred_y))

# Compute the confusion matrix once and reuse it for every derived rate.
# scikit-learn layout: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, pred_y)
print(cm)

print(cm[0,0]/cm[0].sum())              # share of true class-0 rows predicted as 0
print(cm[1,1]/cm[1].sum())              # share of true class-1 rows predicted as 1
print((cm[1,1]+cm[0,0])/cm.sum())       # overall accuracy