In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the lecture data set (the "defcc" CSV) into a DataFrame.
csv_path = '/kaggle/input/nek-310-lecture-11/defcc (1).csv'
default = pd.read_csv(csv_path)

In [None]:
# Print pandas' summary report for the loaded frame (columns, dtypes, non-null counts).
default.info()

<h2>Today we will look into some more modern models, like Random Forests and XG Boost</h2>

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

<h2>First we simply fit the models on the raw data; later we will explore the data further to see whether we can improve model performance.</h2>

In [None]:
# Separate the predictors from the target. 'dpnm' is the label column; 'ID' is
# dropped as well (presumably a row identifier with no predictive value --
# TODO confirm against the data dictionary).
X_def = default.drop(['dpnm', 'ID'], axis=1)
Y_def = default['dpnm']

# Split BEFORE scaling so the scaler only ever sees training rows. Fitting
# StandardScaler on the full data set would leak the test-set mean and variance
# into the training features. The row partition is the same as before because
# it depends only on the number of rows, test_size and random_state.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X_def, Y_def, test_size=0.3, random_state=42)

scaler = StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Kept so that any cell relying on X_std keeps working: the full feature matrix,
# standardised with the training-set statistics.
X_std = scaler.transform(X_def)

# Flatten the training labels to a plain 1-D NumPy array.
y_train = np.ravel(y_train)

<h2>Random Forest</h2>

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Baseline random forest: 300 bootstrapped trees split on entropy, with
# out-of-bag (OOB) scoring enabled so that every training row gets a prediction
# from the trees that did not see it.
modelRF = RandomForestClassifier(n_estimators=300, criterion='entropy',
                                 oob_score=True,
                                 bootstrap=True,
                                 random_state=2,
                                 max_features='sqrt',
                                 n_jobs=-1, verbose=0).fit(x_train, y_train)

# OOB score of the second class (column 1 of the decision function; the positive
# class for a 0/1 label). The AUC must be computed from these continuous scores:
# feeding hard argmax labels to roc_auc_score collapses the curve to a single
# operating point and reports an area that does not match the plotted curve.
oob_scores_RF = modelRF.oob_decision_function_[:, 1]

RF_roc_auc_train = roc_auc_score(y_train, oob_scores_RF)
fpr_train_RF, tpr_train_RF, thresholds_train_RF = roc_curve(y_train, oob_scores_RF)

plt.figure()
plt.plot(fpr_train_RF, tpr_train_RF, label='Random Forest, OOB (area = %0.2f)' % RF_roc_auc_train)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic training data')
plt.legend(loc="lower right")
# NOTE(review): every ROC cell in this notebook saves to 'Log_ROC', so each plot
# overwrites the previous file.
plt.savefig('Log_ROC')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Score the held-out test set and take the column-1 probability used for the ROC.
rf_test_scores = modelRF.predict_proba(x_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, rf_test_scores)
fpr, tpr, thresholds = roc_curve(y_test, rf_test_scores)

# ROC curve of the random forest on the test data, with the chance diagonal.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='Random Forest (area = %0.2f)' % logit_roc_auc)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")
fig.savefig('Log_ROC')
plt.show()

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of each feature on the held-out test set: a column is
# shuffled 30 times and the mean drop in score is recorded.
# The score is ROC AUC, the metric this notebook tunes and reports everywhere
# else. The default scorer would be the classifier's accuracy, which is not the
# metric the models are selected on.
df_x_test = pd.DataFrame(x_test)
result = permutation_importance(modelRF, df_x_test, y_test,
                                scoring='roc_auc',
                                n_repeats=30,
                                random_state=0)

In [None]:
# Pair every feature name with its mean permutation importance, then show the
# ten features with the largest importance.
df = pd.DataFrame({
    'Variable': X_def.columns.tolist(),
    'Importance': result.importances_mean,
})
df1 = df.sort_values(by='Importance', ascending=False)
df1.head(10)

<h2>Let's see if we can improve the model fit by tweaking some model hyperparameters</h2>
<h3><br><br>"n estimators" is the number of trees in the forest.
<br><br>"max features" is the number of features randomly selected for each split of a tree.
<br><br>"max depth" is the max depth of the trees.
<br><br>"min samples leaf" is the minimum number of samples required at a leaf node; a split is only considered if it leaves at least this many samples in each branch.
<br><br>"min samples split" is the minimum number of samples required to split an internal node.
<br><br>"criterion" is the way the gain is measured.</h3>


In [None]:
from scipy.stats import randint as sp_randint  # not used in this cell; import kept unchanged

rfc = RandomForestClassifier(random_state=42)

# Candidate values for the search. All entries are finite lists, so the search
# space is a grid of 1 * 3 * 6 * 4 = 72 combinations.
params = {'n_estimators': [100],
          'max_features': [7, 9, 12],
          'max_depth': [2, 4, 6, 8, 10, 12],
          'min_samples_leaf': [10, 15, 20, 30]}

# Asking RandomizedSearchCV for more iterations than the grid contains only
# triggers a UserWarning; scikit-learn then evaluates each combination once.
# Capping n_iter at the grid size gives the same candidates without the warning.
n_candidates = int(np.prod([len(values) for values in params.values()]))

# 5-fold cross-validated search, ranked by ROC AUC.
rsearch_rfc = RandomizedSearchCV(rfc, param_distributions=params, cv=5, scoring='roc_auc',
                                 n_iter=min(200, n_candidates), random_state=42, n_jobs=-1,
                                 return_train_score=True)

rsearch_rfc.fit(x_train, y_train)

print("Tuned RF Parameters: {}".format(rsearch_rfc.best_params_))
print("_" * 100)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Random forest with 100 trees and hand-entered hyperparameters; OOB scoring is
# enabled so every training row gets a prediction from trees that did not see it.
# NOTE(review): max_depth=15 is not among the values searched in the randomized
# search cell ([2, 4, 6, 8, 10, 12]) -- confirm these settings against
# rsearch_rfc.best_params_.
modelRF = RandomForestClassifier(n_estimators=100,
                                 min_samples_leaf=20,
                                 max_depth=15,
                                 oob_score=True,
                                 bootstrap=True,
                                 random_state=2,
                                 max_features=12,
                                 n_jobs=-1, verbose=0).fit(x_train, y_train)

# OOB score of the second class (column 1 of the decision function; the positive
# class for a 0/1 label). The AUC must be computed from these continuous scores:
# feeding hard argmax labels to roc_auc_score collapses the curve to a single
# operating point and reports an area that does not match the plotted curve.
oob_scores_RF = modelRF.oob_decision_function_[:, 1]

RF_roc_auc_train = roc_auc_score(y_train, oob_scores_RF)
fpr_train_RF, tpr_train_RF, thresholds_train_RF = roc_curve(y_train, oob_scores_RF)

plt.figure()
plt.plot(fpr_train_RF, tpr_train_RF, label='Random Forest, OOB (area = %0.2f)' % RF_roc_auc_train)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic training data')
plt.legend(loc="lower right")
# NOTE(review): every ROC cell in this notebook saves to 'Log_ROC', so each plot
# overwrites the previous file.
plt.savefig('Log_ROC')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Score the held-out test set with the re-fitted forest (column-1 probability).
rf_test_scores = modelRF.predict_proba(x_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, rf_test_scores)
fpr, tpr, thresholds = roc_curve(y_test, rf_test_scores)

# ROC curve of the random forest on the test data, with the chance diagonal.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='Random Forest (area = %0.2f)' % logit_roc_auc)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")
fig.savefig('Log_ROC')
plt.show()

<h2>XG Boost is a boosting algorithm that has been highly successful in, e.g., winning Kaggle competitions. An overview is given at www.analyticsvidhya.com/blog/2018/09/an-end-to-end-guide-to-understand-the-math-behind-xgboost/</h2>

In [None]:
# Baseline XGBoost classifier: DART booster, 300 rounds at a small learning rate,
# L1 regularisation only (reg_alpha=1, reg_lambda=0).
xgb_settings = dict(
    n_jobs=-1,
    objective='binary:logistic',
    booster='dart',
    gamma=0,
    learning_rate=0.01,
    n_estimators=300,
    reg_alpha=1,
    reg_lambda=0,
)
model = xgb.XGBClassifier(**xgb_settings)
model.fit(x_train, y_train)

# Class-probability predictions for both splits, used by the ROC cells that follow.
predictions_train = model.predict_proba(x_train)
predictions_test = model.predict_proba(x_test)

In [None]:
# In-sample ROC for XGBoost, based on the column-1 predicted probability.
xg_train_scores = predictions_train[:, 1]
XG_roc_auc_train = roc_auc_score(y_train, xg_train_scores)
fpr_train_XG, tpr_train_XG, thresholds_train_XG = roc_curve(y_train, xg_train_scores)

fig, ax = plt.subplots()
ax.plot(fpr_train_XG, tpr_train_XG, label='XG Boost (area = %0.2f)' % XG_roc_auc_train)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic training data')
ax.legend(loc="lower right")
fig.savefig('Log_ROC')
plt.show()

<h2>What about test data?</h2>

In [None]:
# Out-of-sample ROC for XGBoost, based on the column-1 predicted probability.
xg_test_scores = predictions_test[:, 1]
XG_roc_auc_test = roc_auc_score(y_test, xg_test_scores)
fpr_test_XG, tpr_test_XG, thresholds_test_XG = roc_curve(y_test, xg_test_scores)

fig, ax = plt.subplots()
ax.plot(fpr_test_XG, tpr_test_XG, label='XG Boost (area = %0.2f)' % XG_roc_auc_test)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic test data')
ax.legend(loc="lower right")
fig.savefig('Log_ROC')
plt.show()

<h2>Clearly the model over-fits the training data, so we search through a grid of (hyper)parameter values for XG Boost to reduce the problem. There are many parameters that can be tweaked, but we focus on a few of them. </h2>
<h3>
<br>"Learning rate" determines how much each tree contributes to the model. 
<br><br>"Max depth" determines how large the trees may get. 
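<br><br>"Min child weight" is the minimum sum of instance weights (hessian) required in a child node; for squared-error loss this is simply the minimum number of observations in a leaf. Larger values make the model more conservative. 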
<br><br>"Gamma" is a global regularisation parameter, it sets a lower limit on the gain (improvement) for adding new nodes. 
<br><br>"Colsample bytree" sets the ratio of features randomly selected for each tree.</h3>

In [None]:
# NOTE(review): the search tunes a default-configured XGBClassifier, whereas the
# model fitted afterwards additionally sets booster='dart', n_estimators=300,
# reg_alpha=1 and reg_lambda=0 -- confirm the tuned values carry over.
mod1 = xgb.XGBClassifier()

# Candidate values for the search: 3 * 4 * 3 * 3 * 4 = 432 combinations.
param_grid = {"learning_rate": [0.03, 0.04, 0.05],
              "max_depth": [5, 6, 7, 8],
              "min_child_weight": [11, 12, 13],
              "gamma": [0.3, 0.4, 0.5],
              "colsample_bytree": [0.3, 0.4, 0.5, 0.6]}

# 5-fold cross-validated randomized search (not an exhaustive grid search):
# 200 of the 432 combinations are sampled and ranked by ROC AUC.
# random_state is fixed, as in the random-forest search, so that the sampled
# candidates are the same on every run.
mod_RS = RandomizedSearchCV(mod1, param_grid, cv=5, scoring='roc_auc', n_iter=200,
                            random_state=42, n_jobs=-1)

mod_RS.fit(x_train, y_train)

print("Tuned XG Boost Parameters: {}".format(mod_RS.best_params_))
print("_" * 100)

In [None]:
# XGBoost re-fitted with hand-entered hyperparameters; booster, number of
# rounds and regularisation settings are the same as in the baseline model.
xgb_tuned_settings = dict(
    n_jobs=-1,
    objective='binary:logistic',
    booster='dart',
    min_child_weight=13,
    max_depth=6,
    learning_rate=0.05,
    gamma=0.4,
    n_estimators=300,
    colsample_bytree=0.6,
    reg_alpha=1,
    reg_lambda=0,
)
model = xgb.XGBClassifier(**xgb_tuned_settings)
model.fit(x_train, y_train)

# Class-probability predictions for both splits, used by the ROC cells that follow.
predictions_train = model.predict_proba(x_train)
predictions_test = model.predict_proba(x_test)

In [None]:
# In-sample ROC for the re-fitted XGBoost model (column-1 predicted probability).
xg_train_scores = predictions_train[:, 1]
XG_roc_auc_train = roc_auc_score(y_train, xg_train_scores)
fpr_train_XG, tpr_train_XG, thresholds_train_XG = roc_curve(y_train, xg_train_scores)

fig, ax = plt.subplots()
ax.plot(fpr_train_XG, tpr_train_XG, label='XG Boost (area = %0.2f)' % XG_roc_auc_train)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic training data')
ax.legend(loc="lower right")
fig.savefig('Log_ROC')
plt.show()

In [None]:
# Out-of-sample ROC for the re-fitted XGBoost model (column-1 predicted probability).
xg_test_scores = predictions_test[:, 1]
XG_roc_auc_test = roc_auc_score(y_test, xg_test_scores)
fpr_test_XG, tpr_test_XG, thresholds_test_XG = roc_curve(y_test, xg_test_scores)

fig, ax = plt.subplots()
ax.plot(fpr_test_XG, tpr_test_XG, label='XG Boost (area = %0.2f)' % XG_roc_auc_test)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic test data')
ax.legend(loc="lower right")
fig.savefig('Log_ROC')
plt.show()

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of each feature for the XGBoost model on the held-out
# test set: a column is shuffled 30 times and the mean drop in score is recorded.
# The score is ROC AUC, the metric this notebook tunes and reports everywhere
# else. The default scorer would be the classifier's accuracy, which is not the
# metric the models are selected on.
df_x_test = pd.DataFrame(x_test)
result = permutation_importance(model, df_x_test, y_test,
                                scoring='roc_auc',
                                n_repeats=30,
                                random_state=0)

In [None]:
# Pair every feature name with its mean permutation importance, then show the
# ten features with the largest importance.
df = pd.DataFrame({
    'Variable': X_def.columns.tolist(),
    'Importance': result.importances_mean,
})
df1 = df.sort_values(by='Importance', ascending=False)
df1.head(10)