In [4]:
#Import libraries and load data frame
import pandas as pd
import numpy as np
from numpy import sort
import xgboost as xgb
from xgboost import XGBClassifier

from joblib import dump

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, precision_score



In [2]:
#Load data

trainDF = pd.read_csv('../data/raw/train.csv')
testDF = pd.read_csv('../data/raw/test.csv')

#Remove Id columns as irrelevant and data cleaning such as make all columns absolute values

trainDF_cleaned = trainDF.copy()
trainDF_cleaned = trainDF_cleaned.abs()
trainDF_cleaned.drop(['Id_old','Id'], axis=1, inplace=True)
target = trainDF_cleaned.pop('TARGET_5Yrs')

testDF_cleaned = testDF.copy()
testDF_cleaned = testDF_cleaned.abs()
testDF_cleaned.drop(['Id_old','Id'], axis=1, inplace=True)

In [3]:
#Split data

X_train, X_val, y_train, y_val = train_test_split (trainDF_cleaned, target, test_size=0.2, random_state=8)

np.save('../data/processed/X_train', X_train)
np.save('../data/processed/X_val',   X_val)

np.save('../data/processed/y_train', y_train)
np.save('../data/processed/y_val',   y_val)


In [9]:
#Build pipeline

full_pipeline = Pipeline([('imputer', SimpleImputer()),  ('scaler', StandardScaler()),

('xgb', XGBClassifier(max_depth=8, learning_rate=0.05, min_child_weight=9, subsample=0.7, colsample_bytree=1.0, colsample_bylevel=0.9, colsample_bynode=0.8))])


In [10]:
# Fit and predict using pipeline

full_pipeline.fit(X_train,y_train)
pred_val = full_pipeline.predict(X_val)





In [None]:
#Optional: Parameter grid for pipeline

##param1 = [,..]

##param_grid = dict(xgb__paramname=param1, xbg__param2name=param2)

In [13]:
# Print confusion matrix to evaluate classification accuracy
cm_val = confusion_matrix(y_val, pred_val)
print("Confusion Matrix \n", cm_val)


# Model Accuracy, how often is the classifier correct?
print("Accuracy:",accuracy_score(y_val, pred_val))

#AUC score
auc_val = roc_auc_score(y_val,pred_val)
print('AUC: %.2f' % auc_val)

Confusion Matrix 
 [[   8  249]
 [  22 1321]]
Accuracy: 0.830625
AUC: 0.51
