# BT4012 Kaggle Competition

Author: Loh Hong Tak Edmund

Python Version: 3.8.11

# Plan

- EDA
- Scaling
- RFE
- Models:
    - XGBoost
    - RandomForest
    - Logit
    - Neural Network: CNN + LSTM

## Importing packages

In [6]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2


In [65]:
# Loading Packages

# EDA
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from numpy import sort

from collections import Counter

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve, auc, log_loss, roc_auc_score 
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
import xgboost

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.layers import TextVectorization

# Settings
import pickle
import warnings
warnings.filterwarnings("ignore")

## Helper Functions

In [9]:
def getRFEfeatures(model, x, y, n_features_to_select):
    """Pick features with recursive feature elimination (RFE).

    Parameters
    ----------
    model : estimator handed to ``RFE`` as the ranking estimator.
    x : feature table; must expose ``.columns``, which is indexed with the
        boolean support mask to recover the selected names.
    y : target values passed to ``RFE.fit``.
    n_features_to_select : forwarded to ``RFE`` unchanged.

    Returns
    -------
    list
        Names of the columns of ``x`` flagged in ``rfe.support_``. The list is
        also printed.
    """
    # Pass by keyword: recent scikit-learn releases accept only the estimator
    # positionally, so the positional form raises a TypeError there.
    rfe = RFE(model, n_features_to_select=n_features_to_select)
    rfe = rfe.fit(x, y)
    selected_features = list(x.columns[rfe.support_])
    print('Selected features: %s' % selected_features)
    return selected_features

def get_auc(model, x, y):
    """Return the area under the ROC curve of ``model`` on ``(x, y)``.

    The score used for the curve is column 1 of ``model.predict_proba(x)``.
    """
    positive_scores = model.predict_proba(x)[:, 1]
    # roc_curve yields three arrays; the thresholds are not needed here.
    false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_scores)
    area = auc(false_pos_rate, true_pos_rate)
    return area

def get_logloss(model, x, y):
    """Return the log loss of ``model`` on ``(x, y)``.

    The probabilities scored are column 1 of ``model.predict_proba(x)``.
    """
    positive_scores = model.predict_proba(x)[:, 1]
    loss = log_loss(y, positive_scores)
    return loss

def _print_score(title, model, x, y, show_auc):
    """Print accuracy, classification report, confusion matrix and optional AUC.

    Shared implementation behind ``print_train_score`` and
    ``print_test_score``; ``title`` is the header word pair printed first
    (e.g. ``"TRAIN RESULT"``).
    """
    pred = model.predict(x)
    model_report = pd.DataFrame(classification_report(y, pred, output_dict=True))
    print(f"{title}:\n================================================")
    print(f"ACCURACY SCORE: {accuracy_score(y, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{model_report}")
    print("_______________________________________________")
    print(f"CONFUSION MATRIX: \n {confusion_matrix(y, pred)}\n")
    if show_auc:
        print("_______________________________________________")
        print(f"AUC Score: \n {get_auc(model, x, y)}\n")


def print_train_score(model, x_train, y_train, auc=True):
    """Print the evaluation summary of ``model`` on the training split.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` (and ``predict_proba``
        when ``auc`` is truthy, because ``get_auc`` calls it).
    x_train, y_train : features and true labels to score against.
    auc : when truthy, the ROC AUC from ``get_auc`` is printed as well.
    """
    _print_score("TRAIN RESULT", model, x_train, y_train, auc)


def print_test_score(model, x_test, y_test, auc=True):
    """Print the evaluation summary of ``model`` on the test split.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` (and ``predict_proba``
        when ``auc`` is truthy, because ``get_auc`` calls it).
    x_test, y_test : features and true labels to score against.
    auc : when truthy, the ROC AUC from ``get_auc`` is printed as well.
    """
    _print_score("TEST RESULT", model, x_test, y_test, auc)

## Loading Dataset

In [19]:
# Load the training table; the path is relative to the notebook's working directory.
ds = pd.read_csv('train.csv')

In [20]:
# Preview the first rows of the training table.
ds.head()

Unnamed: 0,r0c0,r0c1,r0c2,r0c3,r0c4,r0c5,r0c6,r0c7,r0c8,r0c9,...,r19c11,r19c12,r19c13,r19c14,r19c15,r19c16,r19c17,r19c18,r19c19,label
0,1,1,1,1,28,43,52,255,255,255,...,191,255,52,34,1,1,1,1,1,0
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
2,1,1,128,255,255,255,255,255,255,255,...,255,255,255,255,255,255,128,1,1,0
3,53,54,61,91,141,172,197,223,233,246,...,184,185,187,169,147,106,82,34,23,1
4,46,46,46,46,36,36,41,41,41,41,...,38,65,65,95,95,149,149,205,205,0


In [25]:
# Display the target column.
ds.label

0        0
1        0
2        0
3        1
4        0
        ..
72134    0
72135    0
72136    0
72137    0
72138    0
Name: label, Length: 72139, dtype: int64

In [32]:
# NOTE: `counter` is bound to the Counter class itself, not to an instance;
# calling it on the label column builds a fresh tally of the class frequencies.
counter = Counter
counter(ds.label)

Counter({0: 65581, 1: 6558})

In [15]:
# Summary statistics for every numeric column, including the label.
ds.describe()

Unnamed: 0,r0c0,r0c1,r0c2,r0c3,r0c4,r0c5,r0c6,r0c7,r0c8,r0c9,...,r19c11,r19c12,r19c13,r19c14,r19c15,r19c16,r19c17,r19c18,r19c19,label
count,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0,...,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0,72139.0
mean,36.125383,51.473308,70.855279,82.032354,86.869142,96.39173,101.608728,108.712333,115.257752,119.01468,...,105.187416,101.996327,98.175689,94.994663,83.791611,79.024744,67.467084,49.497775,37.186349,0.090908
std,70.769428,83.545299,93.468985,99.787787,100.295541,103.128587,104.129642,103.806258,106.480366,107.572306,...,106.387324,102.402083,103.242319,102.317456,98.44079,97.781003,90.884177,82.706445,72.454911,0.28748
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
50%,1.0,1.0,2.0,18.0,33.0,52.0,67.0,91.0,96.0,104.0,...,69.0,73.0,60.0,52.0,26.0,7.0,1.0,1.0,1.0,0.0
75%,30.0,80.0,128.0,171.0,181.0,208.0,219.0,226.0,248.0,255.0,...,236.0,213.0,207.0,199.0,172.0,163.0,128.0,73.0,24.0,0.0
max,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,1.0


## Splitting Training and Testing Data

In [33]:
# Features are every column except the target; the target is taken as an array.
X = ds.drop(columns='label')
y = ds['label'].values

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)


## XGBoost (Base)

In [46]:
# Baseline classifier: library defaults apart from the two arguments given here.
xgb = xgboost.XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
)
# Fit on all training features; left as the last expression so the fitted model is displayed.
xgb.fit(x_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [47]:
# Report the baseline model's metrics on the data it was fitted on.
print_train_score(xgb, x_train,y_train)


TRAIN RESULT:
ACCURACY SCORE: 99.95%
_______________________________________________
CLASSIFICATION REPORT:
                      0            1  accuracy     macro avg  weighted avg
precision      0.999695     0.997721  0.999515      0.998708      0.999515
recall         0.999771     0.996963  0.999515      0.998367      0.999515
f1-score       0.999733     0.997342  0.999515      0.998537      0.999515
support    52442.000000  5269.000000  0.999515  57711.000000  57711.000000
_______________________________________________
CONFUSION MATRIX: 
 [[52430    12]
 [   16  5253]]

_______________________________________________
AUC Score: 
 0.9999885656649199



In [48]:
# Report the baseline model's metrics on the held-out split.
print_test_score(xgb, x_test,y_test)


TEST RESULT:
ACCURACY SCORE: 97.52%
_______________________________________________
CLASSIFICATION REPORT:
                      0            1  accuracy     macro avg  weighted avg
precision      0.978009     0.939566  0.975187      0.958787      0.974574
recall         0.995129     0.771916  0.975187      0.883523      0.975187
f1-score       0.986495     0.847530  0.975187      0.917012      0.974080
support    13139.000000  1289.000000  0.975187  14428.000000  14428.000000
_______________________________________________
CONFUSION MATRIX: 
 [[13075    64]
 [  294   995]]

_______________________________________________
AUC Score: 
 0.9880131111099434



## XGBoost (Feature Selection)

In [61]:
# Define `thresholds` in this cell so it runs on a fresh kernel: the ascending
# feature importances of the fitted baseline model. Every 10th value is shown.
thresholds = sort(xgb.feature_importances_)
thresholds[::10]

array([0.00039659, 0.00073222, 0.00086165, 0.00091957, 0.00104091,
       0.00110194, 0.0011567 , 0.00124643, 0.00130086, 0.0013312 ,
       0.00137493, 0.00144874, 0.0015345 , 0.00161569, 0.00168937,
       0.00175464, 0.00181217, 0.00191617, 0.0019898 , 0.00206531,
       0.0021515 , 0.00220199, 0.00228318, 0.00235755, 0.00243207,
       0.00250207, 0.00254894, 0.00270162, 0.00279516, 0.00293152,
       0.00312616, 0.00326243, 0.00339987, 0.00359022, 0.00389174,
       0.00423782, 0.00439366, 0.00498297, 0.00586353, 0.00669911],
      dtype=float32)

In [64]:
# Candidate cut-offs: the baseline model's feature importances in ascending
# order; the loop below tries every 10th one.
thresholds = sort(xgb.feature_importances_)
# Results keyed by the number of features kept at each threshold.
store = {}

# NOTE(review): every score below is computed on x_test / y_test, so choosing a
# feature count from this table tunes on the hold-out split - confirm that is intended.
for thresh in thresholds[::10]:
    # Reuse the already-fitted baseline model (prefit=True) to keep only the
    # features whose importance is at least `thresh`.
    selection = SelectFromModel(xgb, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    n = select_x_train.shape[1]

    # Train a fresh model with the baseline's settings on the reduced feature set.
    selection_model = xgboost.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    selection_model.fit(select_x_train, y_train)

    predictions = selection_model.predict(select_x_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, selection_model.predict_proba(select_x_test)[:, 1])

    store[n] = {"Threshold": thresh, "Accuracy": accuracy, "F1-Score": f1, "ROC AUC": roc_auc}
    # Five decimals: the importances are of order 1e-3 to 1e-4, so three
    # decimals would print most distinct thresholds as the same number.
    print("Thresh=%.5f, n=%d, Accuracy: %.3f%%, F1-score: %.3f, ROC_AUC: %.3f" % (thresh, n, accuracy*100.0, f1, roc_auc))
    


Thresh=0.000, n=400, Accuracy: 97.519%, F1-score: 0.848
Thresh=0.001, n=390, Accuracy: 97.456%, F1-score: 0.844
Thresh=0.001, n=380, Accuracy: 97.366%, F1-score: 0.837
Thresh=0.001, n=370, Accuracy: 97.512%, F1-score: 0.847
Thresh=0.001, n=360, Accuracy: 97.311%, F1-score: 0.833
Thresh=0.001, n=350, Accuracy: 97.463%, F1-score: 0.844
Thresh=0.001, n=340, Accuracy: 97.332%, F1-score: 0.834
Thresh=0.001, n=330, Accuracy: 97.248%, F1-score: 0.829
Thresh=0.001, n=320, Accuracy: 97.318%, F1-score: 0.833
Thresh=0.001, n=310, Accuracy: 97.394%, F1-score: 0.839
Thresh=0.001, n=300, Accuracy: 97.290%, F1-score: 0.831
Thresh=0.001, n=290, Accuracy: 97.477%, F1-score: 0.846
Thresh=0.002, n=280, Accuracy: 97.262%, F1-score: 0.831
Thresh=0.002, n=270, Accuracy: 97.422%, F1-score: 0.841
Thresh=0.002, n=260, Accuracy: 97.373%, F1-score: 0.837
Thresh=0.002, n=250, Accuracy: 97.415%, F1-score: 0.839
Thresh=0.002, n=240, Accuracy: 97.387%, F1-score: 0.838
Thresh=0.002, n=230, Accuracy: 97.283%, F1-score