In [None]:
# We are using COLAB as it gives us access to a much faster gpu
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv
# Tesla P100, a $5,699 gpu!!!

name, driver_version, memory.total [MiB]
Tesla P100, 460.32.03, 11441 MiB


In [None]:
# Code as per Kaggle instructions to enable colab
# Much faster to download data using google's internet than uploading from mine

!pip install -q kaggle

from google.colab import files 
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"finleysmith","key":"e680faf4a906c1241f6a06c82de84d47"}'}

In [None]:
# Following the guide given here https://www.kaggle.com/general/74235
!mkdir  -p /root/.kaggle/
!mv kaggle.json  /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!ls -lart /root/.kaggle/
!kaggle competitions download -c 'ieee-fraud-detection'
!mkdir data
!mv ieee-fraud-detection.zip data/
!unzip data/ieee-fraud-detection.zip -d data/

total 16
-rw------- 1 root root   67 Apr  7 17:07 kaggle.json
drwx------ 1 root root 4096 Apr  7 17:08 ..
drwxr-xr-x 2 root root 4096 Apr  7 17:08 .
Downloading ieee-fraud-detection.zip to /content
 95% 112M/118M [00:00<00:00, 151MB/s] 
100% 118M/118M [00:00<00:00, 143MB/s]
Archive:  data/ieee-fraud-detection.zip
  inflating: data/sample_submission.csv  
  inflating: data/test_identity.csv  
  inflating: data/test_transaction.csv  
  inflating: data/train_identity.csv  
  inflating: data/train_transaction.csv  


In [1]:
# Import our libs etc

import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt

import os,gc,re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,roc_curve,auc,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Import the files from data directory
train_transaction, test_transaction = pd.read_csv('data/train_transaction.csv'), pd.read_csv('data/test_transaction.csv')
train_identity, test_identity = pd.read_csv('data/train_identity.csv'), pd.read_csv('data/test_identity.csv')

# We notice that some of the columns for test and train identity are miss-named using - instead of _ so we will correct
test_identity.columns = [col.replace("-","_") for col in test_identity.columns]


# Merge the transaction and identity types
df_train = train_transaction.merge(train_identity, on = ['TransactionID'], how = 'left')
df_test = test_transaction.merge(test_identity, on = ['TransactionID'], how = 'left')

del train_transaction, train_identity, test_transaction, test_identity

  from pandas import MultiIndex, Int64Index


In [2]:
# Lets ensure we are using the right types
# column details
cat_cols = (['ProductCD'] + ['card%d' % i for i in range(1, 7)] + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain'] + 
            ['M%d' % i for i in range(1, 10)] + ['DeviceType', 'DeviceInfo'] + ['id_%d' % i for i in range(12, 39)])

# Make them all strings for now
type_map = {c: str for c in cat_cols}
df_train[cat_cols] = df_train[cat_cols].astype(type_map, copy=False)
df_test[cat_cols] = df_test[cat_cols].astype(type_map, copy=False)
id_cols = ['TransactionID', 'TransactionDT']

# Create a list of all the numeric columns
numeric_cols = (['TransactionAmt', 'dist1', 'dist2'] + ['C%d' % i for i in range(1, 15)] +
                  ['D%d' % i for i in range(1, 16)] + ['V%d' % i for i in range(1, 340)] + ['id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 
    'id_09', 'id_10', 'id_11'])

# Taken from our EDA notebook (Part 1), we list all the v-values we want
v_cols = ['V1', 'V3', 'V4', 'V6', 'V8', 'V11', 'V13', 'V14', 'V17', 'V20', 
 'V23', 'V26', 'V27', 'V30', 'V36', 'V37', 'V40', 'V41', 'V44', 'V47', 'V48', 'V54', 'V56', 'V59', 
 'V62', 'V65', 'V67', 'V68', 'V70', 'V76', 'V78', 'V80', 'V82', 'V86', 'V88', 'V89', 'V91', 'V96', 
 'V98', 'V99', 'V104', 'V107', 'V108', 'V111', 'V115', 'V117', 'V120', 'V121', 'V123', 'V124', 'V127', 
 'V129', 'V130', 'V136', 'V138', 'V139', 'V142', 'V147', 'V156', 'V162', 'V165', 'V160', 'V166', 'V178',
 'V176', 'V173', 'V182', 'V187', 'V203', 'V205', 'V207', 'V215', 'V169', 'V171', 'V175', 'V180', 'V185', 
 'V188', 'V198', 'V210', 'V209', 'V218', 'V223', 'V224', 'V226', 'V228', 'V229', 'V235', 'V240', 'V258', 
 'V257', 'V253', 'V252', 'V260', 'V261', 'V264', 'V266', 'V267', 'V274', 'V277', 'V220', 'V221', 'V234', 
 'V238', 'V250', 'V271', 'V294', 'V284', 'V285', 'V286', 'V291',
 'V297', 'V303', 'V305', 'V307', 'V309', 'V310', 'V320', 'V281', 'V283', 'V289', 'V296', 'V301', 'V314', 'V332', 'V325', 'V335', 'V338']

# Create a list of v columns we want to DROP
drop_cols = [col for col in df_train.columns if col[0] == 'V' and col not in v_cols]

# Get rid of the columns we need to drop
df_train = df_train.drop(columns=drop_cols)
df_test = df_test.drop(columns=drop_cols)


In [3]:
# Feature Engineering
#  Adding the day and hour
df_train['day'] = (df_train['TransactionDT']//(60*60*24)-1)%7
df_test['day'] = (df_test['TransactionDT']//(60*60*24)-1)%7

df_train['hour'] = (df_train['TransactionDT']//(60*60))%24
df_test['hour'] = (df_test['TransactionDT']//(60*60))%24

# Adding the amount after decimal for transaction amount
df_train['TransactionAmtDecimal'] = (df_train['TransactionAmt'] - np.floor(df_train['TransactionAmt'])).astype('float32')
df_test['TransactionAmtDecimal'] = (df_test['TransactionAmt'] - np.floor(df_test['TransactionAmt'])).astype('float32')

# We could try adding more features, but in early testing they were not effective

In [4]:
# First setup our X and Y data sets

y_train = df_train['isFraud'].copy()

X_train = df_train.drop('isFraud', axis = 1)
X_test = df_test.copy()

del df_train, df_test

# Fill our na values with a number that xgboost will recongise as NA
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

# Label Encoding to convert the objects and caterogies into numeric, using our previous lists
# to assist with finding the correct type
for i in X_train.columns:
    if X_train[i].dtype=='object' or X_test[i].dtype=='object': 
        lbl = LabelEncoder()
        lbl.fit(list(X_train[i].values) + list(X_test[i].values))
        X_train[i] = lbl.transform(list(X_train[i].values))
        X_test[i] = lbl.transform(list(X_test[i].values))

In [5]:
# Start up XGBoost
clf = xgb.XGBClassifier()

# Create a grid of parameters we want to learn over
param_grid = {
    'max_depth': [6, 12],
    'learning_rate': [0.002, 0.01, 0.02, 0.1, 0.2],
    'subsample': [0.4, 0.6, 0.8],
    'colsample_bytree': [0.4, 0.8],
    'n_estimators': [2000, 5000],
    'tree_method':['gpu_hist']}

# Use randomisedsearchcv to efficently tune our hyper-parameters
rs_clf = RandomizedSearchCV(clf, param_grid, n_iter = 5, n_jobs = -1, verbose= 20, cv = 3, scoring='roc_auc',
                            refit=False, random_state = 12)

# Bind the search to our data
rs_clf.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                                           scale_pos_weight=None,
                                           subsample=None, tree_method=None,
                                      

In [6]:
# Print the best parameters to use
rs_clf.best_params_

{'tree_method': 'hist',
 'subsample': 0.8,
 'n_estimators': 5000,
 'max_depth': 12,
 'learning_rate': 0.02,
 'colsample_bytree': 0.4}

In [7]:
# Create our parameters for our XGBOOST model
# We make sure to enable the GPU to run much fuster on colab
# We use the parameters from the RandomisedSearchCV above
clf = xgb.XGBClassifier(
    n_estimators=5000,
    max_depth=12,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.4,
    missing=-999, # As above this is our na number
    random_state=2471, # Last 4 digits of matric
    eval_metric=["error", "auc"],
    tree_method='gpu_hist'  # Enable google's expensive gpu
)

# Fit the model to our training data
clf.fit(X_train, y_train)

In [8]:
# Create the sumbission for Kaggle
sample_submission = pd.read_csv('data/sample_submission.csv', index_col='TransactionID')
sample_submission['isFraud'] = clf.predict_proba(X_test)[:,1]
sample_submission.to_csv('simple_xgboost.csv')