In [1]:
import os
import gc
import sys
import time

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
import xgboost as xgb

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
def read_and_reduce(filename):
    df = pd.read_csv(filename)
    return reduce_mem_usage(df)

In [4]:
def merge_and_reduce(df1, df2, how, on):
    df = df1.merge(df2, how='left', on=on)
    return reduce_mem_usage(df)

# Preprocessing

In [5]:
V_cols_train = read_and_reduce('./data/V_cols_train.csv')
V_cols_test = read_and_reduce('./data/V_cols_test.csv')

Mem. usage decreased to 342.98 Mb (70.8% reduction)
Mem. usage decreased to 301.53 Mb (70.0% reduction)


In [6]:
print(V_cols_train.shape)
print(V_cols_test.shape)

(590540, 261)
(506691, 260)


In [7]:
trans_train = read_and_reduce('./data/jinoo_train.csv')
trans_test = read_and_reduce('./data/jinoo_test.csv')

Mem. usage decreased to 33.79 Mb (53.1% reduction)
Mem. usage decreased to 28.99 Mb (53.1% reduction)


In [8]:
print(trans_train.shape)
print(trans_test.shape)

(590540, 16)
(506691, 16)


In [9]:
trans_train.columns

Index(['TransactionID', 'card1_count_full', 'card2_count_full',
       'card3_count_full', 'card4_count_full', 'card5_count_full',
       'card6_count_full', 'addr1_count_full', 'addr2_count_full',
       'Transaction_day_of_week', 'Transaction_hour_of_day',
       'TransactionAmt_to_mean_card1', 'TransactionAmt_to_mean_card4',
       'TransactionAmt_to_std_card1', 'TransactionAmt_to_std_card4',
       'emaildomain'],
      dtype='object')

In [10]:
trans_train.head()

Unnamed: 0,TransactionID,card1_count_full,card2_count_full,card3_count_full,card4_count_full,card5_count_full,card6_count_full,addr1_count_full,addr2_count_full,Transaction_day_of_week,Transaction_hour_of_day,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4,emaildomain
0,2987000,56,17587,956845,9524,309,267648,43035,956415,0.0,0.0,0.19458,0.257812,0.0,0.0,
1,2987001,1338,5593,956845,347386,49491,267648,76902,956415,0.0,0.0,0.123779,0.219116,0.0,0.114258,gmail.com
2,2987002,1794,70496,956845,719649,102930,824959,48387,956415,0.0,0.0,0.608398,0.443115,0.5894,0.258545,outlook.com
3,2987003,7635,11287,956845,347386,47061,824959,17455,956415,0.0,0.0,0.405029,0.377686,0.2595,0.196899,yahoo.com
4,2987004,30,27225,956845,347386,49491,267648,7107,956415,0.0,0.0,0.515625,0.377686,0.883,0.196899,gmail.com


In [11]:
train = merge_and_reduce(V_cols_train, trans_train, how='left', on='TransactionID')
test = merge_and_reduce(V_cols_test, trans_test, how='left', on='TransactionID')

Mem. usage decreased to 379.02 Mb (0.0% reduction)
Mem. usage decreased to 332.45 Mb (0.0% reduction)


In [12]:
print(train.shape)
print(test.shape)

(590540, 276)
(506691, 275)


# XGBoost

In [13]:
y_train = train['isFraud'].copy()

In [14]:
X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

In [15]:
# Fill numeric columns' NaN values with mean
for c in X_train.columns:
    if X_train[c].dtype=='float16' or  X_train[c].dtype=='float32' or  X_train[c].dtype=='float64':
        X_train[c].fillna(X_train[c].mean())
        X_test[c].fillna(X_train[c].mean())

In [16]:
# Fill categorical columns' NaN values with placeholder (-999)
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

In [17]:
# Label-Encode categorical columns
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))

In [18]:
# Set up K-Fold
n_fold = 4
folds = KFold(n_splits=n_fold, shuffle=True)

print(folds)

KFold(n_splits=4, random_state=None, shuffle=True)


In [19]:
# Initialize submission DataFrame
sample_submission = pd.read_csv('../../../Data/sample_submission.csv')
xgb_submission = sample_submission.copy()
xgb_submission['isFraud'] = 0

In [30]:
# auto, exact, approx, hist, gpu_hist (https://xgboost.readthedocs.io/en/latest/parameter.html)
tree_method = 'gpu_hist'
print('XGBoost --> Tree method: {}, Fold_N: {}'.format(tree_method, n_fold))
print('-------------------------------------------------------')

# Set timezone
os.environ['TZ'] = 'Asia/Seoul'
time.tzset()

# Execute K-Fold XGBoost
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    xgbclf = xgb.XGBClassifier(
        n_estimators=1000,
        max_depth=8,
        learning_rate=0.048,
        subsample=0.85,
        colsample_bytree=0.85,
        missing=-999,
        tree_method=tree_method,
        reg_alpha=0.15,
        reg_lamdba=0.85
    )

    # Print current iteration information
    start = time.time()
    now = time.localtime(start)
    print('Fold {} started..., Time: {yy}-{mm}-{dd} {hh}:{MM}:{ss}'.format(fold_n, yy=now.tm_year, mm=now.tm_mon, dd=now.tm_mday, hh=now.tm_hour, MM=now.tm_min, ss=now.tm_sec))
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    xgbclf.fit(X_train_,y_train_)
    del X_train_,y_train_
    pred=xgbclf.predict_proba(X_test)[:,1]
    val=xgbclf.predict_proba(X_valid)[:,1]
    del xgbclf, X_valid
    
    # Measure time elapsed
    now = time.time()
    time_elapsed = now - start
    print('Completed, Time elapsed: {} seconds'.format(time_elapsed))
    
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val,y_valid
    xgb_submission['isFraud'] = xgb_submission['isFraud']+pred/n_fold
    del pred
    gc.collect()
    
    print('-------------------------------------------------------')

XGBoost --> Tree method: gpu_hist, Fold_N: 4
-------------------------------------------------------
Fold 0 started..., Time: 2019-8-16 2:0:40
Completed, Time elapsed: 1185.2291111946106 seconds
ROC accuracy: 0.9508000309204723
-------------------------------------------------------
Fold 1 started..., Time: 2019-8-16 2:20:26
Completed, Time elapsed: 1178.115707874298 seconds
ROC accuracy: 0.9513864145943742
-------------------------------------------------------
Fold 2 started..., Time: 2019-8-16 2:40:4
Completed, Time elapsed: 1175.2693135738373 seconds
ROC accuracy: 0.9528751751794307
-------------------------------------------------------


In [34]:
exp_name = 'submission'
xgb_submission.to_csv('./{directory}/sub_xgboost.csv'.format(directory=exp_name), index=False)
xgb_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000897
1,3663550,0.006521
2,3663551,0.000312
3,3663552,0.003687
4,3663553,0.000641
