In [5]:
# !pip3 install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/77/0f/5157e6b153b3d4a70dc5fbe2ab6f209604197590f387f03177b7a249ac60/lightgbm-2.2.3-py2.py3-none-manylinux1_x86_64.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 1.2MB/s eta 0:00:01
[?25hCollecting scipy (from lightgbm)
  Using cached https://files.pythonhosted.org/packages/29/50/a552a5aff252ae915f522e44642bb49a7b7b31677f9580cfd11bcc869976/scipy-1.3.1-cp36-cp36m-manylinux1_x86_64.whl
Collecting numpy (from lightgbm)
  Using cached https://files.pythonhosted.org/packages/19/b9/bda9781f0a74b90ebd2e046fde1196182900bd4a8e1ea503d3ffebc50e7c/numpy-1.17.0-cp36-cp36m-manylinux1_x86_64.whl
Collecting scikit-learn (from lightgbm)
  Using cached https://files.pythonhosted.org/packages/a0/c5/d2238762d780dde84a20b8c761f563fe882b88c5a5fb03c056547c442a19/scikit_learn-0.21.3-cp36-cp36m-manylinux1_x86_64.whl
Collecting joblib>=0.11 (from scikit-learn->lightgbm)
  Using cached https://files.pythonhosted.org/packag

In [1]:
import os
import gc
import sys
sys.path.append("../..")
import time

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import roc_auc_score, f1_score
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb

from Other.utils.ReduceMem import read_and_reduce, merge_and_reduce

In [2]:
def check_and_drop_column(df, column):
    if column in df.columns:
        df.drop(columns=[column], axis=1, inplace=True)

In [3]:
def print_columns(df):
    s = ''
    for col in df.columns:
        s += '"%s", ' % col
    print(s[:-2])

# Preprocessing

In [4]:
train = read_and_reduce('./merged_data/train.csv')
test = read_and_reduce('./merged_data/test.csv')

Mem. usage decreased to 596.41 Mb (64.4% reduction)
Mem. usage decreased to 518.98 Mb (63.8% reduction)


In [5]:
print(train.shape)
print(test.shape)

(590540, 372)
(506691, 371)


# LightGBM

In [6]:
y_train = train['isFraud'].copy()

In [7]:
X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

In [8]:
# Fill numeric columns' NaN values with mean
for c in X_train.columns:
    if X_train[c].dtype=='float16' or  X_train[c].dtype=='float32' or  X_train[c].dtype=='float64':
        X_train[c].fillna(X_train[c].mean())
        X_test[c].fillna(X_train[c].mean())

In [9]:
# Fill categorical columns' NaN values with placeholder (-999)
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

In [10]:
# Label-Encode categorical columns
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))

In [11]:
# Set up K-Fold
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True)

print(folds)

KFold(n_splits=5, random_state=None, shuffle=True)


In [12]:
# Initialize submission DataFrame
sample_submission = pd.read_csv('../../../Data/sample_submission.csv')
lgb_submission = sample_submission.copy()
lgb_submission['isFraud'] = 0


# Set timezone
os.environ['TZ'] = 'Asia/Seoul'
time.tzset()

In [15]:
# If using GPU, set this option to True
gpu_enabled = False

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'n_estimators': 2000,
    'colsample_bytree': 0.85,
    'subsample_freq': 1,
    'subsample': 0.85,
    'max_bin': 255,
    'metric': 'auc',
    'n_jobs': -1,
    'num_leaves': 2**8,
    'max_depth': 10,
    'tree_learner': 'serial',
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'reg_alpha': 0.3,
    'reg_lambda': 0.243
}

if gpu_enabled:
    params.update({
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    })

for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):    
    # Print current iteration information
    start = time.time()
    now = time.localtime(start)
    print('Fold {} started..., Time: {yy}-{mm}-{dd} {hh}:{MM}:{ss}'.format(fold_n, yy=now.tm_year, mm=now.tm_mon, dd=now.tm_mday, hh=now.tm_hour, MM=now.tm_min, ss=now.tm_sec))
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    lgb_train = lgb.Dataset(X_train_, label=y_train_)
    lgb_test = lgb.Dataset(X_valid, label=y_valid)
    
    lgb_estimator = lgb.train(
        params,
        lgb_train,
        valid_sets = [lgb_test],
        verbose_eval=100,
        early_stopping_rounds=100
    )
    
    del X_train_, y_train_
    
    pred = lgb_estimator.predict(X_test)
    val = lgb_estimator.predict(X_valid)
    
    # Measure time elapsed
    now = time.time()
    time_elapsed = now - start
    print('Completed, Time elapsed: {} seconds'.format(time_elapsed))
    
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    
    del val, y_valid
    
    lgb_submission['isFraud'] = lgb_submission['isFraud'] + pred/n_fold
    
    del pred
    gc.collect()
    
    print('-------------------------------------------------------')
    
print('Done.')

Fold 0 started..., Time: 2019-8-22 14:17:9
[100]	valid_0's auc: 0.947213
[200]	valid_0's auc: 0.95773
[300]	valid_0's auc: 0.964031
[400]	valid_0's auc: 0.968073
[500]	valid_0's auc: 0.970384
Completed, Time elapsed: 160.49536657333374 seconds
ROC accuracy: 0.9705486018934674
-------------------------------------------------------
Fold 1 started..., Time: 2019-8-22 14:19:50




[100]	valid_0's auc: 0.945058
[200]	valid_0's auc: 0.956605
[300]	valid_0's auc: 0.963564
[400]	valid_0's auc: 0.967453
[500]	valid_0's auc: 0.970083
Completed, Time elapsed: 161.76622891426086 seconds
ROC accuracy: 0.9703975113601729
-------------------------------------------------------
Fold 2 started..., Time: 2019-8-22 14:22:32




[100]	valid_0's auc: 0.949636
[200]	valid_0's auc: 0.959506
[300]	valid_0's auc: 0.966123
[400]	valid_0's auc: 0.969341
[500]	valid_0's auc: 0.971794
Completed, Time elapsed: 160.56827569007874 seconds
ROC accuracy: 0.9719799395133591
-------------------------------------------------------
Fold 3 started..., Time: 2019-8-22 14:25:12




[100]	valid_0's auc: 0.949669
[200]	valid_0's auc: 0.960903
[300]	valid_0's auc: 0.967069
[400]	valid_0's auc: 0.970644
[500]	valid_0's auc: 0.972845
Completed, Time elapsed: 162.33351802825928 seconds
ROC accuracy: 0.9730111885371531
-------------------------------------------------------
Fold 4 started..., Time: 2019-8-22 14:27:55




[100]	valid_0's auc: 0.948662
[200]	valid_0's auc: 0.960105
[300]	valid_0's auc: 0.96645
[400]	valid_0's auc: 0.970197
[500]	valid_0's auc: 0.972545
Completed, Time elapsed: 159.25565457344055 seconds
ROC accuracy: 0.9728676764174629
-------------------------------------------------------
Done.


In [16]:
dir_name = 'submission'
lgb_submission.to_csv('./{directory}/sub_lightgbm_cpu_1.csv'.format(directory=dir_name), index=False)
lgb_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000611
1,3663550,0.000859
2,3663551,0.000356
3,3663552,0.001103
4,3663553,0.000514


# DONE

#### Kaggle submission score: 0.9421
##### Previous score: 0.9406, advanced 216 places on leaderboard