In [None]:
!sudo -H pip3 install lightgbm --install-option=--gpu

In [40]:
import os
import gc
import sys
sys.path.append("../..")
import time

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
import lightgbm as lgb

from Other.utils.ReduceMem import read_and_reduce, merge_and_reduce

In [2]:
def check_and_drop_column(df, column):
    if column in df.columns:
        df.drop(columns=[column], axis=1, inplace=True)

In [3]:
def print_columns(df):
    s = ''
    for col in df.columns:
        s += '"%s", ' % col
    print(s[:-2])

# Preprocessing

In [4]:
train = read_and_reduce('./merged_data/train.csv')
test = read_and_reduce('./merged_data/test.csv')

Mem. usage decreased to 596.41 Mb (64.4% reduction)
Mem. usage decreased to 518.98 Mb (63.8% reduction)


In [5]:
print(train.shape)
print(test.shape)

(590540, 372)
(506691, 371)


# LightGBM

In [19]:
!nvidia-smi

Thu Aug 22 14:55:39 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.00    Driver Version: 418.87.00    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    72W / 149W |   1006MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [6]:
y_train = train['isFraud'].copy()

In [7]:
X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

In [8]:
# Fill numeric columns' NaN values with mean
for c in X_train.columns:
    if X_train[c].dtype=='float16' or  X_train[c].dtype=='float32' or  X_train[c].dtype=='float64':
        X_train[c].fillna(X_train[c].mean())
        X_test[c].fillna(X_train[c].mean())

In [9]:
# Fill categorical columns' NaN values with placeholder (-999)
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

In [10]:
# Label-Encode categorical columns
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))

In [11]:
# Set up K-Fold
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True)

print(folds)

KFold(n_splits=5, random_state=None, shuffle=True)


In [12]:
# Initialize submission DataFrame
sample_submission = pd.read_csv('../../../Data/sample_submission.csv')
lgb_submission = sample_submission.copy()
lgb_submission['isFraud'] = 0


# Set timezone
os.environ['TZ'] = 'Asia/Seoul'
time.tzset()

In [43]:
np.arange(400, 300, 2000)

array([], dtype=int64)

In [50]:
# GridSearch
gridParams = {
    'learning_rate': np.arange(0.01, 0.06, 0.01),
    'n_estimators': np.arange(400, 2000, 300),
    'num_leaves': [2**2, 2**3, 2**4, 2**5, 2**6, 2**7, 2**8],
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'colsample_by_tree': np.arange(0.6, 0.9, 0.05),
    'subsample': np.arange(0.6, 0.9, 0.05),
    'reg_alpha': np.arange(0.3, 1.1, 0.1),
    'reg_lambda': np.arange(0.2, 0.8, 0.05),
    'max_depth': np.arange(5, 11, 1)
}

lgbm_classifier = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    n_jobs=-1,
    silent=True,
    subsample_freq=1
)

print('-------------- default model params')
print(lgbm_classifier.get_params().keys())

# Print current iteration information
start = time.time()
now = time.localtime(start)
print('GridSearch started..., Time: {yy}-{mm}-{dd} {hh}:{MM}:{ss}'.format(yy=now.tm_year, mm=now.tm_mon, dd=now.tm_mday, hh=now.tm_hour, MM=now.tm_min, ss=now.tm_sec))

grid = GridSearchCV(lgbm_classifier, gridParams, verbose=0, cv=4, n_jobs=1)
grid.fit(X_train, y_train)

# Measure time elapsed
now = time.time()
time_elapsed = now - start
print('Completed, Time elapsed: {} seconds'.format(time_elapsed))

print('best params=======')
print(grid.best_params_)
print('best score========')
print(grid.best_score_)


-------------- default model params
dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq'])
GridSearch started..., Time: 2019-8-22 15:46:59




MemoryError: Unable to allocate array with shape (44, 442904) and data type float32

In [33]:
# If using GPU, set this option to True
gpu_enabled = False

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'n_estimators': 2000,
    'colsample_bytree': 0.85,
    'subsample_freq': 1,
    'subsample': 0.85,
    'max_bin': 255,
    'metric': 'auc',
    'n_jobs': -1,
    'num_leaves': 2**8,
    'max_depth': 10,
    'tree_learner': 'serial',
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'reg_alpha': 0.3,
    'reg_lambda': 0.243
}

if gpu_enabled:
    params.update({
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    })

for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):    
    # Print current iteration information
    start = time.time()
    now = time.localtime(start)
    print('Fold {} started..., Time: {yy}-{mm}-{dd} {hh}:{MM}:{ss}'.format(fold_n, yy=now.tm_year, mm=now.tm_mon, dd=now.tm_mday, hh=now.tm_hour, MM=now.tm_min, ss=now.tm_sec))
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    lgb_train = lgb.Dataset(X_train_, label=y_train_)
    lgb_test = lgb.Dataset(X_valid, label=y_valid)
    
    lgb_estimator = lgb.train(
        params,
        lgb_train,
        valid_sets = [lgb_test],
        verbose_eval=100,
        early_stopping_rounds=100
    )
    
    del X_train_, y_train_
    
    pred = lgb_estimator.predict(X_test)
    val = lgb_estimator.predict(X_valid)
    
    # Measure time elapsed
    now = time.time()
    time_elapsed = now - start
    print('Completed, Time elapsed: {} seconds'.format(time_elapsed))
    
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    
    del val, y_valid
    
    lgb_submission['isFraud'] = lgb_submission['isFraud'] + pred/n_fold
    
    del pred
    gc.collect()
    
    print('-------------------------------------------------------')
    
print('Done.')

Fold 0 started..., Time: 2019-8-22 15:12:44


LightGBMError: GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1

In [16]:
dir_name = 'submission'
lgb_submission.to_csv('./{directory}/sub_lightgbm_gpu_1.csv'.format(directory=dir_name), index=False)
lgb_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000611
1,3663550,0.000859
2,3663551,0.000356
3,3663552,0.001103
4,3663553,0.000514


# DONE

#### Kaggle submission score: 0.9421
##### Previous score: 0.9406, advanced 216 places on leaderboard