Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%capture
!pip install evaluate

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
import random
import gc
import evaluate
from datatable import dt, f, ifelse, update, mean, by
from sklearn.preprocessing import OneHotEncoder
import pickle

# Seed both Python's `random` module and NumPy's global RNG so that any code
# falling back on either global generator is repeatable across runs.
random.seed(42)
np.random.seed(42)

In [None]:
# Load the metric published as "kaggle/amex" through the `evaluate` package.
# It is used below via `amex_metric.compute(references=..., predictions=...)`.
amex_metric = evaluate.load("kaggle/amex")

Load Data

In [None]:
# Read the training features and the training labels into datatable Frames.
train_dt = dt.fread('/kaggle/input/amex-default-prediction/train_data.csv')
# NOTE(review): labels are later paired with the per-customer feature rows by
# position only — confirm both end up in the same customer order.
y = dt.fread('/kaggle/input/amex-default-prediction/train_labels.csv')

## Feature Engineering

Delete unused and unhelpful columns

In [None]:
# Drop the ID column from the labels frame so only the remaining column(s) are
# passed to the models. After this, `y` can no longer be joined by customer.
del y['customer_ID']

In [None]:
# Remove 'S_2' up front so the imputation, scaling and aggregation cells below
# never see it.
del train_dt['S_2']

**Imputation**: Replace missing values with 0

In [None]:
# Build one "missing -> 0, otherwise keep" expression per column, then apply
# them all to `train_dt` in a single in-place update.
zero_filled_train = {
    col: ifelse(f[col] == None, 0, f[col])  # noqa: E711 - datatable NA test
    for col in train_dt.names
}
train_dt[:, update(**zero_filled_train)]

Multiply the numerical columns by 100 (**Scaling**) and take the `floor` of the result (**Discretization**). This reduces noise in the data.

In [None]:
# Columns left untouched by scaling: the customer key and the two columns that
# are one-hot encoded below.
unscaled_train_cols = ['customer_ID', 'D_63', 'D_64']
# Every other column becomes floor(value * 100); applied in place.
scaled_train = {
    col: f[col] if col in unscaled_train_cols else dt.math.floor(f[col] * 100)
    for col in train_dt.names
}
train_dt[:, update(**scaled_train)]

**Categorical Encoding**: One-hot encode categorical columns.

**Feature Splitting**: The categorical columns are split into a column for each category

In [None]:
def _make_dense_one_hot_encoder():
    """Return a OneHotEncoder that produces dense arrays on old and new scikit-learn.

    scikit-learn renamed the `sparse` constructor argument to `sparse_output`
    and later removed the old name, so try the new spelling first and fall back
    to the original one when the installed release rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn: only the `sparse` keyword exists.
        return OneHotEncoder(sparse=False)


# One encoder per categorical column, fitted on the training rows.
ohe_d_63 = _make_dense_one_hot_encoder()
ohe_d_64 = _make_dense_one_hot_encoder()
ohed_d_63 = ohe_d_63.fit_transform(train_dt['D_63'])
ohed_d_64 = ohe_d_64.fit_transform(train_dt['D_64'])

Save the fitted encoders so the same encoding can be applied to the test set

In [None]:
# Pickle each fitted encoder to its own file so the test pipeline can reload it.
for encoder_path, encoder in (('ohe_d_63.pkl', ohe_d_63), ('ohe_d_64.pkl', ohe_d_64)):
    with open(encoder_path, 'wb') as ohe_f:
        pickle.dump(encoder, ohe_f)

Concatenate the transformed columns with the original dataset

In [None]:
# Wrap the encoded arrays in datatable Frames and name each column
# "<source column>_<category>" using the categories learned by the encoder.
d_63_categories = ohe_d_63.categories_[0].tolist()
d_64_categories = ohe_d_64.categories_[0].tolist()

ohe_1 = dt.Frame(ohed_d_63.astype(float))
ohe_1.names = ['D_63_' + category for category in d_63_categories]

ohe_2 = dt.Frame(ohed_d_64.astype(float))
ohe_2.names = ['D_64_' + category for category in d_64_categories]

In [None]:
# Replace the raw categorical columns with their one-hot encoded counterparts.
for raw_categorical in ('D_63', 'D_64'):
    del train_dt[raw_categorical]
train_dt = dt.cbind(train_dt, ohe_1, ohe_2)

In [None]:
# Preview the first rows after the one-hot columns were appended.
train_dt.head()

Aggregate by `customer_ID` and calculate the `mean`, `standard deviation`, `maximum`, `minimum`, and `last value` of every column. (**Variable Transformation**, **Feature Splitting**, **Creating Features**)

In [None]:
# Group the rows by 'customer_ID' and compute one aggregate frame per statistic
# (mean, standard deviation, max, min, last) over the selected columns.
# NOTE(review): the labels in `y` are matched to these grouped rows by position
# only — verify that the grouped output order matches the label file order.
train_dt_mean = train_dt[:, mean(f[:]), by('customer_ID')]
train_dt_std = train_dt[:, dt.sd(f[:]), by('customer_ID')]
train_dt_max = train_dt[:, dt.max(f[:]), by('customer_ID')]
train_dt_min = train_dt[:, dt.min(f[:]), by('customer_ID')]
# "last" takes the final row of each group in the frame's current row order.
train_dt_last = train_dt[:, dt.last(f[:]), by('customer_ID')]

In [None]:
# The group key is not a feature: strip it from every aggregate frame.
for agg_frame in (train_dt_mean, train_dt_std, train_dt_max, train_dt_min, train_dt_last):
    del agg_frame['customer_ID']
# Drop the loop alias so it does not keep the last frame alive.
del agg_frame

In [None]:
# Prefix every column with the statistic it holds so the names stay unique
# once the five frames are combined.
for stat_prefix, agg_frame in (
    ('mean_', train_dt_mean),
    ('sd_', train_dt_std),
    ('max_', train_dt_max),
    ('min_', train_dt_min),
    ('last_', train_dt_last),
):
    agg_frame.names = [stat_prefix + col for col in agg_frame.names]
# Drop the loop alias so it does not keep the last frame alive.
del agg_frame

In [None]:
# Combine the five aggregate frames column-wise; this replaces the row-level
# `train_dt` with the aggregated feature table.
train_dt = dt.cbind(train_dt_mean, train_dt_std, train_dt_max, train_dt_min, train_dt_last)

In [None]:
# Preview the first rows of the aggregated feature table.
train_dt.head()

In [None]:
# The intermediate aggregate frames are now part of `train_dt`; release them.
del train_dt_mean, train_dt_std, train_dt_max, train_dt_min, train_dt_last
gc.collect()

## Training

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
features_df = train_dt.to_pandas()
labels = y.to_numpy().ravel()
X_train, X_test, y_train, y_test = train_test_split(
    features_df, labels, test_size=0.20, random_state=42
)
# Release the temporaries so the full pandas copy is not kept around.
del features_df, labels

In [None]:
# Baseline: XGBoost with default hyper-parameters.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
# Score with the second `predict_proba` column, the same quantity that is
# written to the submission. Hard 0/1 labels from `predict()` discard the
# ranking information a rank-based metric relies on.
y_pred = xgb.predict_proba(X_test)[:, 1]
amex_metric.compute(references=y_test, predictions=y_pred)
# 0.572003102741842 (recorded earlier, when hard labels from predict() were scored)

In [None]:
# Persist the fitted XGBoost model as 'model-3.json' in the current working directory.
xgb.save_model('model-3.json')

In [None]:
# LightGBM (DART boosting) with a hand-picked parameter set.
# NOTE(review): LightGBM documents that bagging is only enabled when
# `subsample_freq` is non-zero — confirm whether `subsample` has any effect here.
params = {'subsample': 0.6, 'n_estimators': 800, 'min_child_weight': 1, 'max_depth': 11, 'learning_rate': 0.15, 'colsample_bytree': 0.9}
lgbm = LGBMClassifier(boosting_type='dart', **params)
lgbm.fit(X_train, y_train)
# Score with the second `predict_proba` column, the same quantity that is
# written to the submission, rather than hard 0/1 labels from `predict()`.
y_pred = lgbm.predict_proba(X_test)[:, 1]
amex_metric.compute(references=y_test, predictions=y_pred)
# 0.5838643216051032 (recorded earlier, when hard labels from predict() were scored)

In [None]:
# Write the fitted model's underlying booster to 'lgb-model-2.txt' in the
# current working directory.
lgbm.booster_.save_model('lgb-model-2.txt')

In [None]:
import os
# Switch to /kaggle/working so the relative path given to FileLink resolves there.
# NOTE(review): the model file was saved relative to the previous working
# directory — confirm that was already /kaggle/working.
os.chdir(r'/kaggle/working')

from IPython.display import FileLink

# The FileLink object is the cell's output, rendering a link to the saved file.
FileLink('lgb-model-2.txt')

In [None]:
# Candidate values sampled by the randomized search below. Keys with a
# single-element list are effectively fixed.
parameters = {'learning_rate': [0.13, 0.15, 0.17],
              'max_depth': [11],
              'min_child_weight': [1],
              'subsample': [0.4, 0.5, 0.6],
              'colsample_bytree': [0.8, 0.9],
              'n_estimators': [800]}
# NOTE(review): presumably the best combination from an earlier search run — confirm.
# {'subsample': 0.6, 'n_estimators': 800, 'min_child_weight': 1, 
# 'max_depth': 11, 'learning_rate': 0.15, 'colsample_bytree': 0.9}

In [None]:
# Randomized search over `parameters`: 5 sampled settings, each evaluated with
# 2-fold cross-validation, using a fixed seed for the sampling.
lgbm = LGBMClassifier(boosting_type='dart')
search_options = dict(cv=2, n_iter=5, verbose=10, random_state=42)
lgbm_grid = RandomizedSearchCV(lgbm, parameters, **search_options)

In [None]:
# Run the search. The extra keyword arguments are forwarded to
# `LGBMClassifier.fit` for every candidate. Per-iteration evaluation logging is
# silenced with the `log_evaluation` callback, because the `verbose=` fit
# argument is deprecated and rejected by newer LightGBM releases.
lgbm_grid.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[log_evaluation(period=0)],
)

In [None]:
# Report the best cross-validated score and the parameters that produced it.
print(lgbm_grid.best_score_, lgbm_grid.best_params_, sep='\n')

In [None]:
# Evaluate the refitted best estimator from the search on the hold-out split.
model = lgbm_grid.best_estimator_
# Score with the second `predict_proba` column, the same quantity that is
# written to the submission, rather than hard 0/1 labels from `predict()`.
y_pred = model.predict_proba(X_test)[:, 1]
amex_metric.compute(references=y_test, predictions=y_pred)
# 0.5821566925893039 (recorded earlier, when hard labels from predict() were scored)

In [None]:
# Final model: refit on ALL training rows with the chosen parameters.
params = {'subsample': 0.4, 'n_estimators': 800, 'min_child_weight': 1, 'max_depth': 11, 'learning_rate': 0.13, 'colsample_bytree': 0.8}
lgbm = LGBMClassifier(boosting_type='dart', **params)
lgbm.fit(train_dt.to_pandas(), y.to_numpy().ravel())
# CAUTION: `X_test` is a subset of the data this model was just fitted on, so
# the score below is an in-sample sanity check, not a generalisation estimate.
# As in the submission, the second `predict_proba` column is scored instead of
# hard 0/1 labels from `predict()`.
y_pred = lgbm.predict_proba(X_test)[:, 1]
amex_metric.compute(references=y_test, predictions=y_pred)
# 0.638191640789435 (recorded earlier, in-sample and with hard labels from predict())

In [None]:
import pickle

# Serialize the fitted classifier so it can be reloaded for the submission step.
with open('lgbm_model.pkl', 'wb') as model_file:
    pickle.dump(lgbm, model_file)

In [None]:
import os
# Switch to /kaggle/working so the relative path given to FileLink resolves there.
os.chdir(r'/kaggle/working')

from IPython.display import FileLink

# The FileLink object is the cell's output, rendering a link to the pickled model.
FileLink('lgbm_model.pkl')

## Submission

In [None]:
# Training data is no longer needed; release it before loading the test set.
del train_dt, X_train, X_test, y_train, y_test
gc.collect()

In [None]:
import pickle
# Reload the classifier from 'lgbm_model.pkl' into `model`, which is used for
# the test predictions below.
# NOTE: unpickling can execute arbitrary code — only load files this notebook wrote.
with open('lgbm_model.pkl', 'rb') as file:
    model = pickle.load(file)

In [None]:
# Read the competition test features into a datatable Frame.
test_dt = dt.fread('/kaggle/input/amex-default-prediction/test_data.csv')
# Disabled alternatives: caching the frame as 'test_data.jay', or loading a
# parquet copy with pandas.
# test_dt.to_jay('test_data.jay')
# test_features = pd.read_parquet('/kaggle/input/amex-parquet/test_data.parquet')

In [None]:
# Use the cached binary copy only when it exists. The `to_jay` call that would
# create it is commented out in the previous cell, so on a fresh run the file is
# absent and the frame already read from the CSV is kept.
if os.path.exists('test_data.jay'):
    test_dt = dt.fread('test_data.jay')

In [None]:
# Preview the first rows of the raw test frame.
test_dt.head()

In [None]:
# Replace missing values with 0, mirroring the training imputation.
# 'S_2' is skipped: the training pipeline deletes it before imputing, and here
# it is dropped a few cells later without being used, so filling it is wasted
# work and forces an integer fill value onto a non-feature column.
zero_filled_test = {
    key: ifelse(f[key] == None, 0, f[key])  # noqa: E711 - datatable NA test
    for key in test_dt.names
    if key != 'S_2'
}
test_dt[:, update(**zero_filled_test)]

In [None]:
# Columns left untouched by scaling: the customer key, 'S_2' (dropped later),
# and the two columns that are one-hot encoded below.
unscaled_test_cols = ['customer_ID', 'S_2', 'D_63', 'D_64']
# Every other column becomes floor(value * 100); applied in place.
scaled_test = {
    col: f[col] if col in unscaled_test_cols else dt.math.floor(f[col] * 100)
    for col in test_dt.names
}
test_dt[:, update(**scaled_test)]

In [None]:
# Reload the encoders fitted on the training data so the test columns are
# encoded with the same categories.
# NOTE: unpickling can execute arbitrary code — only load files this notebook wrote.
with open('ohe_d_63.pkl', 'rb') as ohe_f:
    ohe_d_63 = pickle.load(ohe_f)
with open('ohe_d_64.pkl', 'rb') as ohe_f:
    ohe_d_64 = pickle.load(ohe_f)

In [None]:
# Encode the test categoricals with the already-fitted encoders (transform only).
ohed_d_63 = ohe_d_63.transform(test_dt['D_63'])
ohed_d_64 = ohe_d_64.transform(test_dt['D_64'])

# Wrap the arrays in Frames named "<source column>_<category>".
ohe_1 = dt.Frame(ohed_d_63.astype(float))
ohe_1.names = ['D_63_' + category for category in ohe_d_63.categories_[0].tolist()]
ohe_2 = dt.Frame(ohed_d_64.astype(float))
ohe_2.names = ['D_64_' + category for category in ohe_d_64.categories_[0].tolist()]

# The raw encoded arrays are no longer needed.
del ohed_d_63, ohed_d_64

In [None]:
# Drop 'S_2' and the raw categorical columns, then append the one-hot columns.
for dropped_col in ('S_2', 'D_63', 'D_64'):
    del test_dt[dropped_col]
test_dt = dt.cbind(test_dt, ohe_1, ohe_2)

In [None]:
# Disabled line; presumably left over from a pandas-based variant of this step.
# test_features.fillna(0, inplace=True)
# Ask the garbage collector to reclaim memory before the aggregation step.
gc.collect()

In [None]:
# Group the rows by 'customer_ID' and compute one aggregate frame per statistic
# (mean, standard deviation, max, min, last), mirroring the training pipeline.
# NOTE(review): predictions are later written into the sample submission by
# position — verify the grouped output order matches that file's row order.
test_dt_mean = test_dt[:, mean(f[:]), by('customer_ID')]
test_dt_std = test_dt[:, dt.sd(f[:]), by('customer_ID')]
test_dt_max = test_dt[:, dt.max(f[:]), by('customer_ID')]
test_dt_min = test_dt[:, dt.min(f[:]), by('customer_ID')]
# "last" takes the final row of each group in the frame's current row order.
test_dt_last = test_dt[:, dt.last(f[:]), by('customer_ID')]

In [None]:
# The group key is not a feature: strip it from every aggregate frame.
for agg_frame in (test_dt_mean, test_dt_std, test_dt_max, test_dt_min, test_dt_last):
    del agg_frame['customer_ID']
# Drop the loop alias so it does not keep the last frame alive.
del agg_frame
gc.collect()

In [None]:
# Prefix every column with the statistic it holds, using the same prefixes as
# the training features.
for stat_prefix, agg_frame in (
    ('mean_', test_dt_mean),
    ('sd_', test_dt_std),
    ('max_', test_dt_max),
    ('min_', test_dt_min),
    ('last_', test_dt_last),
):
    agg_frame.names = [stat_prefix + col for col in agg_frame.names]
# Drop the loop alias so it does not keep the last frame alive.
del agg_frame

In [None]:
# Combine the five aggregate frames column-wise, in the same order used for the
# training features; this replaces the row-level `test_dt`.
test_dt = dt.cbind(test_dt_mean, test_dt_std, test_dt_max, test_dt_min, test_dt_last)

In [None]:
# Preview the first rows of the aggregated test feature table.
test_dt.head()

In [None]:
# The intermediate aggregate frames are now part of `test_dt`; release them.
del test_dt_mean, test_dt_std, test_dt_max, test_dt_min, test_dt_last
gc.collect()

In [None]:
# Save the preprocessed test features to disk in datatable's .jay format.
test_dt.to_jay('test_dt_preproc.jay')

In [None]:
# Read the preprocessed features back from the file written in the previous cell.
test_dt = dt.fread('test_dt_preproc.jay')

In [None]:
# Convert to a pandas DataFrame, the same container type the model was fitted on.
# From here on `test_dt` is a pandas object, not a datatable Frame.
test_dt = test_dt.to_pandas()

In [None]:
# Display the number of feature columns as a quick check against the training features.
len(test_dt.columns)

In [None]:
# Ask the garbage collector to reclaim memory before predicting.
gc.collect()

In [None]:
# Per-class probabilities for every test row; column 1 is used for the submission.
y_pred = model.predict_proba(test_dt)

In [None]:
# Load the sample submission file to use as the template for the output.
submission = dt.fread('/kaggle/input/amex-default-prediction/sample_submission.csv')

In [None]:
# Write the second `predict_proba` column into the 'prediction' column.
# NOTE(review): values are assigned by row position — this assumes the sample
# submission lists customers in the same order as the grouped test features; verify.
submission['prediction'] = y_pred[:, 1]

In [None]:
# Write the submission file to the current working directory.
submission.to_csv('submission_4.csv')