In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input mount.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%capture
!pip install evaluate

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import random
import gc
import evaluate
from datatable import dt, f, ifelse, update, mean, by

# Seed both Python's and NumPy's global random generators so that any code
# drawing from them implicitly is repeatable. Estimators below that accept a
# seed are additionally given random_state=42 explicitly.
random.seed(42)
np.random.seed(42)

In [None]:
# Load the "kaggle/amex" metric module; the model cells below score their
# predictions through its compute() method.
amex_metric = evaluate.load(path="kaggle/amex")

In [None]:
# Read the full training CSV into a datatable Frame with fread.
# The commented-out lines are disabled alternatives that loaded the data
# (or a 50,000-row sample of it) with pandas instead.
# df = pd.read_parquet('/kaggle/input/amex-parquet/train_data.parquet')
# df = pd.read_csv('/kaggle/input/amex-default-prediction/train_data.csv', nrows=50000)
# y = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv')
train_dt = dt.fread('/kaggle/input/amex-default-prediction/train_data.csv')
# test_features = pd.read_parquet('/kaggle/input/amex-parquet/test_data.parquet')

In [None]:
# Read the training labels CSV into a datatable Frame.
y = dt.fread('/kaggle/input/amex-default-prediction/train_labels.csv')

In [None]:
# Remove the customer_ID column from the labels frame so only the remaining
# column(s) are handed to the models as targets.
del y[:, 'customer_ID']

In [None]:
# Drop three columns from the raw training frame in a single delete.
del train_dt[:, ['S_2', 'D_63', 'D_64']]

In [None]:
# Replace missing values with 0 in every feature column, in place.
# customer_ID is the grouping key for the aggregations that follow, not a
# feature, so it is skipped rather than being mixed with the integer 0.
# `== None` is intentional: it builds a datatable "is NA" expression.
train_dt[:, update(**{key: ifelse(f[key] == None, 0, f[key])
                      for key in train_dt.names
                      if key != 'customer_ID'})]

In [None]:
# Collapse the raw rows into one row per customer_ID. Each frame below holds
# the group key plus one aggregate of the remaining columns.
train_dt_mean = train_dt[:, dt.mean(f[:]), dt.by('customer_ID')]
train_dt_std = train_dt[:, dt.sd(f[:]), dt.by('customer_ID')]
train_dt_max = train_dt[:, dt.max(f[:]), dt.by('customer_ID')]
train_dt_min = train_dt[:, dt.min(f[:]), dt.by('customer_ID')]
# NOTE(review): this takes last() of the per-group row count, so the frame
# carries a single count column rather than the last value of each feature.
# dt.last(f[:]) may have been intended — confirm, and change the matching
# test-set cell at the same time so both feature sets stay identical.
train_dt_last = train_dt[:, dt.last(dt.count()), dt.by('customer_ID')]

In [None]:
# The group key is not a model feature: strip it from every aggregate frame.
for agg_frame in (train_dt_mean, train_dt_std, train_dt_max, train_dt_min, train_dt_last):
    del agg_frame['customer_ID']
del agg_frame  # do not keep an extra reference to the last frame alive

In [None]:
# Prefix every column with the name of its aggregate so the five frames can
# be placed side by side without clashing column names.
for prefix, agg_frame in (('mean_', train_dt_mean),
                          ('sd_', train_dt_std),
                          ('max_', train_dt_max),
                          ('min_', train_dt_min),
                          ('last_', train_dt_last)):
    agg_frame.names = [prefix + column for column in agg_frame.names]
del prefix, agg_frame  # loop helpers only; drop the extra frame reference

In [None]:
# Join the aggregate frames column-wise. From here on train_dt refers to the
# per-customer feature table rather than the raw rows.
train_dt = dt.cbind([train_dt_mean, train_dt_std, train_dt_max, train_dt_min, train_dt_last])

In [None]:
# Preview the first rows of the combined per-customer feature frame.
train_dt.head()

In [None]:
# The separate aggregate frames have been combined into train_dt; drop their
# names and run a garbage-collection pass.
del train_dt_mean, train_dt_std, train_dt_max, train_dt_min, train_dt_last
gc.collect()

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# repeatable. Features and labels are paired purely by row position.
# NOTE(review): assumes the per-customer rows of train_dt are in the same
# order as the rows of y — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    train_dt.to_pandas(),
    y.to_numpy(),
    random_state=42,
    test_size=0.2,
)

In [None]:
# k-nearest-neighbours baseline (k=17), scored on the held-out split.
# - NaNs are replaced with 0 before fitting/predicting because scikit-learn's
#   KNN rejects missing values. NOTE(review): the sd_ aggregates are
#   presumably NA for customers with a single row — confirm.
# - y_train is flattened to 1-D, as the LightGBM cells already do, which
#   avoids scikit-learn's column-vector warning.
# - The metric receives the predicted probability of class 1 instead of hard
#   0/1 labels, so the model's ranking of customers is preserved.
#   NOTE(review): the AMEX competition metric is understood to be rank-based;
#   confirm the loaded metric module accepts float scores.
knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(X_train.fillna(0), y_train.ravel())
y_pred = knn.predict_proba(X_test.fillna(0))[:, 1]
amex_metric.compute(references=y_test, predictions=y_pred)

In [None]:
# Inspect the shape of the training labels array.
y_train.shape

In [None]:
# Support-vector baseline (gamma='auto'), scored on the held-out split.
# - NaNs are replaced with 0 because SVC rejects missing values.
#   NOTE(review): the sd_ aggregates are presumably NA for customers with a
#   single row — confirm.
# - y_train is flattened to 1-D, consistent with the LightGBM cells.
# - decision_function scores are passed to the metric instead of hard 0/1
#   labels; they are signed distances, not probabilities, but preserve the
#   model's ranking. NOTE(review): confirm the metric module accepts
#   unbounded float scores.
svc = SVC(gamma='auto')
svc.fit(X_train.fillna(0), y_train.ravel())
y_pred = svc.decision_function(X_test.fillna(0))
amex_metric.compute(references=y_test, predictions=y_pred)

In [None]:
# XGBoost baseline with 22 trees and a fixed seed, scored on the held-out
# split. The metric receives the predicted probability of class 1 rather than
# hard 0/1 labels, so the model's ranking of customers is preserved.
# NOTE(review): confirm the loaded metric module accepts float scores.
# y_train is flattened to 1-D, consistent with the LightGBM cells.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', base_score=0.5,
                    n_estimators=22, random_state=42)
xgb.fit(X_train, y_train.ravel())
y_pred = xgb.predict_proba(X_test)[:, 1]
amex_metric.compute(references=y_test, predictions=y_pred)

In [None]:
# Persist the fitted XGBoost model as model.json (path relative to the
# current directory).
xgb.save_model(fname='model.json')

In [None]:
# LightGBM baseline using DART boosting, scored on the held-out split.
# The metric receives the predicted probability of class 1 rather than hard
# 0/1 labels, so the model's ranking of customers is preserved.
# NOTE(review): confirm the loaded metric module accepts float scores.
lgbm = LGBMClassifier(boosting_type='dart')
lgbm.fit(X_train, y_train.ravel())
y_pred = lgbm.predict_proba(X_test)[:, 1]
amex_metric.compute(references=y_test, predictions=y_pred)

In [None]:
# Write the fitted LightGBM booster to lgb-model.txt (path relative to the
# current directory).
lgbm.booster_.save_model(filename='lgb-model.txt')

In [None]:
import os
from IPython.display import FileLink

# Switch to the Kaggle working directory, then render a download link for the
# saved LightGBM model as this cell's output.
os.chdir('/kaggle/working')
FileLink('lgb-model.txt')

In [None]:
# Candidate hyper-parameter values for the randomized search over
# LGBMClassifier set up in the next cell.
# NOTE(review): LightGBM's documentation indicates row subsampling only takes
# effect when a non-zero subsample_freq is also set — confirm that
# 'subsample' is actually being tuned here.
parameters = dict(
    learning_rate=[0.1, 0.15],
    max_depth=[9, 10, 11],
    min_child_weight=[1, 2, 3],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    n_estimators=[150, 200, 250],
)

In [None]:
# Randomized hyper-parameter search over the DART LightGBM classifier with
# 4-fold cross-validation and a fixed seed.
# Candidates are ranked by ROC AUC instead of the estimator's default score
# (accuracy), so selection rewards how well customers are ranked rather than
# how many hard labels match.
lgbm = LGBMClassifier(boosting_type='dart')
lgbm_grid = RandomizedSearchCV(lgbm,
                               parameters,
                               scoring='roc_auc',
                               cv=4,
                               verbose=10,
                               random_state=42)

In [None]:
# Run the search on the training split; labels are flattened to 1-D first.
lgbm_grid.fit(X_train, y_train.reshape(-1))

In [None]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(lgbm_grid.best_score_, lgbm_grid.best_params_, sep='\n')

In [None]:
# Score the best estimator found by the search on the held-out split.
# Predictions are recomputed from `model` here; reusing the y_pred left over
# from an earlier cell would score the wrong model.
# The metric receives the predicted probability of class 1.
# NOTE(review): confirm the loaded metric module accepts float scores.
model = lgbm_grid.best_estimator_
y_pred = model.predict_proba(X_test)[:, 1]
amex_metric.compute(references=y_test, predictions=y_pred)

In [None]:
# Refit LightGBM with the best search parameters on ALL training rows; this
# is the model used for the submission.
# The score below is only a sanity check: X_test was split off from train_dt,
# so its rows are part of the data fitted here and the value is not a
# held-out estimate.
# The metric receives the predicted probability of class 1.
# NOTE(review): confirm the loaded metric module accepts float scores.
lgbm = LGBMClassifier(boosting_type='dart', **lgbm_grid.best_params_)
lgbm.fit(train_dt.to_pandas(), y.to_numpy().ravel())
y_pred = lgbm.predict_proba(X_test)[:, 1]
amex_metric.compute(references=y_test, predictions=y_pred)

# Submission

In [None]:
# Drop the training features and both halves of the split before the test
# data is loaded.
del train_dt, X_train, X_test, y_train, y_test

In [None]:
# Read the full test CSV into a datatable Frame with fread.
test_dt = dt.fread('/kaggle/input/amex-default-prediction/test_data.csv')
# Disabled alternative that loaded a parquet copy with pandas:
# test_features = pd.read_parquet('/kaggle/input/amex-parquet/test_data.parquet')

In [None]:
# Drop the same three columns that were removed from the training frame, in
# a single delete.
del test_dt[:, ['S_2', 'D_63', 'D_64']]

In [None]:
# Replace missing values with 0 in every feature column, in place.
# customer_ID is the grouping key for the aggregations that follow, not a
# feature, so it is skipped rather than being mixed with the integer 0.
# `== None` is intentional: it builds a datatable "is NA" expression.
test_dt[:, update(**{key: ifelse(f[key] == None, 0, f[key])
                     for key in test_dt.names
                     if key != 'customer_ID'})]

In [None]:
# Run a garbage-collection pass; the cell output is gc.collect()'s return value.
# Disabled pandas-based NA fill kept from an earlier version:
# test_features.fillna(0, inplace=True)
gc.collect()

In [None]:
# Inspect the (rows, columns) shape of the raw test frame before aggregation.
test_dt.shape

In [None]:
# Collapse the raw test rows into one row per customer_ID, mirroring the
# aggregates built for the training data. Each frame holds the group key plus
# one aggregate of the remaining columns.
test_dt_mean = test_dt[:, dt.mean(f[:]), dt.by('customer_ID')]
test_dt_std = test_dt[:, dt.sd(f[:]), dt.by('customer_ID')]
test_dt_max = test_dt[:, dt.max(f[:]), dt.by('customer_ID')]
test_dt_min = test_dt[:, dt.min(f[:]), dt.by('customer_ID')]
# NOTE(review): this takes last() of the per-group row count, so the frame
# carries a single count column rather than the last value of each feature.
# dt.last(f[:]) may have been intended — confirm, and change the matching
# training-set cell at the same time so both feature sets stay identical.
test_dt_last = test_dt[:, dt.last(dt.count()), dt.by('customer_ID')]

In [None]:
# The group key is not a model feature: strip it from every aggregate frame.
for agg_frame in (test_dt_mean, test_dt_std, test_dt_max, test_dt_min, test_dt_last):
    del agg_frame['customer_ID']
del agg_frame  # do not keep an extra reference to the last frame alive

In [None]:
# Prefix every column with the name of its aggregate, matching the naming
# applied to the training features.
for prefix, agg_frame in (('mean_', test_dt_mean),
                          ('sd_', test_dt_std),
                          ('max_', test_dt_max),
                          ('min_', test_dt_min),
                          ('last_', test_dt_last)):
    agg_frame.names = [prefix + column for column in agg_frame.names]
del prefix, agg_frame  # loop helpers only; drop the extra frame reference

In [None]:
# Join the aggregate frames column-wise. From here on test_dt refers to the
# per-customer feature table rather than the raw rows.
test_dt = dt.cbind([test_dt_mean, test_dt_std, test_dt_max, test_dt_min, test_dt_last])

In [None]:
# Preview the first rows of the combined per-customer test features.
test_dt.head()

In [None]:
# Convert the datatable Frame to a pandas DataFrame; test_dt is rebound, so
# from here on it is a pandas object.
test_dt = test_dt.to_pandas()

In [None]:
# Run a garbage-collection pass after the conversion to pandas.
gc.collect()

In [None]:
# Make sure no customer_ID column reaches the model. The key was already
# removed from each aggregate frame and every remaining column was prefixed,
# so an unconditional `del` raises KeyError here; only delete it if present.
if 'customer_ID' in test_dt.columns:
    del test_dt['customer_ID']

In [None]:
# Predict class probabilities for the test customers with the refitted
# LightGBM model; the result is 2-D and column 1 is what the submission uses.
y_pred = lgbm.predict_proba(test_dt)

In [None]:
# Read the sample submission file to use as the template for the output.
submission = dt.fread('/kaggle/input/amex-default-prediction/sample_submission.csv')

In [None]:
# Overwrite the prediction column with column 1 of y_pred, assigned by row
# position.
# NOTE(review): assumes the rows of the sample submission are in the same
# customer order as the aggregated test features — confirm.
submission['prediction'] = y_pred[:, 1]

In [None]:
# Write the submission frame to submission_2.csv (path relative to the
# current directory).
submission.to_csv('submission_2.csv')