In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
import random
import gc

# Seed Python's built-in `random` module only; the estimators and splitters below
# receive their own explicit `random_state` arguments where they set one.
random.seed(42)

In [None]:
def amex_metric_np(preds, target):
    """Compute the AmEx evaluation metric, 0.5 * (G + D).

    Rows are ranked by descending ``preds``. Each row is weighted
    ``20 - 19 * target`` (20 for target 0, 1 for target 1).

    * ``D`` is the share of all positives found among the top-ranked rows whose
      cumulative normalized weight is at most 4%.
    * ``G`` is the weighted Gini of the ranking divided by ``gini_max``.

    Parameters
    ----------
    preds : array-like
        Scores; higher means "more likely positive". Must be given first.
    target : array-like
        Binary labels (0/1), same length and order as ``preds``.

    Returns
    -------
    float
        ``0.5 * (G + D)``.
    """
    # Coerce to plain ndarrays so all indexing below is positional. A pandas
    # Series would otherwise be indexed by label, and a list cannot be
    # fancy-indexed at all.
    preds = np.asarray(preds)
    target = np.asarray(target)

    # Only the labels need re-ordering; the sorted scores are never read again.
    order = np.argsort(preds)[::-1]
    target = target[order]

    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    # D: positives captured within the top 4% of cumulative weight.
    four_pct_mask = cum_norm_weight <= 0.04
    d = np.sum(target[four_pct_mask]) / np.sum(target)

    # G: weighted Gini, normalized by gini_max.
    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()
    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)
    g = gini / gini_max

    return 0.5 * (g + d)

In [None]:
# Load the training data from the parquet version of the competition dataset.
# NOTE(review): a 'target' column is read from the aggregated frame further down,
# while the labels CSV load below is commented out — presumably this parquet
# already carries the labels; confirm.
df = pd.read_parquet('/kaggle/input/amex-parquet/train_data.parquet')
# y = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv')

In [None]:
# Disabled experiment: derive month / year features from the S_2 column.
# X['S_2_1'] = pd.DatetimeIndex(X.S_2).month
# X['S_2_2'] = pd.DatetimeIndex(X.S_2).year
# S_2 itself is not used as a feature, so it is removed.
df = df.drop(columns=['S_2'])

In [None]:
# Show column dtypes and the real memory footprint ("deep" also measures object contents).
df.info(memory_usage = "deep")

In [None]:
# Shrink numeric columns to the smallest dtype that can hold their values.
# `downcast='float'` never goes below float32, so the columns worth converting
# are the float64 ones; float32 columns are already at the minimum.
for column in df:
    col_dtype = df[column].dtype
    if col_dtype == 'float64':
        df[column] = pd.to_numeric(df[column], downcast='float')
    elif col_dtype == 'int64':
        df[column] = pd.to_numeric(df[column], downcast='integer')

In [None]:
# Customer IDs to keep. Currently every distinct ID; the trailing comment shows the
# earlier 50,000-customer random subsample this replaced.
sample = df['customer_ID'].unique() # y.customer_ID.sample(n=50000, random_state=42)

In [None]:
# Keep only rows whose customer is in `sample`. Because `sample` holds every distinct
# ID, this selects all rows. NOTE(review): `df_1` is not referenced again in the
# visible notebook — the following cells keep working on `df`.
df_1 = df[df['customer_ID'].isin(sample)]

In [None]:
# X_1.drop(columns=['D_63', 'D_64'])
# Replace every missing value in `df` with 0 (returns a new frame, rebound to `df`).
df = df.fillna(0)

In [None]:
# Collapse the rows of each customer into a single row of per-column means.
# `numeric_only=True` makes the dropping of non-numeric columns explicit: older
# pandas discarded them silently, newer pandas raises a TypeError instead.
df_new = df.groupby(['customer_ID']).mean(numeric_only=True)

In [None]:
# Number of columns left after aggregation (customer_ID is now the index, not a column).
len(df_new.columns)

In [None]:
# Hold out 20% of the customers for evaluation. Features are every aggregated column
# except 'target'; the label is the 'target' column. The split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(df_new.drop(columns=['target']), 
                                                    df_new['target'],
                                                    test_size=0.20,
                                                    random_state=42)

In [None]:
# Baseline 1: 17-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(X_train, y_train)
# The metric ranks rows by score, so use the positive-class probability (second
# column of predict_proba) rather than hard 0/1 labels, which are mostly ties.
y_pred = knn.predict_proba(X_test)[:, 1]
# amex_metric_np expects (preds, target), in that order. The labels are passed as
# a plain array so indexing inside the metric is positional.
amex_metric_np(y_pred, y_test.to_numpy())

In [None]:
# Baseline 2: support-vector classifier with gamma='auto'.
svc = SVC(gamma='auto')
svc.fit(X_train, y_train)
# The metric ranks rows by score. SVC was not fitted with probability=True, so the
# signed distance to the decision boundary is used as the ranking score instead
# of hard 0/1 labels.
y_pred = svc.decision_function(X_test)
# amex_metric_np expects (preds, target), in that order. The labels are passed as
# a plain array so indexing inside the metric is positional.
amex_metric_np(y_pred, y_test.to_numpy())

In [None]:
# Baseline 3: small gradient-boosted model (22 trees), seeded for repeatability.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', base_score=0.5,
                      n_estimators=22, random_state=42)
xgb.fit(X_train, y_train)
# The metric ranks rows by score, so use the positive-class probability (second
# column of predict_proba) rather than hard 0/1 labels, which are mostly ties.
y_pred = xgb.predict_proba(X_test)[:, 1]
# amex_metric_np expects (preds, target), in that order. The labels are passed as
# a plain array so indexing inside the metric is positional.
amex_metric_np(y_pred, y_test.to_numpy())

In [None]:
# Persist the fitted XGBoost model as model.json, relative to the current working directory.
xgb.save_model('model.json')

In [None]:
# Switch to the Kaggle working directory and render a link to the saved model file
# so it can be downloaded from the notebook output.
import os
os.chdir(r'/kaggle/working')

from IPython.display import FileLink

# Last expression of the cell: the notebook displays it as a clickable link.
FileLink('model.json')

## XGBClassifier Hyperparameter Search (Randomized)

In [None]:
# Candidate values for each XGBClassifier hyperparameter explored by the search.
parameters = dict(
    nthread=[4],
    learning_rate=[0.1, 0.15],
    max_depth=[9, 10, 11],
    min_child_weight=[1, 2, 3],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    n_estimators=[150, 200, 250],
)

In [None]:
# Base estimator for the search, configured like the baseline XGBoost model above
# (explicit eval_metric and a fixed seed) so runs are comparable and repeatable.
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# Randomized search over `parameters` with 4-fold cross-validation. An exhaustive
# GridSearchCV(model, parameters, cv=3, n_jobs=1, verbose=2) was tried as an
# alternative and is left disabled.
# Candidates are compared by ROC AUC rather than the default accuracy: the final
# metric depends on how rows are ranked by predicted probability, which accuracy
# on hard labels does not measure.
xgb_grid = RandomizedSearchCV(model,
                              parameters,
                              scoring='roc_auc',
                              cv=4,
                              n_jobs=1,
                              verbose=2,
                              random_state=42)

In [None]:
# Run the cross-validated search on the training split (the long-running step).
xgb_grid.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score found and the parameter set that achieved it.
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

In [None]:
# Take the best estimator found by the search and score it on the hold-out split.
model = xgb_grid.best_estimator_
# amex_metric_np expects (preds, target), in that order, and ranks rows by score,
# so pass the positive-class probability rather than hard labels. The labels are
# passed as a plain array so indexing inside the metric is positional.
amex_metric_np(model.predict_proba(X_test)[:, 1], y_test.to_numpy())

# Submission

In [None]:
# Free the training-side data before loading the test set.
# `df_1` (a row-for-row copy of the raw frame) and `sample` are released too;
# neither is used by anything that follows.
del df, df_1, df_new, sample
del X_train, X_test, y_train, y_test
# Ask the collector to reclaim the memory now rather than at some later point.
gc.collect()

In [None]:
# Rebind `xgb` to a fresh classifier and load a stored model into it.
# NOTE(review): this reads from an attached input dataset, not from the model.json
# written to the working directory earlier — confirm the two are the same model.
xgb = XGBClassifier()
xgb.load_model('/kaggle/input/xgb-model/model.json')

In [None]:
from datatable import dt, f, ifelse, update, mean, by

In [None]:
# Read the test set CSV with datatable's fread (the pandas/parquet route is left disabled below).
test_dt = dt.fread('/kaggle/input/amex-default-prediction/test_data.csv')
# test_features = pd.read_parquet('/kaggle/input/amex-parquet/test_data.parquet')

In [None]:
# Remove columns that are not fed to the model: S_2 (also dropped on the training
# side) plus D_63 and D_64.
# NOTE(review): presumably D_63 / D_64 are non-numeric and therefore absent from the
# aggregated training features — confirm the remaining columns match what the model was fitted on.
del test_dt['S_2']
del test_dt['D_63']
del test_dt['D_64']
# test_features = test_features.drop(columns=['S_2'])

In [None]:
# For every column, replace missing values with 0 and keep other values unchanged;
# datatable's update() applies the change to `test_dt` itself.
# NOTE(review): the comprehension iterates over all of test_dt.names, which still
# includes customer_ID at this point — confirm ifelse() accepts mixing the integer 0
# with that (presumably string) column.
test_dt[:,update(**{key: ifelse(f[key]==None,
                              0, 
                              f[key]) 
    for key in test_dt.names})]
# Disabled pandas equivalent of the dtype clean-up:
# for column in test_features:
#     if test_features[column].dtype == 'float32':
#         test_features[column]=pd.to_numeric(test_features[column], downcast='float')
#     if test_features[column].dtype == 'int64':
#         test_features[column]=pd.to_numeric(test_features[column], downcast='integer')
#     if test_features[column].dtype == 'category':
#         test_features[column]=test_features[column].astype('object')

In [None]:
# test_features.fillna(0, inplace=True)
# Force a garbage-collection pass before the aggregation step.
gc.collect()

In [None]:
# Average the selected columns within each customer_ID group: one output row per customer.
test_dt = test_dt[:, mean(f[:]), by('customer_ID')]

In [None]:
# Convert the aggregated datatable Frame to a pandas DataFrame for prediction.
test_dt = test_dt.to_pandas()

In [None]:
# Force a garbage-collection pass after the conversion.
gc.collect()

In [None]:
# The identifier is not a model feature; remove it so only feature columns remain.
# After this, rows of `test_dt` are identified by position only.
del test_dt['customer_ID']

In [None]:
# Class probabilities for every aggregated test customer, from the loaded model.
# NOTE(review): assumes the columns of `test_dt` match the features the loaded model was fitted on — confirm.
y_pred = xgb.predict_proba(test_dt)

In [None]:
# Load the competition's sample submission as the template to fill in.
submission = dt.fread('/kaggle/input/amex-default-prediction/sample_submission.csv')

In [None]:
# Use the second probability column (presumably the positive class) as the prediction.
# NOTE(review): values are assigned by position; customer_ID was removed from the
# feature frame before predicting, so this assumes the aggregated rows come out in the
# same customer order as the sample submission — confirm.
submission['prediction'] = y_pred[:, 1]

In [None]:
# Write the filled-in submission to submission_1.csv, relative to the current working directory.
submission.to_csv('submission_1.csv')