The binary target variable is calculated by observing an 18-month performance window after the latest credit card statement: if the customer does not pay the amount due within 120 days of their latest statement date, it is considered a default event.

The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories:

- D_* = Delinquency variables
- S_* = Spend variables
- P_* = Payment variables
- B_* = Balance variables
- R_* = Risk variables

**The task is to predict, for each customer_ID, the probability of a future payment default (target = 1).**

Note: the negative class has been subsampled at 5% for this dataset, and thus receives a 20x weighting in the scoring metric.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import gc
import glob
import tqdm
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import lightgbm as lgbm
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
# set the warning off
import warnings
warnings.filterwarnings("ignore")
import time
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import xgboost as xgb


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# kaggle utils
#import ../input/kaggle-utils/kaggle_utils.py as kaggle_utils
#../input/kaggle-utils
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

You can see that the data set consists of 16.39 GB of training data and 33.89 GB of test data.

In [2]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

## Some functions for kaggle utils & amex metric

In [3]:
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Return the competition score for a set of predictions.

    The score is the average of two parts, both computed with a weight of 20
    for every row whose label is 0 and a weight of 1 otherwise:

    * the share of all positive labels found among the highest-ranked rows
      whose cumulative weight stays within 4% of the total weight;
    * the weighted Gini of the prediction ordering divided by the weighted
      Gini of the ordering by the true labels.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of predicted scores, same length as ``y_true``.
    """

    def _ranked_pairs(sort_col):
        # Rows are (label, prediction); order them descending by the chosen column.
        pairs = np.array([y_true, y_pred]).T
        return pairs[pairs[:, sort_col].argsort()[::-1]]

    def _row_weights(pairs):
        return np.where(pairs[:, 0] == 0, 20, 1)

    def _weighted_gini(sort_col):
        pairs = _ranked_pairs(sort_col)
        w = _row_weights(pairs)
        random_share = np.cumsum(w / np.sum(w))
        weighted_pos = pairs[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * w)

    # Capture rate inside the top 4% (by weight) of the prediction ranking.
    by_pred = _ranked_pairs(1)
    w_pred = _row_weights(by_pred)
    weight_budget = int(0.04 * np.sum(w_pred))
    in_top = by_pred[np.cumsum(w_pred) <= weight_budget]
    top_four = np.sum(in_top[:, 0]) / np.sum(by_pred[:, 0])

    # Column 1 ranks by prediction, column 0 by the true label (the best possible ranking).
    return 0.5 * (_weighted_gini(1) / _weighted_gini(0) + top_four)

## About dataset: CSV, Parquet and Feather

- We can't load the data directly from CSV because of memory restrictions.
- Both CSV and Parquet formats are used to store data, but internally they could hardly be more different: CSV is row storage, while Parquet files organize the data in columns. In a nutshell, column-storage files are more lightweight, because adequate compression can be applied to each column.
- Parquet also provides low storage consumption.

- Feather is a fast, lightweight, and easy-to-use binary file format for storing data frames. It has a few specific design goals: Lightweight, minimal API: make pushing data frames in and out of memory as simple as possible. Language agnostic: Feather files are the same whether written by Python or R code.

In [3]:
# Read the statement-level data from Parquet.
# Train keeps the last two rows of every customer and test the last one, in file
# order (presumably chronological — TODO confirm); both end up indexed and sorted
# by customer_ID.
train_df = (
    pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
    .groupby('customer_ID')
    .tail(2)
    .set_index('customer_ID', drop=True)
    .sort_index()
)
train_labels = (
    pd.read_csv('../input/amex-default-prediction/train_labels.csv')
    .set_index('customer_ID', drop=True)
    .sort_index()
)
# Index-on-index join: attaches the target to every retained statement row.
train_df = train_df.merge(train_labels, left_index=True, right_index=True)
test_df = (
    pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

In [5]:
# Preview the first rows of the training frame.
train_df.head()

# **Some Stats & EDA**

In [4]:
# all_cols = train_df.columns
# #cat_cols
# non_use_cols = ['S_2','B_30','B_38','D_114','D_116','D_117','D_120','D_126','D_63','D_64','D_66','D_68', 'target']
# feature_cols = [col for col in all_cols if col not in non_use_cols]

In [18]:
# Split the columns into the five feature families by their name prefix.
feat_Delinquency, feat_Spend, feat_Payment, feat_Balance, feat_Risk = (
    [col for col in train_df.columns if col.startswith(prefix)]
    for prefix in ('D_', 'S_', 'P_', 'B_', 'R_')
)
labels = ['Delinquency', 'Spend', 'Payment', 'Balance', 'Risk']
values = [
    len(family)
    for family in (feat_Delinquency, feat_Spend, feat_Payment, feat_Balance, feat_Risk)
]
for family_name, n_features in zip(labels, values):
    print(f'Total number of {family_name} variables: {n_features}')

# Donut chart showing how the features are distributed over the families.
fig_1 = go.Figure(
    go.Pie(values=values, labels=labels, hole=0.6, hoverinfo='label+percent')
)
fig_1.update_traces(
    textfont_size=12,
    hoverinfo='label+percent',
    textinfo='label',
    showlegend=False,
    marker=dict(colors=["#70d6ff", "#ff9770"]),
    title=dict(text='Feature Distribution'),
)
fig_1.show()

# Basic Level Analysis

In [None]:
categorical_col = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
descrete_cols=['B_30', 'B_38', 'D_63', 'D_64', 'D_66', 'D_68',
          'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'target']
# All categorical columns are stored in categorical_col. Only names that are not
# already listed are appended, so the list stays free of duplicates (a plain
# extend() would repeat the eleven shared feature names).
_new_categorical = [col for col in descrete_cols if col not in categorical_col]
categorical_col.extend(_new_categorical)

In [9]:
# #EDA from
# #https://www.kaggle.com/code/duanchenliu/data-exploring-dl/notebook
target_col = 'target'
# Target distribution
count = train_df[target_col].value_counts()
print(count)
n_rows = train_df.shape[0]
# Look the two classes up by label: value_counts() orders by frequency, so
# positional access would mislabel the bars if defaults were the majority.
n_paid = count.get(0, 0)
n_default = count.get(1, 0)
paid_share = n_paid / n_rows
default_share = n_default / n_rows
print("percentage of not default --- >", paid_share)
print("percentage of default --->", default_share)
fig = go.Figure()
# Percent formatting avoids float artefacts such as '56.99999999999999%'.
fig.add_trace(go.Bar(x= ['Paid', "Default"],y=[n_paid, n_default],
                     marker_color = ['#9900cc','#ffff80'],
                     text = [f'{paid_share:.0%}', f'{default_share:.0%}']))
fig.update_layout(template = 'plotly_dark',
                  title = "target value distribution",
                  width = 500,
                  height = 500)

### correlation with target

In [16]:
#col = [c for c in data.columns if data[c].dtypes != 'object']
# Correlation of every numeric column with the target. Non-numeric columns
# (such as the S_2 statement date) are dropped explicitly, because recent pandas
# versions no longer skip them silently.
corr = train_df.select_dtypes(include='number').corrwith(train_df[target_col], axis=0)
# Percent formatting avoids float artefacts such as '56.99999999999999%'.
val = [f'{v:.0%}' for v in corr.values]

fig = go.Figure()
fig.add_trace(go.Bar(y=corr.index, x= corr.values,
                     orientation='h',
                     marker_color = '#9900cc',
                     text = val,
                     textposition = 'outside',
                     textfont_color = '#ffff80'))
fig.update_layout(template = 'plotly_dark',
                  title = "Correlation with Target",
                  width = 800,
                  height = 3000)
# The x-range is wider than [-1, 1] so the labels drawn outside the bars stay visible.
fig.update_xaxes(range=[-2,2])

# negative correlation top 5: P_2 -67%, B_2 -56%, B_18 -55%, B_33 -52%, D_62 -37% 
# postive correlation top 5: B_9 54%, D_55 54%, D_44 53%, D_61 53%, B_3 51%

In [None]:
# NAN_VALUE = -127
# train = train.fillna(NAN_VALUE) 
# print('shape of data:', train.shape)

# def process_and_feature_engineer(df):
#     # FEATURE ENGINEERING FROM 
#     # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
#     all_cols = [c for c in list(df.columns) if c not in ['customer_ID','S_2']]
#     cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
#     num_features = [col for col in all_cols if col not in cat_features]

#     test_num_agg = df.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
#     test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]

#     test_cat_agg = df.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
#     test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]

#     df = cudf.concat([test_num_agg, test_cat_agg], axis=1)
#     del test_num_agg, test_cat_agg
#     print('shape after engineering', df.shape )
    
#     return df

# train = process_and_feature_engineer(train)

~~## The code above stops the session because the data is too large, so we import the pre-aggregated dataset instead~~

In [None]:
# #aggregated data, min max avg count etc, here the target label is already added
# #link https://www.kaggle.com/datasets/huseyincot/amex-agg-data-pickle
# train = pd.read_pickle("../input/amex-agg-data-pickle/train_agg.pkl", compression="gzip")
# test = pd.read_pickle("../input/amex-agg-data-pickle/test_agg.pkl", compression="gzip")

## Missing Data

In [19]:
#Custom Color Palette 🎨
custom_colors = ["#ffd670","#70d6ff","#ff4d6d","#8338ec","#90cf8e"]
# NOTE(review): sns.set_palette presumably returns None, so customPalette would not hold a palette — confirm.
customPalette = sns.set_palette(sns.color_palette(custom_colors))
# Draw the palette swatches; the tick_params call right after acts on the current axes.
sns.palplot(sns.color_palette(custom_colors),size=1.2)
plt.tick_params(axis='both', labelsize=0, length = 0)
background_color = 'white'
# Fraction of missing values per column (rendered as a percentage by fmt below).
missing = pd.DataFrame(columns = ['% Missing values'],data = train_df.isnull().sum()/len(train_df))
fig = plt.figure(figsize = (20, 60),facecolor=background_color)
# A 1x2 grid is created, but only the left cell is used for the heatmap.
gs = fig.add_gridspec(1, 2)
gs.update(wspace = 0.5, hspace = 0.5)
ax0 = fig.add_subplot(gs[0, 0])
# Hide the frame around the heatmap.
for s in ["right", "top","bottom","left"]:
    ax0.spines[s].set_visible(False)
# One annotated row per column of train_df.
sns.heatmap(missing,cbar = False,annot = True,fmt =".2%", linewidths = 2,cmap = custom_colors,vmax = 1, ax = ax0)
plt.show()

In [25]:
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']


def _group_columns(prefix):
    """List the non-categorical columns starting with *prefix* or with 't'.

    The 't' prefix keeps the target column next to every feature family.
    """
    return [
        name for name in train_df.columns
        if name.startswith((prefix, 't')) and name not in cat_cols
    ]


# One column list and one sub-frame per feature family.
del_cols = _group_columns('D')
spd_cols = _group_columns('S')
pay_cols = _group_columns('P')
bal_cols = _group_columns('B')
ris_cols = _group_columns('R')

df_del = train_df[del_cols]
df_spd = train_df[spd_cols]
df_pay = train_df[pay_cols]
df_bal = train_df[bal_cols]
df_ris = train_df[ris_cols]

In [21]:
# Pairwise correlations within the Delinquency frame. The matrix is symmetric,
# so the upper triangle (diagonal included) is masked out.
corr = df_del.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
plt.figure(figsize=(11, 11))
heatmap_ax = sns.heatmap(
    corr,
    mask=mask,
    robust=True,
    center=0,
    square=True,
    linewidths=.6,
    cmap=custom_colors,
)
heatmap_ax.set_title('Correlation of Delinquency Variables')
plt.show()

In [23]:
# #https://www.kaggle.com/code/devsubhash/amex-eda-default-prediction
# fig, axes = plt.subplots(8, 3, figsize = (16,18))
# fig.suptitle('Distribution of Spend Variables', fontsize = 15, x = 0.5, y = 1)
# for i, ax in enumerate(axes.reshape(-1)):
#     if i < len(spd_cols) - 1:
#         sns.kdeplot(x = spd_cols[i], hue ='target', data = df_spd, fill = True, ax = ax, palette =["#e63946","#8338ec"])
#         ax.tick_params()
#         ax.xaxis.get_label()
#         ax.set_ylabel('')
# plt.tight_layout()
# plt.show()

In [27]:
plt.figure(figsize = (11,11))
# df_spd also contains the S_2 column, which is not numeric; restrict the frame
# to numeric columns explicitly, since recent pandas versions raise instead of
# silently skipping such columns in corr().
corr = df_spd.select_dtypes(include='number').corr()
# Mask the upper triangle (diagonal included) of the symmetric matrix.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask = mask, robust = True, center = 0,square = True, linewidths = .6, cmap = custom_colors)
plt.title('Correlation of Spend Variables')
plt.show()

In [29]:
# Pairwise correlations within the Payment frame, lower triangle only.
corr = df_pay.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
plt.figure(figsize=(6, 6))
heatmap_ax = sns.heatmap(
    corr,
    mask=mask,
    robust=True,
    center=0,
    square=True,
    linewidths=.6,
    cmap=custom_colors,
)
heatmap_ax.set_title('Correlation of Payment Variables')
plt.show()

In [30]:
# Pairwise correlations within the Balance frame, lower triangle only.
corr = df_bal.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
plt.figure(figsize=(11, 11))
heatmap_ax = sns.heatmap(
    corr,
    mask=mask,
    robust=True,
    center=0,
    square=True,
    linewidths=.6,
    cmap=custom_colors,
)
heatmap_ax.set_title('Correlation of Balance Variables')
plt.show()

In [31]:
# Pairwise correlations within the Risk frame, lower triangle only.
corr = df_ris.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
plt.figure(figsize=(11, 11))
heatmap_ax = sns.heatmap(
    corr,
    mask=mask,
    robust=True,
    center=0,
    square=True,
    linewidths=.6,
    cmap=custom_colors,
)
heatmap_ax.set_title('Correlation of Risk Variables')
plt.show()

There are several strong correlations with the target variable. Payment 2 is the most negatively correlated with the probability of defaulting with a correlation of -0.67, while Delinquency 48 is the most positively correlated overall at 0.61. Delinquency 87 is also missing from the correlations above due to the proportion of null values. In fact, 24 of the top 30 features with missing values are in Delinquency variables.

In [32]:
# Remove rows that hold extreme values (beyond 200 standard deviations of a column's mean).
# feature_cols is built here so that the cell also works in a fresh top-to-bottom
# run; its only other live definition sits further down, in the CatBoost section.
non_use_cols = ['S_2','B_30','B_38','D_114','D_116','D_117','D_120','D_126','D_63','D_64','D_66','D_68', 'target']
feature_cols = [col for col in train_df.columns if col not in non_use_cols]

outlier_list = []
outlier_col = []
for col in feature_cols:
    column = train_df[col]
    # Compute the column statistics once instead of once per comparison.
    col_mean = column.mean()
    col_std = column.std()
    is_extreme = (column > col_mean + col_std * 200) | (column < col_mean - col_std * 200)
    n_extreme = int(is_extreme.sum())
    # Only columns with one to five extreme rows contribute rows to drop.
    if 0 < n_extreme < 6:
        outlier_list.extend(train_df.index[is_extreme].to_list())
        outlier_col.append(col)
# De-duplicate the index labels, then drop every row carrying one of them.
outlier_list = list(set(outlier_list))
train_df.drop(outlier_list, inplace = True)

In [33]:
# Feature matrix and target vector for the LightGBM model.
x = train_df.loc[:, feature_cols]
y = train_df.loc[:, 'target'].copy()

# Modelling

## 1. LGBM: Acc as per AMEX Metric - 78.6%

In [34]:
#Cross validation (KFold = 4)
kf = KFold(n_splits = 4)
models = []
lgbm_params ={"objective":"binary",
              "random_seed":1234}

for train_index, val_index in kf.split(x):
    X_train = x.iloc[train_index]
    X_valid = x.iloc[val_index]
    Y_train = y.iloc[train_index]
    Y_valid = y.iloc[val_index]
    
    lgbm_train = lgbm.Dataset(X_train, Y_train)
    lgbm_eval = lgbm.Dataset(X_valid, Y_valid, reference=lgbm_train)
    
    # Early stopping and periodic logging are configured through callbacks:
    # the early_stopping_rounds / verbose_eval keyword arguments are no longer
    # accepted by lightgbm.train in LightGBM 4.x.
    model_lgbm = lgbm.train(lgbm_params,
                           lgbm_train,
                           valid_sets = [lgbm_eval],
                           num_boost_round = 300,
                           callbacks = [lgbm.early_stopping(stopping_rounds = 20),
                                        lgbm.log_evaluation(period = 10)],
                           )
    y_pred = model_lgbm.predict(X_valid, num_iteration = model_lgbm.best_iteration)
        
    # Plain accuracy on the held-out fold, with probabilities rounded to 0/1.
    print (accuracy_score(Y_valid, np.round(y_pred)))
    
    # Keep every fold model; their test predictions are averaged later.
    models.append(model_lgbm)

In [35]:
#test_df = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet').groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()
# Keep only the columns the models were trained on, in the same order.
test_df = test_df.loc[:, feature_cols]

In [36]:
# Run a garbage-collection pass before predicting on the test set.
gc.collect()

In [37]:
# Predict the test set with every fold model and average the results.
preds = [model.predict(test_df) for model in models]

preds_array = np.array(preds)
preds_mean = np.mean(preds_array, axis =0)
#Submission
sub = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')
# Look each prediction up by customer_ID instead of assigning by row position,
# so a differing row order between test_df and the sample file cannot silently
# attach predictions to the wrong customers (unmatched IDs show up as NaN).
pred_by_customer = pd.Series(preds_mean, index=test_df.index)
sub["prediction"] = sub["customer_ID"].map(pred_by_customer)
sub.to_csv('submission_lgbm.csv', index=False)
sub.head()

In [38]:
# Run a garbage-collection pass before the next model section reloads the data.
gc.collect()

## 2. CatBoostClassifier: Acc as per AMEX Metric - 78.6%

In [5]:
# Reload train and test from Feather, keeping the last row of each customer
# (in file order) and indexing the frames by customer_ID.
train_df = (
    pd.read_feather('../input/amexfeather/train_data.ftr')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)

test_df = (
    pd.read_feather('../input/amexfeather/test_data.ftr')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)

df_subm = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")

all_cols = train_df.columns
# The date column, the categorical columns and the target are not used as plain features.
non_use_cols = ['S_2','B_30','B_38','D_114','D_116','D_117','D_120','D_126','D_63','D_64','D_66','D_68', 'target']
feature_cols = [name for name in all_cols if name not in non_use_cols]

In [6]:
# Drop rows with extreme values: more than 200 standard deviations away from
# the column mean, in columns where between one and five rows are that extreme.
outlier_list = []
outlier_col = []
for col in feature_cols:
    series = train_df[col]
    mean_value, std_value = series.mean(), series.std()
    too_high = series > mean_value + std_value * 200
    too_low = series < mean_value - std_value * 200
    flagged = train_df.index[too_high | too_low]
    if len(flagged) in range(1, 6):
        outlier_list.extend(flagged.to_list())
        outlier_col.append(col)
outlier_list = list(set(outlier_list))
train_df.drop(outlier_list, inplace=True)

In [7]:
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# Integer-encode every categorical column: the encoder is fitted on the train
# values and the same mapping is then applied to the test values.
lab_enc = LabelEncoder()
for cat_feat in cat_cols:
    lab_enc.fit(train_df[cat_feat])
    train_df[cat_feat] = lab_enc.transform(train_df[cat_feat])
    test_df[cat_feat] = lab_enc.transform(test_df[cat_feat])

# Target vector and feature matrix (every column except the target).
y = train_df['target']
X = train_df.drop(columns='target')

# Hold out 20% of the customers to monitor training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = CatBoostClassifier(
    iterations=3001,
    random_state=42,
    nan_mode='Min',
    task_type="GPU",
)
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    cat_features=cat_cols,
    verbose=100,
)
# Positive-class probabilities on the hold-out split.
preds = clf.predict_proba(X_test)[:, 1]


In [8]:
# Positive-class probability for every test customer.
y_preds = clf.predict_proba(test_df)[:, 1]
# Look each prediction up by customer_ID instead of assigning by row position,
# so a differing row order between test_df and the sample file cannot silently
# attach predictions to the wrong customers (unmatched IDs show up as NaN).
pred_by_customer = pd.Series(y_preds, index=test_df.index)
df_subm["prediction"] = df_subm["customer_ID"].map(pred_by_customer)
df_subm.to_csv('submission_catb.csv', index=False)
df_subm

## 3. Logistic Regression: didn't work because it doesn't accept NaN values

In [None]:
# Reload the raw Feather files and the sample submission.
train_df = pd.read_feather('../input/amexfeather/train_data.ftr')
test_df = pd.read_feather('../input/amexfeather/test_data.ftr')
df_subm = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")

gc.collect()

# Keep the last row of each customer (in file order), indexed by customer_ID.
train_df = train_df.groupby('customer_ID').tail(1).set_index('customer_ID')
test_df = test_df.groupby('customer_ID').tail(1).set_index('customer_ID')

# The statement date is removed from the test frame.
test_df.drop(columns='S_2', inplace=True)
gc.collect()

In [None]:
# Human-readable names for every train column except the last one:
# the prefix letter is expanded to the family name, e.g. 'D_39' -> 'Delinquency 39'.
titles=['Delinquency '+str(i).split('_')[1] if i.startswith('D') else 'Spend '+str(i).split('_')[1] 
        if i.startswith('S') else 'Payment '+str(i).split('_')[1]  if i.startswith('P') 
        else 'Balance '+str(i).split('_')[1] if i.startswith('B') else 
        'Risk '+str(i).split('_')[1] for i in train_df.columns[:-1]]
# NOTE(review): this renamed cat_cols list is not used anywhere below in this cell.
cat_cols=['Balance 30', 'Balance 38', 'Delinquency 63', 'Delinquency 64', 'Delinquency 66', 'Delinquency 68',
          'Delinquency 114', 'Delinquency 116', 'Delinquency 117', 'Delinquency 120', 'Delinquency 126', 'Target']
# The first title is skipped for the test frame — presumably it belongs to S_2,
# which was removed from test_df earlier; verify the column order.
test_df.columns=titles[1:]
titles.append('Target')
train_df.columns=titles

# The model itself is built from a fresh read of the full train file
# (every statement row, not only the last one per customer).
df = pd.read_feather('../input/amexfeather/train_data.ftr')
df_cat = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126','D_63','D_64', 'D_66', 'D_68'] 

# Working column list: everything except the ID, the date and D_142.
df_all = list(df.columns)
df_all.remove("customer_ID")
df_all.remove("S_2")
df_all.remove("D_142")

#finding set of numerical features by cosnducting simple set operations
df_num = list(set(df_all) - set(df_cat))
df = df[df_all]
perc = 20.0 # Like N %
# Minimum number of non-missing values a column needs in order to be kept
# (just above 80% of the rows).
min_count =  int(((100-perc)/100)*df.shape[0] + 1)
df = df.dropna( axis=1, 
                thresh=min_count)
# Then drop every row that still contains a missing value.
df=df.dropna()
df=df.reset_index()
df=df.drop("index",axis=1)

# Recompute the column lists for the columns that survived the filtering.
df_all = list(df.columns)

# At this point df_all still contains 'target', so it lands in df_num.
df_num = list(set(df_all) - set(df_cat))

# Categorical columns that are still present in df.
df_cat = list(set(df_all) - set(df_num))
df_all=list(df.columns)
df_all.remove("target")
# One-hot encode the categorical columns, dropping the first level of each.
df_encoded = pd.get_dummies( df[df_all], 
                                        columns = df_cat,
                                        drop_first = True )
X = df_encoded
Y = df['target']
# 70/30 train / hold-out split.
train_X, test_X, train_y, test_y = train_test_split( X,
                                                    Y,
                                                    test_size = 0.3,
                                                    random_state = 42 )
# Standardise using statistics fitted on the training part only.
sc = StandardScaler()
train_X = sc.fit_transform(train_X)
test_X = sc.transform(test_X)
logit = LogisticRegression()
logit.fit( train_X, train_y)
# Class labels (not probabilities) for the hold-out part.
pred_y = logit.predict(test_X)

In [None]:
# Confusion matrix and accuracy of the logistic regression on the hold-out split.
cm = confusion_matrix(test_y,pred_y)
print(cm)
accuracy_score(test_y, pred_y)

gc.collect()

# NOTE(review): test_df is passed in as-is here; in the visible code it never goes
# through the column filtering, one-hot encoding or scaling applied to train_X,
# so this call looks unlikely to work as written — confirm before relying on it.
y_preds = logit.predict_proba(test_df)[:, 1]
df_subm["prediction"] = y_preds
df_subm.to_csv('submission_LR.csv', index=False)
df_subm

## 4th XGB: Mixed Feature - Acc as per AMEX Metric - 79.4%

"Mixed features" means that for every numeric column we add its mean, standard deviation, min, max and last value per customer, because defaulters can be identified using these aggregated features, as the quantile analysis shows.

Why aggregated columns are needed as features:
https://www.kaggle.com/code/narendra/timeseries-analysis-with-quantiles

In [3]:
# Report the installed cuDF version, then define the run-wide settings for the XGBoost section.
print('RAPIDS version',cudf.__version__)
#https://www.kaggle.com/code/sietseschrder/xgboost-starter-0-793
# VERSION NAME FOR SAVED MODEL FILES
VER = 1
# TRAIN RANDOM SEED
SEED = 42
# FILL NAN VALUE
NAN_VALUE = -127 # will fit in int8
# FOLDS PER MODEL
FOLDS = 5
def read_file(path = '', usecols = None):
    """Load a statement-level parquet file with cuDF and normalise it.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list of str, optional
        Columns to read; all columns when omitted. The selection must
        include ``customer_ID`` and ``S_2``, which are always processed.

    Returns
    -------
    cudf.DataFrame
        Rows sorted by customer and statement date with a fresh 0..n-1 index,
        ``customer_ID`` reduced to an int64 taken from its last 16 hex
        characters, ``S_2`` converted to datetime and missing values replaced
        by ``NAN_VALUE``.
    """
    # LOAD DATAFRAME
    if usecols is not None:
        df = cudf.read_parquet(path, columns=usecols)
    else:
        df = cudf.read_parquet(path)
    # REDUCE DTYPE FOR CUSTOMER AND DATE
    df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    df.S_2 = cudf.to_datetime( df.S_2 )
    # SORT BY CUSTOMER AND DATE (so agg('last') works correctly)
    df = df.sort_values(['customer_ID', 'S_2'])
    df = df.reset_index(drop=True)
    # FILL NAN with the module-wide sentinel instead of a repeated literal
    df = df.fillna(NAN_VALUE)
    return df
# Load the complete training file through read_file.
print('Reading train data...')
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = read_file(path = TRAIN_PATH)

def process_and_feature_engineer(df):
    """Aggregate statement rows into one feature row per customer.

    Numeric columns are summarised with mean, std, min, max and last;
    the categorical columns with count, last and nunique. The result is
    indexed by ``customer_ID`` and its columns are named ``<column>_<stat>``.
    """
    # FEATURE ENGINEERING FROM 
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    id_and_date = ['customer_ID', 'S_2']
    num_features = [
        name for name in list(df.columns)
        if name not in id_and_date and name not in cat_features
    ]

    def _flatten(columns):
        # Turn ('P_2', 'mean') style column tuples into 'P_2_mean'.
        return ['_'.join(parts) for parts in columns]

    numeric_agg = df.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    numeric_agg.columns = _flatten(numeric_agg.columns)

    categorical_agg = df.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    categorical_agg.columns = _flatten(categorical_agg.columns)

    # Numeric aggregates first, categorical aggregates after them.
    features = cudf.concat([numeric_agg, categorical_agg], axis=1)
    del numeric_agg, categorical_agg
    print('shape after engineering', features.shape )

    return features

train = process_and_feature_engineer(train)

# ADD TARGETS
# The label file gets the same int64 customer key as the features before joining.
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
targets = targets.set_index('customer_ID')
train = train.merge(
    targets,
    left_index=True,
    right_index=True,
    how='left',
)
train['target'] = train['target'].astype('int8')
del targets

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
train = train.sort_index()
train = train.reset_index()

# FEATURES
# Everything between the first column (customer_ID) and the last one (target).
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')

In [4]:
# NEEDED WITH DeviceQuantileDMatrix BELOW
class IterLoadForDMatrix(xgb.core.DataIter):
    """Data iterator that hands a DataFrame to XGBoost in row batches.

    Each batch is a slice of at most ``batch_size`` rows, converted to a
    cuDF DataFrame and passed on as feature columns plus label column.
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.df = df
        self.features = features
        self.target = target
        self.batch_size = batch_size
        # Number of batches needed to cover every row of df.
        self.batches = int(np.ceil(len(df) / self.batch_size))
        # Index of the batch that next() will deliver.
        self.it = 0
        super().__init__()

    def reset(self):
        '''Rewind to the first batch.'''
        self.it = 0

    def next(self, input_data):
        '''Pass the next batch to input_data; return 1, or 0 once all batches were delivered.'''
        if self.it == self.batches:
            return 0

        start = self.it * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        batch = cudf.DataFrame(self.df.iloc[start:stop])
        input_data(data=batch[self.features], label=batch[self.target]) #, weight=dt['weight'])
        self.it += 1
        return 1
    
def xgboost_amex_metric_mod(predt: np.ndarray, dtrain: xgb.DMatrix):
    """Custom XGBoost metric: ``('AMEXcustom', 1 - amex_metric_mod(labels, predt))``.

    The score is returned as ``1 - metric``, so smaller values are better.
    """
    score = amex_metric_mod(dtrain.get_label(), predt)
    return 'AMEXcustom', 1 - score
# LOAD XGB LIBRARY

print('XGB Version',xgb.__version__)

# XGB MODEL PARAMETERS
# Shallow trees with a small learning rate, row and column subsampling, a
# logistic objective, and GPU settings for both training and prediction.
xgb_parms = { 
    'max_depth':4, 
    'learning_rate':0.03, 
    'subsample':0.8,
    'colsample_bytree':0.6, 
    'objective':'binary:logistic',
    'tree_method':'gpu_hist',
    'predictor':'gpu_predictor',
    'random_state':SEED
}

In [5]:
# K-fold training of the XGBoost model: one model per fold is trained, saved to
# disk and scored, and its out-of-fold predictions are collected.
importances = []
oof = []
train = train.to_pandas() # free GPU memory
TRAIN_SUBSAMPLE = 1.0
gc.collect()
# Per-fold metric values; they are used later to weight the fold models at inference.
fold_model_scores = []

# NOTE: despite the name skf this is a plain shuffled KFold, not a stratified one.
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.target )):
    
    # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
    if TRAIN_SUBSAMPLE<1.0:
        np.random.seed(SEED)
        train_idx = np.random.choice(train_idx, 
                       int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)
        np.random.seed(None)
    
    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)
    
    # TRAIN, VALID, TEST FOR FOLD K
    # The training rows are streamed in batches through the iterator; the
    # validation rows are passed as ordinary frames.
    Xy_train = IterLoadForDMatrix(train.loc[train_idx], FEATURES, 'target')
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, 'target']
    
    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)
    
    # TRAIN MODEL FOLD K
    # The custom metric reports 1 - AMEX score for both evaluation sets.
    model = xgb.train(xgb_parms, 
                dtrain=dtrain,
                evals=[(dtrain,'train'),(dvalid,'valid')],
                custom_metric=xgboost_amex_metric_mod,
                num_boost_round=9999,
                early_stopping_rounds=1000,
                verbose_eval=100) 
    model.save_model(f'XGB_v{VER}_fold{fold}.xgb')
    
    # GET FEATURE IMPORTANCE FOR FOLD K
    dd = model.get_score(importance_type='weight')
    df = pd.DataFrame({'feature':dd.keys(),f'importance_{fold}':dd.values()})
    importances.append(df)
            
    # INFER OOF FOLD K
    oof_preds = model.predict(dvalid)
    # acc holds the competition metric of this fold (not a classification accuracy).
    acc = amex_metric_mod(y_valid.values, oof_preds)
    fold_model_scores.append(acc)
    print('Kaggle Metric =',acc,'\n')
    
    # SAVE OOF
    df = train.loc[valid_idx, ['customer_ID','target'] ].copy()
    df['oof_pred'] = oof_preds
    print(df.head())
    oof.append( df )
    
    # Release the per-fold objects before the next fold starts.
    del dtrain, Xy_train, dd, df
    del X_valid, y_valid, dvalid, model
    _ = gc.collect()
    
print('#'*25)
# Stack the out-of-fold predictions of all folds, indexed by customer_ID.
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')

print(oof.head())
print(len(oof))
# Metric over the combined out-of-fold predictions.
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
print('OVERALL CV Kaggle Metric =',acc)

print(len(oof))

# CLEAN RAM
del train
_ = gc.collect()

In [6]:
# Attach the original customer_ID strings to the out-of-fold predictions and save them.
oof_xgb = pd.read_parquet(TRAIN_PATH, columns=['customer_ID']).drop_duplicates()
# Same key as used for the features: the last 16 hex characters read as an integer.
oof_xgb['customer_ID_hash'] = (
    oof_xgb['customer_ID']
    .apply(lambda customer_id: int(customer_id[-16:], 16))
    .astype('int64')
)
oof_xgb = (
    oof_xgb.set_index('customer_ID_hash')
    .merge(oof, left_index=True, right_index=True)
    .sort_index()
    .reset_index(drop=True)
)
oof_xgb.to_csv(f'oof_xgb_v{VER}.csv',index=False)
oof_xgb.head()

# CLEAR VRAM, RAM FOR INFERENCE BELOW
del oof_xgb, oof
_ = gc.collect()

# CALCULATE SIZE OF EACH SEPARATE TEST PART
def get_rows(customers, test, NUM_PARTS = 4, verbose = ''):
    """Count the rows of ``test`` that belong to each part of ``customers``.

    ``customers`` is cut into ``NUM_PARTS`` consecutive parts of
    ``len(customers) // NUM_PARTS`` entries each; the last part also takes
    whatever remains. When ``verbose`` is a non-empty string it is used as
    the data name in a short printed report.

    Returns
    -------
    (rows, chunk) : the list of row counts per part and the regular part size.
    """
    chunk = len(customers) // NUM_PARTS
    report = verbose != ''
    if report:
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')

    rows = []
    for part in range(NUM_PARTS):
        first = part * chunk
        # The final part runs to the end of the customer list.
        last = None if part == NUM_PARTS - 1 else first + chunk
        part_customers = customers[first:last]
        in_part = test.customer_ID.isin(part_customers)
        rows.append(test.loc[in_part].shape[0])

    if report:
        print( rows )
    return rows,chunk

# COMPUTE SIZE OF 4 PARTS FOR TEST DATA
NUM_PARTS = 4
TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'

print(f'Reading test data...')
# Only the ID and date columns are read here; this pass just sizes the parts.
test = read_file(path = TEST_PATH, usecols = ['customer_ID','S_2'])
# Unique customer keys as a flat array.
customers = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
# rows: number of test rows per part; num_cust: customers per regular part.
rows,num_cust = get_rows(customers, test[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'test')


# INFER TEST DATA IN PARTS
# skip_rows / skip_cust track how many rows and customers earlier parts consumed.
skip_rows = 0
skip_cust = 0
test_preds = []

for k in range(NUM_PARTS):
    
    # READ PART OF TEST DATA
    # The whole file is read again on every pass and then sliced down to the
    # rows of part k.
    print(f'\nReading test data...')
    test = read_file(path = TEST_PATH)
    test = test.iloc[skip_rows:skip_rows+rows[k]]
    skip_rows += rows[k]
    print(f'=> Test part {k+1} has shape', test.shape )
    
    # PROCESS AND FEATURE ENGINEER PART OF TEST DATA
    test = process_and_feature_engineer(test)
    # Put the aggregated rows in the order of the customers array
    # (the last part takes all remaining customers).
    if k==NUM_PARTS-1: test = test.loc[customers[skip_cust:]]
    else: test = test.loc[customers[skip_cust:skip_cust+num_cust]]
    skip_cust += num_cust
    
    # TEST DATA FOR XGB
    X_test = test[FEATURES]
    dtest = xgb.DMatrix(data=X_test)
    test = test[['P_2_mean']] # reduce memory
    del X_test
    gc.collect()

    # INFER XGB MODELS ON TEST DATA
    # Weighted average of the fold models: each model's weight is its fold
    # score divided by the sum of all fold scores.
    model = xgb.Booster()
    model.load_model(f'XGB_v{VER}_fold0.xgb')
    preds = model.predict(dtest) * (fold_model_scores[0] / sum(fold_model_scores))
    for f in range(1,FOLDS):
        model.load_model(f'XGB_v{VER}_fold{f}.xgb')
        preds += model.predict(dtest) * (fold_model_scores[f] / sum(fold_model_scores))
    
    test_preds.append(preds)

    # CLEAN MEMORY
    del dtest, model
    _ = gc.collect()
# WRITE SUBMISSION FILE
# Join the per-part predictions and index them by the customer keys.
test_preds = np.concatenate(test_preds)   
test = cudf.DataFrame(index=customers,data={'prediction':test_preds})

In [7]:
# Build the submission: take the customer IDs from the sample file, derive the
# same int64 key that indexes the predictions, and join the predictions onto them.
sub = cudf.read_csv('../input/amex-default-prediction/sample_submission.csv')[['customer_ID']]
sub['customer_ID_hash'] = sub['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
sub = sub.set_index('customer_ID_hash')
sub = sub.merge(test[['prediction']], left_index=True, right_index=True, how='left')
sub = sub.reset_index(drop=True)

# WRITE SUBMISSION FILE
# The file name carries VER, like the saved fold models and the OOF file.
sub.to_csv(f'submission_xgb_v{VER}.csv',index=False)