<a href="https://www.kaggle.com/code/erwanchesneau/amex-model-averaging-xgboost-cateboost-0-794?scriptVersionId=104558227" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# LOAD LIBRARIES
import os
import gc
import pickle
import pandas as pd
import numpy as np # CPU libraries
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
try : 
    from pytorch_widedeep.preprocessing import TabPreprocessor
    from pytorch_widedeep.models import TabTransformer, WideDeep
except ModuleNotFoundError :
    !pip install pytorch_widedeep
    from pytorch_widedeep.preprocessing import TabPreprocessor
    from pytorch_widedeep.models import TabTransformer, WideDeep
print('RAPIDS version',cudf.__version__)

RAPIDS version 21.10.01


In [2]:
# VERSION NAME FOR SAVED MODEL FILES
VER = 2

# TRAIN RANDOM SEED
SEED = 42

# FILL NAN VALUE
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 10

# TRAIN DATA (parquet)
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'

# OUTPUT DIRECTORY FOR SAVED MODELS AND FEATURE LISTS
ODIR = "/kaggle/working/echesneau/"
# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable
os.makedirs(ODIR, exist_ok=True)

# FRACTION OF FOLD DATA REPORTED IN THE TRAINING LOG MESSAGES
TRAIN_SUBSAMPLE = 1.0



In [3]:
def read_file(path = '', usecols = None):
    """
    Load a parquet file into a cudf DataFrame and compact its key columns.

    Modified from the original version: missing values are NOT filled here,
    the fillna is done only during the processing.

    path    : location of the parquet file
    usecols : optional list of column names to read (None reads every column)

    Returns the loaded frame with `customer_ID` converted to int64 (built from
    the last 16 hex characters of the id) and `S_2` parsed as datetime.
    """
    # LOAD DATAFRAME (columns=None means "all columns", so one call covers both cases)
    frame = cudf.read_parquet(path, columns=usecols)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    id_tail = frame['customer_ID'].str[-16:]
    frame['customer_ID'] = id_tail.str.hex_to_int().astype('int64')
    frame['S_2'] = cudf.to_datetime(frame['S_2'])

    print('shape of data:', frame.shape)
    return frame

In [4]:
def process_and_feature_engineer(data):
    """
    Aggregate the rows of `data` into one row of features per customer.

    FEATURE ENGINEERING FROM
    https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    Numerical columns are aggregated with mean/std/min/max/last, categorical
    columns with count/last/nunique. Aggregated columns are named
    `<column>_<aggregation>`; numerical aggregates come first. Missing values
    in the result are replaced by NAN_VALUE.

    data : cudf DataFrame containing a `customer_ID` column; `customer_ID`
           and `S_2` are never used as features. Categorical columns that are
           absent from `data` (e.g. dropped beforehand) are simply skipped.

    Returns the aggregated cudf DataFrame (grouped by customer_ID).
    Raises ValueError when `data` holds no feature column at all.

    NOTE(review): 'last' depends on the row order inside each customer group;
    this presumably expects rows already sorted by date — confirm.
    """
    all_cols = [c for c in list(data.columns) if c not in ['customer_ID','S_2']]
    cat_candidates = ["B_30","B_38","D_114","D_116","D_117",
                      "D_120","D_126","D_63","D_64","D_66","D_68"]
    # Keep only the categorical columns really present: selecting a missing
    # column from the groupby would raise a KeyError
    cat_feat = [col for col in cat_candidates if col in all_cols]
    num_features = [col for col in all_cols if col not in cat_feat]

    def _aggregate(columns, funcs):
        # One groupby per column family, with flattened '<column>_<agg>' names
        agg = data.groupby("customer_ID")[columns].agg(funcs)
        agg.columns = ['_'.join(x) for x in agg.columns]
        return agg

    parts = []
    if num_features:
        parts.append(_aggregate(num_features, ['mean', 'std', 'min', 'max', 'last']))
    if cat_feat:
        parts.append(_aggregate(cat_feat, ['count', 'last', 'nunique']))
    if not parts:
        raise ValueError("no feature columns to aggregate besides customer_ID / S_2")

    features = cudf.concat(parts, axis=1)
    del parts  # release the intermediate aggregates before filling
    features = features.fillna(NAN_VALUE)
    print('shape after engineering', features.shape )
    return features

In [5]:
def amex_metric_mod(y_true, y_pred):
    """
    Competition metric: mean of the normalized weighted Gini and the share of
    positives captured in the top 4% (by weight) of the predictions.

    from https://www.kaggle.com/kyakovlev
    and https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534

    y_true : 1-D array-like of 0/1 targets
    y_pred : 1-D array-like of scores, same length as y_true
    Rows with target 0 carry a weight of 20, rows with target 1 a weight of 1.
    """
    def _ranked(sort_col):
        # (target, prediction) rows ordered by descending value of column `sort_col`
        pairs = np.array([y_true, y_pred]).T
        order = pairs[:, sort_col].argsort()[::-1]
        return pairs[order]

    def _row_weights(targets):
        return np.where(targets == 0, 20, 1)

    def _weighted_gini(sort_col):
        ranked = _ranked(sort_col)
        targets = ranked[:, 0]
        w = _row_weights(targets)
        weight_random = np.cumsum(w / np.sum(w))
        total_pos = np.sum(targets * w)
        lorentz = np.cumsum(targets * w) / total_pos
        return np.sum((lorentz - weight_random) * w)

    # Share of all positives found within the first 4% of cumulated weight
    by_pred = _ranked(1)
    w_pred = _row_weights(by_pred[:, 0])
    in_top = np.cumsum(w_pred) <= int(0.04 * np.sum(w_pred))
    top_four = np.sum(by_pred[in_top][:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the model ranking, normalized by the Gini of the perfect ranking
    gini_model = _weighted_gini(1)
    gini_ideal = _weighted_gini(0)

    return 0.5 * (gini_model / gini_ideal + top_four)

In [6]:
# Per-fold results: predictions and metric on the validation fold and on all rows
result_all = pd.DataFrame(
    columns=[
        'model', 'preprocessing', 'name',
        'y_valid_pred', 'y_pred',
        'valid_acc', 'acc',
    ]
)
# Per-model summary: out-of-fold predictions and overall CV metric
result_sum = pd.DataFrame(
    columns=['model', 'preprocessing', 'name', 'y_pred', 'acc']
)

# @huseyincot preprocessing

The preprocessing proposed by @huseyincot seems interesting and is one of the most widely used.
We therefore base the predictions on this preprocessing.

## Load and process

The Parquet format is used to save GPU/RAM memory.

In [7]:
# Load the full training table as a cudf (GPU) frame
print('Reading train data...')
train = read_file(path = TRAIN_PATH)

Reading train data...
shape of data: (5531451, 190)


In [8]:
# Aggregate to one row per customer_ID. `train` is rebound to the aggregated
# frame, so re-run the loading cell before re-running this one.
train = process_and_feature_engineer(train)

shape after engineering (458913, 918)


In [9]:
# Preview the first aggregated rows (the index is the int64 customer_ID)
train.head()

Unnamed: 0_level_0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223358381327749917,0.415868,0.057145,0.340178,0.498727,0.387708,2.615385,4.628507,0,16,0,...,1,13,2,1,13,-1,1,13,3,2
-9223193039457028513,0.974068,0.013094,0.964483,1.002478,1.001372,0.0,0.0,0,0,0,...,2,13,0,1,13,-1,1,13,6,1
-9223189665817919541,0.802447,0.038025,0.694073,0.828761,0.694073,0.0,0.0,0,0,0,...,1,13,0,1,13,-1,1,13,6,1
-9223188534444851899,0.791203,0.002688,0.786647,0.794826,0.787945,0.0,0.0,0,0,0,...,1,13,3,2,13,-1,1,13,5,1
-9223173911659837606,0.115666,0.078554,0.038207,0.252421,0.040486,4.384615,6.144625,0,17,13,...,1,13,0,2,13,-1,1,13,6,2


Targets are added to the dataset.

In [10]:
# ADD TARGETS
labels_gdf = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
# same customer_ID encoding as in read_file(): last 16 hex characters -> int64
labels_gdf['customer_ID'] = (
    labels_gdf['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
)
labels_gdf = labels_gdf.set_index('customer_ID')
train = train.merge(labels_gdf, left_index=True, right_index=True, how='left')
train['target'] = train['target'].astype('int8')
del labels_gdf

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
train = train.sort_index().reset_index()

# FEATURES
# after reset_index the first column is customer_ID and the last one is target
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')

There are 918 features!


In [13]:
# Persist the feature list; os.path.join avoids the doubled slash that plain
# concatenation produces when ODIR already ends with '/'
with open(os.path.join(ODIR, 'all_features.pkl'), 'wb') as ofile :
    pickle.dump(FEATURES, ofile)

The feature list is needed for the prediction on the test set, 
so we save it.

## XGBoost

XGBoost seems to be one of the most efficient models.

In [11]:
# Convert the cudf frame to pandas, then collect the released objects
train = train.to_pandas() # free GPU memory
gc.collect()

In [None]:
print('XGB Version',xgb.__version__)

# XGB MODEL PARAMETERS
xgb_parms = dict(
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,                 # row sampling
    colsample_bytree=0.6,          # column sampling per tree
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',        # GPU training
    predictor='gpu_predictor',     # GPU prediction
    random_state=SEED,
)

Because of memory limitations, the dataset is split into folds.
A model is trained for each fold.
For each fold, the Amex metric is calculated on the validation set and on the full dataset (training and validation rows together).
At the end, the global out-of-fold metric is calculated.

In [None]:
# K-fold training of XGBoost on all features (FEATURES).
# Per fold: train with early stopping, save the model, score the validation
# fold and the full dataset, and store everything in result_all.
# After the loop: overall out-of-fold metric, stored in result_sum.
oof = []  # one frame of out-of-fold predictions per fold
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.target )):
    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)
    dtrain = xgb.DMatrix(data=train.loc[train_idx, FEATURES], label=train.loc[train_idx, 'target'])
    dvalid = xgb.DMatrix(data=train.loc[valid_idx, FEATURES], label=train.loc[valid_idx, 'target'])
    model = xgb.train(xgb_parms, 
                      dtrain=dtrain,
                      evals=[(dtrain,'train'),(dvalid,'valid')],
                      num_boost_round=9999,
                      early_stopping_rounds=100,
                      verbose_eval=100) 
    model.save_model(f'{ODIR}/XGB_all_features_v{VER}_fold{fold}.xgb')
    # NOTE(review): depending on the xgboost version, predict() may use every
    # boosted round rather than only those up to best_iteration — verify.
    valid_pred = model.predict(dvalid)
    val_acc = amex_metric_mod(train.loc[valid_idx, 'target'].values, valid_pred)
    print('Kaggle Metric on valid set =',val_acc,'\n')
    
    df = train.loc[valid_idx, ['customer_ID','target'] ].copy()
    df['oof_pred'] = valid_pred
    oof.append( df )

    # free the fold matrices before building the full-dataset matrix
    del dtrain, dvalid, df
    _ = gc.collect()
    
    # score on every row (includes the rows this fold's model was trained on)
    dall = xgb.DMatrix(data=train[FEATURES], label=train['target'])
    pred = model.predict(dall)
    all_acc = amex_metric_mod(train['target'].values, pred)
    print('Kaggle Metric on all dataset =',all_acc,'\n')
    # DataFrame.append is deprecated (removed in pandas 2.0): add the row with concat
    fold_row = pd.DataFrame([{'model' : "XGBoost", 
                              'preprocessing' : "huseyincot_all_feat", 
                              'name' : f'XGB_all_features_v{VER}_fold{fold}', 
                              'y_valid_pred' : valid_pred, 
                              'valid_acc' : val_acc,
                              'y_pred' : pred,
                              'acc' : all_acc
                             }])
    result_all = pd.concat([result_all, fold_row], ignore_index=True)
    del dall, pred, valid_pred, fold_row
    _ = gc.collect()
print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
summary_row = pd.DataFrame([{'model' : "XGBoost", 
                             'preprocessing':"huseyincot_all_feat", 
                             'name' : "XGBoost_huseyincot_all_feat", 
                             'y_pred' : oof, 
                             'acc': acc
                            }])
result_sum = pd.concat([result_sum, summary_row], ignore_index=True)
print('OVERALL CV Kaggle Metric =',acc)

del oof, acc, summary_row
_ = gc.collect()

## CatBoost

CatBoost is based on the same method as XGBoost but could be more efficient.
We apply the same code as before, but train a CatBoost model.

In [None]:
# GET CATEG VARIABLES
cat_features = ["B_30", "B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
# An aggregated column '<X>_<n>_<agg>' is categorical when its source
# variable '<X>_<n>' (first two '_'-separated parts) is a categorical feature
categ = [
    col
    for col in train.columns
    if col not in ['customer_ID', 'target']
    and '_'.join(col.split('_')[:2]) in cat_features
]

In [None]:
# K-fold training of CatBoost on all features (FEATURES), with the columns in
# `categ` declared as categorical.
# Per fold: train with early stopping, save the model, score the validation
# fold and the full dataset, and store everything in result_all.
# After the loop: overall out-of-fold metric, stored in result_sum.
oof = []  # one frame of out-of-fold predictions per fold
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.target )):
    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)
    train_pool = Pool(train.loc[train_idx, FEATURES], 
                      train.loc[train_idx, 'target'],
                      cat_features=categ
                     )
    valid_pool = Pool(train.loc[valid_idx, FEATURES], 
                      train.loc[valid_idx, 'target'],
                      cat_features=categ
                     )
    model = CatBoostClassifier(iterations=9999, 
                               random_state=SEED, 
                               task_type="GPU",
                               loss_function = 'Logloss',
                               )
    model.fit(train_pool, eval_set=valid_pool,
              early_stopping_rounds=100,
              verbose=100)
    model.save_model(f'{ODIR}/CTB_all_features_v{VER}_fold{fold}.ctb')
    # probability of the positive class
    valid_pred = model.predict_proba(valid_pool)[:,1]
    val_acc = amex_metric_mod(train.loc[valid_idx, 'target'].values, valid_pred)
    print('Kaggle Metric on valid set =',val_acc,'\n')
    
    df = train.loc[valid_idx, ['customer_ID','target'] ].copy()
    df['oof_pred'] = valid_pred
    oof.append( df )

    # free the fold pools before building the full-dataset pool
    del train_pool, valid_pool, df
    _ = gc.collect()
    
    # score on every row (includes the rows this fold's model was trained on)
    all_pool = Pool(train[FEATURES], 
                    train['target'],
                    cat_features=categ
                     )
    pred = model.predict_proba(all_pool)[:,1]
    all_acc = amex_metric_mod(train['target'].values, pred)
    print('Kaggle Metric on all dataset =',all_acc,'\n')
    # DataFrame.append is deprecated (removed in pandas 2.0): add the row with concat
    fold_row = pd.DataFrame([{'model' : "CateBoost", 
                              'preprocessing' : "huseyincot_all_feat", 
                              'name' : f'CTB_all_features_v{VER}_fold{fold}', 
                              'y_valid_pred' : valid_pred, 
                              'valid_acc' : val_acc,
                              'y_pred' : pred,
                              'acc' : all_acc
                             }])
    result_all = pd.concat([result_all, fold_row], ignore_index=True)
    del all_pool, pred, valid_pred, fold_row
    _ = gc.collect()
    
print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
summary_row = pd.DataFrame([{'model' : "CateBoost", 
                             'preprocessing':"huseyincot_all_feat", 
                             'name' : "CTB_huseyincot_all_feat", 
                             'y_pred' : oof, 
                             'acc': acc
                            }])
result_sum = pd.concat([result_sum, summary_row], ignore_index=True)
print('OVERALL CV Kaggle Metric =',acc)

del oof, acc, summary_row
_ = gc.collect()

In [None]:
# Drop the training frame before the next experiment reloads the data
del train 
_=gc.collect()

# Removing Columns with a majority of NaN

The EDA shows that some features contain a huge number of NaN values.
These features are removed.

## Load Dataset

In [None]:
# Reload the raw (non-aggregated) training table
train = read_file(path = TRAIN_PATH)

## Select features

Features are deleted if more than 20% of values are NaN. 

In [None]:
# Percentage of missing values per column, highest first
counter = train.isnull().sum(axis=0).sort_values(ascending=False)/len(train)*100
# Columns with more than 20% of missing values
rm_nan = counter[counter>20].index
# to_pandas().tolist() gives a plain Python list of column names without
# relying on Index.to_array(), which newer cudf releases no longer provide
rm_nan = rm_nan.to_pandas().tolist()
# NOTE(review): only real nulls are counted; integer-encoded columns may use a
# sentinel value instead (e.g. D_66_last shows -1 in the preview) — verify.
print(f"{len(rm_nan)}/{len(train.columns)}")

In [None]:
# Raw columns kept after removing those listed in rm_nan (original order preserved)
FEATURES_2 = [col for col in train.columns if col not in rm_nan]

In [None]:
# Restrict the raw frame to the kept columns (rebinds `train`)
train = train[FEATURES_2]

## Processing

The same processing is applied

In [None]:
# Aggregate the reduced frame to one row per customer_ID (rebinds `train`)
train = process_and_feature_engineer(train)

In [None]:
# ADD TARGETS
labels_gdf = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
# same customer_ID encoding as in read_file(): last 16 hex characters -> int64
labels_gdf['customer_ID'] = (
    labels_gdf['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
)
labels_gdf = labels_gdf.set_index('customer_ID')
train = train.merge(labels_gdf, left_index=True, right_index=True, how='left')
train['target'] = train['target'].astype('int8')
del labels_gdf

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
train = train.sort_index().reset_index()


In [None]:
# FEATURES_2 is rebound here: it now holds the aggregated feature names
# (every column between the leading customer_ID and the trailing target)
FEATURES_2 = train.columns[1:-1]

In [None]:
# Persist the reduced feature list; os.path.join avoids the doubled slash that
# plain concatenation produces when ODIR already ends with '/'
with open(os.path.join(ODIR, 'all_features_2.pkl'), 'wb') as ofile :
    pickle.dump(FEATURES_2, ofile)

## XGBoost

In [None]:
# Convert the cudf frame to pandas, then collect the released objects
train = train.to_pandas() # free GPU memory
TRAIN_SUBSAMPLE = 1.0  # same value as in the configuration cell
gc.collect()

In [None]:
print('XGB Version',xgb.__version__)

# XGB MODEL PARAMETERS
xgb_parms = dict(
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,                 # row sampling
    colsample_bytree=0.6,          # column sampling per tree
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',        # GPU training
    predictor='gpu_predictor',     # GPU prediction
    random_state=SEED,
)

In [None]:
# K-fold training of XGBoost on the reduced feature list FEATURES_2.
# Per fold: train with early stopping, save the model, print the metric on the
# validation fold and on the full dataset. After the loop: overall
# out-of-fold metric. Results are only printed here, not stored in
# result_all / result_sum.
oof = []  # one frame of out-of-fold predictions per fold
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.target )):
    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)
    dtrain = xgb.DMatrix(data=train.loc[train_idx, FEATURES_2], label=train.loc[train_idx, 'target'])
    dvalid = xgb.DMatrix(data=train.loc[valid_idx, FEATURES_2], label=train.loc[valid_idx, 'target'])
    model = xgb.train(xgb_parms, 
                      dtrain=dtrain,
                      evals=[(dtrain,'train'),(dvalid,'valid')],
                      num_boost_round=9999,
                      #num_boost_round=99,
                      early_stopping_rounds=100,
                      verbose_eval=100) 
    model.save_model(f'{ODIR}/XGB_nonan_features_v{VER}_fold{fold}.xgb')
    valid_pred = model.predict(dvalid)
    val_acc = amex_metric_mod(train.loc[valid_idx, 'target'].values, valid_pred)
    print('Kaggle Metric on valid set =',val_acc,'\n')
    
    df = train.loc[valid_idx, ['customer_ID','target'] ].copy()
    df['oof_pred'] = valid_pred
    oof.append( df )

    # free the fold matrices before building the full-dataset matrix
    del dtrain, dvalid, df
    _ = gc.collect()
    
    # score on every row (includes the rows this fold's model was trained on)
    dall = xgb.DMatrix(data=train[FEATURES_2], label=train['target'])
    pred = model.predict(dall)
    all_acc = amex_metric_mod(train['target'].values, pred)
    print('Kaggle Metric on all dataset =',all_acc,'\n')
    del dall, pred, valid_pred
    _ = gc.collect()
print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
print('OVERALL CV Kaggle Metric =',acc)

del oof, acc 
_ = gc.collect()

## CatBoost

In [None]:
# GET CATEG VARIABLES
cat_features = ["B_30", "B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
# An aggregated column '<X>_<n>_<agg>' is categorical when its source
# variable '<X>_<n>' (first two '_'-separated parts) is a categorical feature
categ = [
    col
    for col in train.columns
    if col not in ['customer_ID', 'target']
    and '_'.join(col.split('_')[:2]) in cat_features
]

In [None]:
# Release objects left over from an earlier CatBoost cell, if any.
# Each name is removed on its own, so a missing one neither hides a real
# error (as a bare `except` would) nor prevents the others from being freed,
# and the garbage collection always runs.
for _leftover in ('all_pool', 'pred', 'valid_pred'):
    globals().pop(_leftover, None)
_ = gc.collect()

In [None]:
# K-fold training of CatBoost on the reduced feature list FEATURES_2, with the
# columns in `categ` declared as categorical.
# Per fold: train with early stopping, save the model, print the metric on the
# validation fold and on the full dataset. After the loop: overall
# out-of-fold metric. Results are only printed here, not stored in
# result_all / result_sum.
oof = []  # one frame of out-of-fold predictions per fold
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.target )):
    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)
    train_pool = Pool(train.loc[train_idx, FEATURES_2], 
                      train.loc[train_idx, 'target'],
                      cat_features=categ
                     )
    valid_pool = Pool(train.loc[valid_idx, FEATURES_2], 
                      train.loc[valid_idx, 'target'],
                      cat_features=categ
                     )
    model = CatBoostClassifier(iterations=9999, 
                               random_state=SEED, 
                               task_type="GPU",
                               loss_function = 'Logloss',
                               #learning_rate=0.05
                               )
    model.fit(train_pool, eval_set=valid_pool,
              #od_type="Iter",
              early_stopping_rounds=100,
              #od_wait=100,
              verbose=100)
    model.save_model(f'{ODIR}/CTB_nonan_features_v{VER}_fold{fold}.ctb')
    # probability of the positive class
    valid_pred = model.predict_proba(valid_pool)[:,1]
    val_acc = amex_metric_mod(train.loc[valid_idx, 'target'].values, valid_pred)
    print('Kaggle Metric on valid set =',val_acc,'\n')
    
    df = train.loc[valid_idx, ['customer_ID','target'] ].copy()
    df['oof_pred'] = valid_pred
    oof.append( df )

    # free the fold pools before building the full-dataset pool
    del train_pool, valid_pool, df
    _ = gc.collect()
    
    # score on every row (includes the rows this fold's model was trained on)
    all_pool = Pool(train[FEATURES_2], 
                    train['target'],
                    cat_features=categ
                     )
    pred = model.predict_proba(all_pool)[:,1]
    all_acc = amex_metric_mod(train['target'].values, pred)
    print('Kaggle Metric on all dataset =',all_acc,'\n')
    del all_pool, pred, valid_pred
    _ = gc.collect()
    
print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
print('OVERALL CV Kaggle Metric =',acc)

del oof, acc 
_ = gc.collect()

In [None]:
# Drop the training frame before the next experiment reloads the data
del train 
_=gc.collect()

# Features importances

The most important features are selected using the drop-column importance method.
The goal is to use only the most important features for the prediction.

## Load data

In [None]:
# Reload the raw (non-aggregated) training table
train = read_file(path = TRAIN_PATH)

In [None]:
# Aggregate to one row per customer_ID (rebinds `train`)
train = process_and_feature_engineer(train)

In [None]:
# ADD TARGETS
labels_gdf = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
# same customer_ID encoding as in read_file(): last 16 hex characters -> int64
labels_gdf['customer_ID'] = (
    labels_gdf['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
)
labels_gdf = labels_gdf.set_index('customer_ID')
train = train.merge(labels_gdf, left_index=True, right_index=True, how='left')
train['target'] = train['target'].astype('int8')
del labels_gdf

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
train = train.sort_index().reset_index()

Because of the large size of the train set, and in order to speed up the selection,
we keep only 1/400 of the rows.

In [None]:
# Keep the first 1/400 of the rows (rows are sorted by customer_ID at this
# point), then move the small sample to pandas
n_sample_rows = int(len(train)/400)
train = train.loc[range(n_sample_rows)].to_pandas()
print(train.shape)

In [None]:
# Candidate features: every column between the leading customer_ID and the trailing target
FEATURES_tmp = train.columns[1:-1]

## Select features

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from copy import deepcopy
def dropcol_importances(rf, X_train, y_train, random_state=999):
    """
    Drop-column feature importances based on the out-of-bag (OOB) score.

    A copy of `rf` is fitted on all columns to get a baseline OOB score, then
    one more copy is fitted for every column with that column removed. The
    importance of a column is `baseline - score_without_column`, so a positive
    value means the score got worse when the column was dropped.

    rf           : estimator template exposing `fit(X, y)` and, once fitted,
                   `oob_score_`. It is only deep-copied, never modified.
    X_train      : pandas DataFrame of features
    y_train      : target values aligned with X_train
    random_state : seed assigned to every fitted copy, so that score
                   differences come from the dropped column and not from the
                   random generator (default 999, the value used previously)

    Returns a DataFrame indexed by 'Feature' with a single 'Importance'
    column, sorted by ascending importance.
    """
    def _oob_score(features):
        # Fit a fresh, identically seeded copy and report its OOB score
        model = deepcopy(rf)
        model.random_state = random_state
        model.fit(features, y_train)
        return model.oob_score_

    baseline = _oob_score(X_train)
    n_cols = len(X_train.columns)
    imp = []
    for i, col in enumerate(X_train.columns):
        # progress counter, rewritten in place on a single output line
        print(f"{i}/{n_cols}", end="\r")
        imp.append(baseline - _oob_score(X_train.drop(col, axis=1)))

    importances = pd.DataFrame(
            data={'Feature':X_train.columns,
                  'Importance':np.array(imp)})
    return importances.set_index('Feature').sort_values('Importance', ascending=True)

In [None]:
# Template estimator for dropcol_importances(). That function fits its own
# deep copies of `rf`, so no fit is needed here: fitting first would only add
# one unused training run and make every copy carry already-trained trees.
rf = RandomForestClassifier(
         n_estimators=100,
         # better generality with 5
         min_samples_leaf=5, 
         n_jobs=-1,
         # required: dropcol_importances() reads oob_score_ after each fit
         oob_score=True)

In [None]:
# One baseline fit plus one refit per feature column
dc_imp = dropcol_importances(rf,train[FEATURES_tmp] , train['target'])

In [None]:
# Importance table, sorted by ascending importance
dc_imp

The effect of each feature on the accuracy is plotted here.

In [None]:
# Horizontal bar chart of the drop-column importances (one bar per feature)
dc_imp.plot.barh()

Only features with an importance > 0 are kept.

In [None]:
# Features whose removal lowered the OOB score (importance > 0)
dc_imp[dc_imp["Importance"]>0]

In [None]:
# Names of the features with a strictly positive drop-column importance
FEATURES_3 = dc_imp[dc_imp["Importance"]>0].index.to_list()
# os.path.join avoids the doubled slash that plain concatenation produces
# when ODIR already ends with '/'
with open(os.path.join(ODIR, 'all_features_3.pkl'), 'wb') as ofile :
    pickle.dump(FEATURES_3, ofile)

In [None]:
# If a previously saved selection is available as an input dataset, it
# replaces the FEATURES_3 list computed above; otherwise FEATURES_3 is left
# untouched.
MODEL_PATH = "../input/amex-output-echesneau"
_features_3_file = os.path.join(MODEL_PATH, "all_features_3.pkl")
if os.path.isfile(_features_3_file) :
    # pickle.load executes arbitrary code: only load files you produced yourself
    with open(_features_3_file, 'rb') as f :
        FEATURES_3 = pickle.load(f)

## prepare data for modelization

The same processing is then applied.

In [None]:
# Reload the full training table (the previous `train` was a 1/400 sample)
# and aggregate it to one row per customer_ID
train = read_file(path = TRAIN_PATH)
train = process_and_feature_engineer(train)

In [None]:
# ADD TARGETS
labels_gdf = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
# same customer_ID encoding as in read_file(): last 16 hex characters -> int64
labels_gdf['customer_ID'] = (
    labels_gdf['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
)
labels_gdf = labels_gdf.set_index('customer_ID')
train = train.merge(labels_gdf, left_index=True, right_index=True, how='left')
train['target'] = train['target'].astype('int8')
del labels_gdf

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
train = train.sort_index().reset_index()


## XGBoost

In [None]:
# Convert the cudf frame to pandas, then collect the released objects
train = train.to_pandas() # free GPU memory
TRAIN_SUBSAMPLE = 1.0  # same value as in the configuration cell
gc.collect()

In [None]:
print('XGB Version',xgb.__version__)

# XGB MODEL PARAMETERS
xgb_parms = dict(
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,                 # row sampling
    colsample_bytree=0.6,          # column sampling per tree
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',        # GPU training
    predictor='gpu_predictor',     # GPU prediction
    random_state=SEED,
)

In [None]:
# K-fold training of XGBoost on the selected feature list FEATURES_3.
# Per fold: train with early stopping, save the model, print the metric on the
# validation fold and on the full dataset. After the loop: overall
# out-of-fold metric. Results are only printed here, not stored in
# result_all / result_sum.
oof = []  # one frame of out-of-fold predictions per fold
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.target )):
    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)
    dtrain = xgb.DMatrix(data=train.loc[train_idx, FEATURES_3], label=train.loc[train_idx, 'target'])
    dvalid = xgb.DMatrix(data=train.loc[valid_idx, FEATURES_3], label=train.loc[valid_idx, 'target'])
    model = xgb.train(xgb_parms, 
                      dtrain=dtrain,
                      evals=[(dtrain,'train'),(dvalid,'valid')],
                      num_boost_round=9999,
                      #num_boost_round=99,
                      early_stopping_rounds=100,
                      verbose_eval=100) 
    model.save_model(f'{ODIR}/XGB_dc0_features_v{VER}_fold{fold}.xgb')
    valid_pred = model.predict(dvalid)
    val_acc = amex_metric_mod(train.loc[valid_idx, 'target'].values, valid_pred)
    print('Kaggle Metric on valid set =',val_acc,'\n')
    
    df = train.loc[valid_idx, ['customer_ID','target'] ].copy()
    df['oof_pred'] = valid_pred
    oof.append( df )

    # free the fold matrices before building the full-dataset matrix
    del dtrain, dvalid, df
    _ = gc.collect()
    
    # score on every row (includes the rows this fold's model was trained on)
    dall = xgb.DMatrix(data=train[FEATURES_3], label=train['target'])
    pred = model.predict(dall)
    all_acc = amex_metric_mod(train['target'].values, pred)
    print('Kaggle Metric on all dataset =',all_acc,'\n')
    del dall, pred, valid_pred
    _ = gc.collect()
print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
print('OVERALL CV Kaggle Metric =',acc)

del oof, acc 
_ = gc.collect()

## CatBoost

In [None]:
# GET CATEG VARIABLES
cat_features = ["B_30", "B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
# Among the selected features, an aggregated column '<X>_<n>_<agg>' is
# categorical when its source variable '<X>_<n>' is a categorical feature
categ = [
    col
    for col in FEATURES_3
    if col not in ['customer_ID', 'target']
    and '_'.join(col.split('_')[:2]) in cat_features
]

In [None]:
# K-fold training of CatBoost on the selected feature list FEATURES_3, with
# the columns in `categ` declared as categorical.
# Per fold: train with early stopping, save the model, print the metric on the
# validation fold and on the full dataset. After the loop: overall
# out-of-fold metric. Results are only printed here, not stored in
# result_all / result_sum.
oof = []  # one frame of out-of-fold predictions per fold
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.target )):
    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)
    train_pool = Pool(train.loc[train_idx, FEATURES_3], 
                      train.loc[train_idx, 'target'],
                      cat_features=categ
                     )
    valid_pool = Pool(train.loc[valid_idx, FEATURES_3], 
                      train.loc[valid_idx, 'target'],
                      cat_features=categ
                     )
    model = CatBoostClassifier(iterations=9999, 
                               random_state=SEED, 
                               task_type="GPU",
                               loss_function = 'Logloss',
                               #learning_rate=0.05
                               )
    model.fit(train_pool, eval_set=valid_pool,
              #od_type="Iter",
              early_stopping_rounds=100,
              #od_wait=100,
              verbose=100)
    model.save_model(f'{ODIR}/CTB_dc0_features_v{VER}_fold{fold}.ctb')
    # probability of the positive class
    valid_pred = model.predict_proba(valid_pool)[:,1]
    val_acc = amex_metric_mod(train.loc[valid_idx, 'target'].values, valid_pred)
    print('Kaggle Metric on valid set =',val_acc,'\n')
    
    df = train.loc[valid_idx, ['customer_ID','target'] ].copy()
    df['oof_pred'] = valid_pred
    oof.append( df )

    # free the fold pools before building the full-dataset pool
    del train_pool, valid_pool, df
    _ = gc.collect()
    
    # score on every row (includes the rows this fold's model was trained on)
    all_pool = Pool(train[FEATURES_3], 
                    train['target'],
                    cat_features=categ
                     )
    pred = model.predict_proba(all_pool)[:,1]
    all_acc = amex_metric_mod(train['target'].values, pred)
    print('Kaggle Metric on all dataset =',all_acc,'\n')
    del all_pool, pred, valid_pred
    _ = gc.collect()
    
print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
acc = amex_metric_mod(oof.target.values, oof.oof_pred.values)
print('OVERALL CV Kaggle Metric =',acc)

del oof, acc 
_ = gc.collect()

In [None]:
# Drop the training frame now that all models are trained and saved
del train 
_=gc.collect()

All models should be downloaded so that they can be used in another notebook to create the submission file.