<a href="https://www.kaggle.com/code/erwanchesneau/notebook-amex-submission-model-averaging?scriptVersionId=104555281" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Python libraries

In [2]:
# LOAD LIBRARIES
import pandas as pd
import numpy as np # CPU libraries
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt
import gc, os
from sklearn.model_selection import KFold
import xgboost as xgb
import pickle
from catboost import Pool, CatBoostClassifier
print('RAPIDS version',cudf.__version__)

RAPIDS version 21.10.01


# variables

In [3]:
# ---------------------------------------------------------------
# Notebook configuration: every tunable value lives in this cell.
# ---------------------------------------------------------------

VER = 4             # version name for saved files
SEED = 42           # train random seed
NAN_VALUE = -127    # value used to fill NaN; fits in int8
FOLDS = 5           # folds per model

# Input locations: train / test data and the directory of saved models
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
MODEL_PATH = "../input/amex-outputs-echesneau-lr005"

# Columns treated as categorical features
cat_features = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

In [4]:
# Number of entries in MODEL_PATH. Every entry is counted, not only model files.
len(os.listdir(MODEL_PATH))

33

MODEL_PATH contains 33 entries: the saved models and, presumably, the pickled feature lists that are loaded below.
The final prediction is the average of the predictions of all models.

# functions

In [5]:
def read_file(path = '', usecols = None):
    """
    Load a parquet dataset into a cuDF DataFrame.

    Modified from the original loader: the rows are neither sorted nor
    NaN-filled here; filling is done later, during the processing step.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list of str, optional
        Columns to read. Every column is read when None. The selection must
        contain 'customer_ID' and 'S_2', which are converted below.

    Returns
    -------
    cudf.DataFrame
        The data with 'customer_ID' stored as int64 and 'S_2' as datetime.
    """
    # Only pass a column selection to the reader when one was requested
    reader_options = {} if usecols is None else {'columns': usecols}
    df = cudf.read_parquet(path, **reader_options)

    # Reduce the customer id: keep its last 16 hex characters, stored as int64
    id_tail = df['customer_ID'].str[-16:]
    df['customer_ID'] = id_tail.str.hex_to_int().astype('int64')

    # Convert S_2 to datetime
    df.S_2 = cudf.to_datetime( df.S_2 )

    print('shape of data:', df.shape)
    return df

In [6]:
def process_and_feature_engineer(df):
    """
    Aggregate the rows of each customer into a single row of features.

    Feature engineering from
    https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    Numerical columns are summarised with mean / std / min / max / last and
    categorical columns with count / last / nunique. Each output column is
    named '<column>_<aggregation>'. Remaining NaN are replaced by NAN_VALUE.

    Parameters
    ----------
    df : cudf.DataFrame
        Rows containing 'customer_ID', 'S_2' and the feature columns.
        NOTE(review): 'last' presumably relies on the rows of each customer
        already being in date order, since no sort is applied here - confirm.

    Returns
    -------
    cudf.DataFrame
        Aggregated features grouped by 'customer_ID'.
    """
    categorical = ["B_30", "B_38", "D_114", "D_116", "D_117",
                   "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"]
    # Everything that is neither a key column nor categorical is numerical
    not_numerical = ['customer_ID', 'S_2'] + categorical
    numerical = [name for name in df.columns if name not in not_numerical]

    numerical_stats = ['mean', 'std', 'min', 'max', 'last']
    num_agg = df.groupby("customer_ID")[numerical].agg(numerical_stats)
    # Flatten the (column, aggregation) pairs into 'column_aggregation'
    num_agg.columns = [f"{column}_{stat}" for column, stat in num_agg.columns]

    categorical_stats = ['count', 'last', 'nunique']
    cat_agg = df.groupby("customer_ID")[categorical].agg(categorical_stats)
    cat_agg.columns = [f"{column}_{stat}" for column, stat in cat_agg.columns]

    features = cudf.concat([num_agg, cat_agg], axis=1)
    # Drop the intermediate frames before the fill
    del num_agg, cat_agg

    return features.fillna(NAN_VALUE)

In [7]:
def amex_metric_mod(y_true, y_pred):
    """
    Compute the metric of the competition.

    From https://www.kaggle.com/kyakovlev
    and https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534

    The score is the mean of two terms:
    - the share of positives found among the highest predictions, up to 4%
      of the total weight (rows with target 0 weigh 20, the others 1);
    - the weighted Gini of the predictions divided by the weighted Gini
      obtained when ranking by the true target.

    Parameters
    ----------
    y_true : array-like
        Targets (0 / 1).
    y_pred : array-like
        Predicted scores, same length as `y_true`.

    Returns
    -------
    float
        0.5 * (normalised Gini + top-4% capture rate).
    """
    # Column 0 holds the target, column 1 the prediction
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _ranked(column):
        """Rows sorted by `column` in decreasing order, with their weights."""
        ordered = pairs[pairs[:, column].argsort()[::-1]]
        return ordered, np.where(ordered[:, 0] == 0, 20, 1)

    def _weighted_gini(column):
        """Weighted Gini obtained when ranking the rows by `column`."""
        ordered, weight = _ranked(column)
        random_curve = np.cumsum(weight / np.sum(weight))
        weighted_positives = ordered[:, 0] * weight
        lorentz = np.cumsum(weighted_positives) / np.sum(weighted_positives)
        return np.sum((lorentz - random_curve) * weight)

    # Capture rate within the top 4% of cumulated weight
    by_prediction, weights = _ranked(1)
    in_top = np.cumsum(weights) <= int(0.04 * np.sum(weights))
    top_four = np.sum(by_prediction[in_top][:, 0]) / np.sum(by_prediction[:, 0])

    # Gini of the model ranking, normalised by the Gini of the ideal ranking
    gini_model = _weighted_gini(1)
    gini_ideal = _weighted_gini(0)

    return 0.5 * (gini_model / gini_ideal + top_four)

In [8]:
def get_rows(customers, test, NUM_PARTS = 4, verbose = ''):
    """
    Compute the number of data rows in each separate part.

    The customers are split into NUM_PARTS consecutive groups of equal size;
    the last group also takes the remainder of the integer division.

    Parameters
    ----------
    customers : sequence
        Customer ids, sliced by position to form the groups.
    test : DataFrame
        Frame with a 'customer_ID' column whose rows are counted.
    NUM_PARTS : int
        Number of parts.
    verbose : str
        Name of the dataset used in the printed messages; nothing is
        printed when it is the empty string.

    Returns
    -------
    (list of int, int)
        Number of rows of each part, and the number of customers per part.
    """
    chunk = len(customers) // NUM_PARTS
    show_messages = verbose != ''
    if show_messages:
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')

    # Slice bounds of every part; an open end lets the last part take the rest
    starts = [part * chunk for part in range(NUM_PARTS)]
    stops = starts[1:] + [None]

    rows = [
        test.loc[test.customer_ID.isin(customers[start:stop])].shape[0]
        for start, stop in zip(starts, stops)
    ]

    if show_messages:
        print( rows )
    return rows, chunk

# load data

Not all models use the same features.
The feature lists are loaded from MODEL_PATH.

In [9]:
def _load_feature_list(file_name):
    """
    Load a pickled feature list stored in MODEL_PATH.

    Parameters
    ----------
    file_name : str
        Name of the pickle file inside MODEL_PATH.

    Returns
    -------
    object or None
        The unpickled feature list, or None (with a printed message) when
        the file does not exist.
    """
    pkl_path = MODEL_PATH + "/" + file_name
    if not os.path.isfile(pkl_path):
        # Report the missing file now instead of failing later on an undefined name
        print(f"Feature list not found : {pkl_path}")
        return None
    # NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
    with open(pkl_path, 'rb') as f:
        return pickle.load(f)


# One feature list per preprocessing variant of the saved models:
# FEATURES -> "all_features", FEATURES_2 -> "nonan_features", FEATURES_3 -> "dc0_features"
FEATURES = _load_feature_list("all_features.pkl")
FEATURES_2 = _load_feature_list("all_features_2.pkl")
FEATURES_3 = _load_feature_list("all_features_3.pkl")

# Test on train data

The competition metric is first computed on the train set, before predicting on the test set.

In [10]:
# COMPUTE SIZE OF EACH PART FOR TRAIN DATA
NUM_PARTS = 3
print('Reading train data...')
# Only the key columns are needed to count the rows of each part
train = read_file(path = TRAIN_PATH, usecols = ['customer_ID','S_2'])
customers = train[['customer_ID']].drop_duplicates().sort_index().values.flatten()
# verbose names the dataset in the printed messages: this is the train set
rows,num_cust = get_rows(customers, train[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'train')

Reading train data...
shape of data: (5531451, 2)
We will process test data as 3 separate parts.
There will be 152971 customers in each part (except the last part).
Below are number of rows in each part:
[1845301, 1842259, 1843891]


In [11]:
# INFER TRAIN DATA IN PARTS
# For each part: read the data, build the aggregated features once, then
# predict with every model file found in MODEL_PATH. Each prediction is
# stored in result_preds as a column named "<model file>_parts-<k>".
skip_rows = 0
for k in range(NUM_PARTS):
    # READ PART OF TRAIN DATA (the whole file is read, then sliced to part k)
    print('\nReading train data...')
    train = read_file(path = TRAIN_PATH)
    if k == 0 :
        # One row per customer: every prediction column is merged on this index
        c = train[['customer_ID']].drop_duplicates().sort_index()['customer_ID'].to_pandas()
        result_preds = pd.DataFrame(index=c.to_list())
        del c
    train = train.iloc[skip_rows:skip_rows+rows[k]]
    print(f'=> Train part {k+1} has shape', train.shape )
    print(f"From line {skip_rows} to {skip_rows+rows[k]}")
    skip_rows += rows[k]

    # The aggregation does not depend on the model: compute it once per part
    part_features = process_and_feature_engineer(train).to_pandas()
    del train
    _ = gc.collect()

    for file in os.listdir(MODEL_PATH) :
        # Skip directories and the pickled feature lists
        if not os.path.isfile(MODEL_PATH+"/"+file) or file.endswith('.pkl'):
            continue
        print(f"Process model : {file}")

        # The 2nd and 3rd '_'-separated tokens of the file name give the feature set
        preprocessing = "_".join(file.split('_')[1:3])
        if preprocessing == "all_features" :
            proc = part_features[FEATURES]
        elif preprocessing == "nonan_features" :
            proc = part_features[FEATURES_2]
        elif preprocessing == "dc0_features" :
            proc = part_features[FEATURES_3]
        else :
            # No feature list for this file: skip it rather than reuse stale data
            print(f"Unknown preprocessing : {preprocessing}")
            continue

        # The file name prefix gives the library that saved the model
        if file.startswith("XGB") :
            model = xgb.Booster()
            model.load_model(f'{MODEL_PATH}/{file}')
            preds = model.predict(xgb.DMatrix(data=proc))
        elif file.startswith("CTB") :
            model = CatBoostClassifier()
            model.load_model(f'{MODEL_PATH}/{file}')
            preds = model.predict_proba(proc)[:,1]
        else :
            print(f"Unknown model {file}")
            del proc
            continue

        # Index the predictions like the features so the merge aligns on it
        tmp = pd.DataFrame({f"{file}_parts-{k}": preds}, index=proc.index)
        result_preds = result_preds.merge(tmp, left_index=True, right_index=True, how='left')

        del proc, model, preds, tmp
        _ = gc.collect()

    del part_features
    _ = gc.collect()


Reading train data...
shape of data: (5531451, 190)
=> Train part 1 has shape (1845301, 190)
From line 0 to 1845301
Process model : XGB_nonan_features_v2_fold2.xgb
Process model : XGB_dc0_features_v2_fold1.xgb
Process model : XGB_dc0_features_v2_fold0.xgb
Process model : CTB_all_features_v2_fold4.ctb
Process model : CTB_all_features_v2_fold3.ctb
Process model : CTB_all_features_v2_fold2.ctb
Process model : XGB_all_features_v2_fold2.xgb
Process model : XGB_nonan_features_v2_fold4.xgb
Process model : XGB_all_features_v2_fold1.xgb
Process model : CTB_nonan_features_v2_fold4.ctb
Process model : XGB_dc0_features_v2_fold2.xgb
Process model : CTB_all_features_v2_fold0.ctb
Process model : XGB_all_features_v2_fold3.xgb
Process model : XGB_all_features_v2_fold0.xgb
Process model : XGB_dc0_features_v2_fold3.xgb
Process model : XGB_dc0_features_v2_fold4.xgb
Process model : XGB_nonan_features_v2_fold3.xgb
Process model : CTB_nonan_features_v2_fold2.ctb
Process model : CTB_nonan_features_v2_fold0.c

Results are stored in a DataFrame.
The per-part prediction columns of each model are merged into a single column.
The targets are then merged in.

In [12]:
# Group the prediction columns by model: the column name without its last
# '_'-separated token (the "parts-<k>" suffix) identifies the model file.
columns_by_model = {}
for col in result_preds.columns:
    columns_by_model.setdefault('_'.join(col.split('_')[:-1]), []).append(col)

# Row-wise mean of the columns of each model (NaN are skipped by the mean).
# Models are taken in sorted order, which is the order a groupby would produce.
results = pd.DataFrame(
    {model: result_preds[cols].mean(axis=1) for model, cols in sorted(columns_by_model.items())},
    index=result_preds.index,
)

# Load the train labels, indexed by the same integer customer id as the predictions
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
targets = targets.set_index('customer_ID')

In [13]:
# Attach the target of each customer to the predictions (aligned on the index).
# An existing 'target' column is dropped first so that running this cell again
# does not produce 'target_x' / 'target_y' columns.
results = (
    results
    .drop(columns=['target'], errors='ignore')
    .merge(targets.to_pandas(), left_index=True, right_index=True, how='left')
)

The AMEX metric is calculated for all models

In [14]:
# Score every model column against the target.
# The table is built once from a list of records: growing a DataFrame row by
# row is quadratic, and DataFrame.append does not exist anymore in pandas 2.
model_columns = [column for column in results.columns if column != 'target']
amex_metrix = pd.DataFrame(
    [
        {"model": col, 'amex_metrics': amex_metric_mod(results['target'], results[col])}
        for col in model_columns
    ],
    columns=['model', 'amex_metrics'],
)
amex_metrix

Unnamed: 0,model,amex_metrics
0,CTB_all_features_v2_fold0.ctb,0.851374
1,CTB_all_features_v2_fold1.ctb,0.84194
2,CTB_all_features_v2_fold2.ctb,0.834228
3,CTB_all_features_v2_fold3.ctb,0.851709
4,CTB_all_features_v2_fold4.ctb,0.848916
5,CTB_dc0_features_v2_fold0.ctb,0.846078
6,CTB_dc0_features_v2_fold1.ctb,0.846582
7,CTB_dc0_features_v2_fold2.ctb,0.839954
8,CTB_dc0_features_v2_fold3.ctb,0.834161
9,CTB_dc0_features_v2_fold4.ctb,0.843203


No conclusion about the accuracy of the models can be drawn from these scores, because the models overfit the train set.

# Create submission CSV

Prediction on the test set and creation of the submission file.

In [15]:
# Work out how many rows each of the 5 test parts contains
NUM_PARTS = 5
print('Reading test data...')

# The key columns are enough to count the rows of each part
test = read_file(path=TEST_PATH, usecols=['customer_ID', 'S_2'])
customer_column = test[['customer_ID']]
customers = customer_column.drop_duplicates().sort_index().values.flatten()

rows, num_cust = get_rows(
    customers,
    customer_column,
    NUM_PARTS=NUM_PARTS,
    verbose='test',
)

Reading test data...
shape of data: (11363762, 2)
We will process test data as 5 separate parts.
There will be 184924 customers in each part (except the last part).
Below are number of rows in each part:
[2273670, 2271314, 2273248, 2273840, 2271690]


This part of the code could be modified to use only a selection of models.

In [None]:
# INFER TEST DATA IN PARTS
# For each part: read the data, build the aggregated features once, then
# predict with every model file found in MODEL_PATH. Each prediction is
# stored in result_preds as a column named "<model file>_parts-<k>".
skip_rows = 0
for k in range(NUM_PARTS):
    # READ PART OF TEST DATA (the whole file is read, then sliced to part k)
    print('\nReading test data...')
    test = read_file(path = TEST_PATH)
    if k == 0 :
        # One row per customer: every prediction column is merged on this index
        c = test[['customer_ID']].drop_duplicates().sort_index()['customer_ID'].to_pandas()
        result_preds = pd.DataFrame(index=c.to_list())
        del c
    test = test.iloc[skip_rows:skip_rows+rows[k]]
    print(f'=> Test part {k+1} has shape', test.shape )
    print(f"From line {skip_rows} to {skip_rows+rows[k]}")
    skip_rows += rows[k]

    # The aggregation does not depend on the model: compute it once per part
    part_features = process_and_feature_engineer(test).to_pandas()
    del test
    _ = gc.collect()

    for file in os.listdir(MODEL_PATH) :
        # Skip directories and the pickled feature lists
        if not os.path.isfile(MODEL_PATH+"/"+file) or file.endswith('.pkl'):
            continue
        print(f"Process model : {file}")

        # The 2nd and 3rd '_'-separated tokens of the file name give the feature set
        preprocessing = "_".join(file.split('_')[1:3])
        if preprocessing == "all_features" :
            proc = part_features[FEATURES]
        elif preprocessing == "nonan_features" :
            proc = part_features[FEATURES_2]
        elif preprocessing == "dc0_features" :
            proc = part_features[FEATURES_3]
        else :
            # No feature list for this file: skip it rather than reuse stale data
            print(f"Unknown preprocessing : {preprocessing}")
            continue

        # The file name prefix gives the library that saved the model
        if file.startswith("XGB") :
            model = xgb.Booster()
            model.load_model(f'{MODEL_PATH}/{file}')
            preds = model.predict(xgb.DMatrix(data=proc))
        elif file.startswith("CTB") :
            model = CatBoostClassifier()
            model.load_model(f'{MODEL_PATH}/{file}')
            preds = model.predict_proba(proc)[:,1]
        else :
            print(f"Unknown model {file}")
            del proc
            continue

        # Index the predictions like the features so the merge aligns on it
        tmp = pd.DataFrame({f"{file}_parts-{k}": preds}, index=proc.index)
        result_preds = result_preds.merge(tmp, left_index=True, right_index=True, how='left')

        del proc, model, preds, tmp
        _ = gc.collect()

    del part_features
    _ = gc.collect()

The final prediction is the mean value of each row.

In [None]:
# Average every prediction column of a customer into one value (NaN are
# skipped by the mean), then expose the index as the 'customer_ID' column.
pred = (
    result_preds
    .mean(axis=1)
    .rename("prediction")
    .rename_axis("customer_ID")
    .reset_index()
)
pred

In [None]:
# Show only the first rows: one column per (model file, part) pair
result_preds.head()

A DataFrame is created in the expected format and saved.

In [None]:
# Customer ids in the order of the sample submission
sample = cudf.read_csv('../input/amex-default-prediction/sample_submission.csv')[['customer_ID']]
# Same integer id as the one used to index the predictions
sample['customer_ID_hash'] = sample['customer_ID'].str[-16:].str.hex_to_int().astype('int64')

predictions_by_hash = pred.set_index("customer_ID")[['prediction']]

# Left merge on the integer id, then drop it: only the original
# 'customer_ID' string and the prediction remain
sub = (
    sample
    .set_index('customer_ID_hash')
    .to_pandas()
    .merge(predictions_by_hash, left_index=True, right_index=True, how='left')
    .reset_index(drop=True)
)
sub.head()

In [None]:
# Write the submission next to the notebook; VER tags the file name
submission_file = f'submission_xgb_v{VER}.csv'
sub.to_csv(submission_file, index=False)
print('Submission file shape is', sub.shape )