In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # setup env. variables for GPUs

import tensorflow as tf
import tensorflow.keras.backend as K
print(f"Using TensorFlow version {tf.__version__}")

# Cap TensorFlow's share of the first GPU so that the rest of the card
# stays available to RAPIDS (cudf / cupy), which runs in the same process.
LIMIT = 8  # GB TensorFlow is allowed to allocate
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # LIMIT is in GB and is scaled by 1024 for memory_limit
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=LIMIT * 1024)],
        )
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    except RuntimeError as err:
        # Report the problem but keep the notebook running.
        print(err)
# NOTE(review): the second message assumes a 16GB card (hard-coded 16).
print(
    f"We will restrict TensorFlow to max {LIMIT}GB GPU RAM",
    f"then RAPIDS can use {(16-LIMIT)}GB GPU RAM",
    sep="\n",
)

Using TensorFlow version 2.6.4
We will restrict TensorFlow to max 8GB GPU RAM
then RAPIDS can use 8GB GPU RAM


2022-08-09 11:09:39.272855: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-09 11:09:39.377265: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-09 11:09:39.378072: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-09 11:09:39.387898: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [2]:
import os
# Print the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


## Process Train Data
The original dataset is too large to fit in main memory in one go, so we process it in chunks. We split the train data into 10 parts and the test data into 20 parts, process each part separately, and save the result to disk. This avoids out-of-memory errors.

In [3]:
# --- Data processing switch ---
# PROCESS_DATA = True runs the cells that read the raw CSVs and write the
# processed files under PATH_TO_DATA.
# After processing the data once, upload the output as a Kaggle dataset,
# set PROCESS_DATA to False, attach that dataset to the notebook and point
# PATH_TO_DATA at it (see the commented-out example below).
# PATH_TO_DATA is joined to file names by plain string formatting, so it
# must end with a '/'.
PROCESS_DATA = True
PATH_TO_DATA = './data/'
#PATH_TO_DATA = '../input/amex-tensorflow/data/'

# --- Training switch ---
# TRAIN_MODEL = True runs the training cell, which saves one weight file
# per fold under PATH_TO_MODEL.
# After training once, upload the weights as a Kaggle dataset, set
# TRAIN_MODEL to False, attach that dataset to the notebook and point
# PATH_TO_MODEL at it (see the commented-out example below).
# PATH_TO_MODEL must also end with a '/'.
TRAIN_MODEL = True
PATH_TO_MODEL = './model/'
#PATH_TO_MODEL = '../input/amex-data-for-transformers-and-rnns/model/'

# --- Inference switch ---
# INFER_TEST = True runs test inference and writes the submission file.
INFER_TEST = True

In [4]:
import cupy, cudf
import numpy as np
import pandas as pd
import gc

if PROCESS_DATA:
    # TARGETS: replace the customer_ID string by an int64 built from its
    # last 16 hex digits.
    targets = cudf.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv')
    targets['customer_ID'] = (
        targets['customer_ID']
        .str[-16:]
        .str.hex_to_int()
        .astype('int64')
    )
    print(f"There are {len(targets)} train targets")

    # COLUMN NAMES: one data row is enough to learn the header.
    T_COLS = cudf.read_csv('../input/amex-default-prediction/train_data.csv', nrows=1).columns
    print(f"There are {len(T_COLS)} train dataframe columns")

    # CUSTOMER IDS: read only the id column, with pandas instead of cudf
    # to avoid a memory error, and hash it the same way as the targets.
    train = pd.read_csv('/kaggle/input/amex-default-prediction/train_data.csv', usecols=['customer_ID'])
    train['customer_ID'] = (
        train['customer_ID']
        .map(lambda cid: int(cid[-16:], 16))
        .astype('int64')
    )
    # De-duplicated ids, flattened to a 1-D array.
    customers = train.drop_duplicates().sort_index().values.flatten()
    print(f"There are {len(customers)} unique customers in train.")

There are 458913 train targets
There are 190 train dataframe columns
There are 458913 unique customers in train.


In [5]:
# Calculate size of each separate file
def get_rows(customers, train, NUM_FILES=10, verbose = ''):
    """Count how many rows of `train` fall into each of NUM_FILES customer chunks.

    `customers` is cut into NUM_FILES consecutive slices of
    len(customers) // NUM_FILES ids each; the last slice also takes the
    remainder. For every slice, the rows of `train` whose `customer_ID`
    is in that slice are counted.

    Parameters
    ----------
    customers : sliceable sequence of customer ids
    train : DataFrame with a `customer_ID` column
    NUM_FILES : number of chunks
    verbose : label used in the progress messages; '' disables printing

    Returns
    -------
    list with one row count per chunk, in chunk order
    """
    announce = verbose != ''
    per_file = len(customers) // NUM_FILES
    if announce:
        print(f"We will split {verbose} data into {NUM_FILES} separate files.")
        print(f"There will be {per_file} customers in each file (except the last file).")
        print("Below are number of rows in each file:")

    last_file = NUM_FILES - 1
    row_counts = []
    for file_no in range(NUM_FILES):
        begin = file_no * per_file
        # the last chunk runs to the end so no customer is left out
        stop = None if file_no == last_file else begin + per_file
        members = customers[begin:stop]
        in_chunk = train.customer_ID.isin(members)
        row_counts.append(len(train.loc[in_chunk]))

    if announce:
        print( row_counts )

    return row_counts

if PROCESS_DATA:
    # Number of chunks the train CSV is split into.
    NUM_FILES = 10
    # rows[k] is the number of CSV rows that belong to the customers of chunk k.
    rows = get_rows(customers, train, NUM_FILES = NUM_FILES, verbose = 'train')

We will split train data into 10 separate files.
There will be 45891 customers in each file (except the last file).
Below are number of rows in each file:
[553403, 552855, 554025, 554330, 552004, 552378, 552822, 553151, 553493, 552990]


## Pre-process and Feature Engineering
Now we will do the following using [RAPIDS](https://rapids.ai/):
* Reduce the memory usage of the `customer_ID` column by converting it to int64.
* Reduce the memory usage of the date column by splitting it into small integer year/month/day columns (the original date column is then deleted).
* Fill NaNs.
* Label encode the categorical columns.
* Reduce memory usage by downcasting column dtypes.
* Pad every customer to 13 rows and convert the data into a 3D array of shape (customers, 13, 188), i.e. sequence length 13 and feature length 188 per customer.

The columns have been rearranged to have the 11 categorical features first. This makes building the TensorFlow model later easier.

In [6]:
def feature_engineering(train, PAD_CUSTOMER_TO_13_ROWS = True, targets = None):
    """Turn one raw chunk of the AMEX data into a sorted, encoded, padded frame.

    Steps, in order: hash `customer_ID` to int64, split the `S_2` date into
    year/month/day, label encode the 11 categorical columns, downcast
    dtypes, optionally pad every customer to 13 rows, optionally merge the
    targets, fill remaining NaNs with -0.5, sort by customer and date, and
    reorder the columns so the categorical ones come first.

    Parameters
    ----------
    train : DataFrame holding the raw columns (`customer_ID`, `S_2`, `D_63`,
        `D_64`, the columns listed in CATS, and the remaining features).
        It is used with cudf functions. The passed-in frame is modified in
        place by the column assignments below.
    PAD_CUSTOMER_TO_13_ROWS : if True, customers with fewer than 13 rows get
        extra rows with -1 in the numeric columns and 0 in the categorical
        columns.
    targets : optional frame with `customer_ID` and `target` columns; when
        given, `target` is left-merged onto every row and cast to int8.

    Returns
    -------
    DataFrame sorted by customer and date, without the year/month/day
    columns, with columns ordered as `customer_ID`, the 11 categorical
    columns, then all remaining columns.
    """
        
    # REDUCE STRING COLUMNS 
    # from 64 bytes to 8 bytes, and 10 bytes to 3 bytes respectively
    # customer_ID keeps only its last 16 hex digits, read as an int64
    train['customer_ID'] = train['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    train.S_2 = cudf.to_datetime( train.S_2 )
    train['year'] = (train.S_2.dt.year-2000).astype('int8')
    train['month'] = (train.S_2.dt.month).astype('int8')
    train['day'] = (train.S_2.dt.day).astype('int8')
    del train['S_2']
        
    # LABEL ENCODE CAT COLUMNS (and reduce to 1 byte)
    # with 0: padding, 1: nan, 2,3,4,etc: values
    d_63_map = {'CL':2, 'CO':3, 'CR':4, 'XL':5, 'XM':6, 'XZ':7}
    train['D_63'] = train.D_63.map(d_63_map).fillna(1).astype('int8')

    d_64_map = {'-1':2,'O':3, 'R':4, 'U':5}
    train['D_64'] = train.D_64.map(d_64_map).fillna(1).astype('int8')
    
    CATS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68']
    OFFSETS = [2,1,2,2,3,2,3,2,2] #2 minus minimal value in full train csv
    # then 0 will be padding, 1 will be NAN, 2,3,4,etc will be values
    for c,s in zip(CATS,OFFSETS):
        # shift first, then fill: NaN stays NaN through the addition and becomes 1
        train[c] = train[c] + s
        train[c] = train[c].fillna(1).astype('int8')
    CATS += ['D_63','D_64']
    
    # ADD NEW FEATURES HERE
    # EXAMPLE: train['feature_189'] = etc etc etc
    # EXAMPLE: train['feature_190'] = etc etc etc
    # IF CATEGORICAL, THEN ADD TO CATS WITH: CATS += ['feature_190'] etc etc etc
    
    # REDUCE MEMORY DTYPE
    # (64-bit ints and floats are halved; the id and date columns are left alone)
    SKIP = ['customer_ID','year','month','day']
    for c in train.columns:
        if c in SKIP: continue
        if str( train[c].dtype )=='int64':
            train[c] = train[c].astype('int32')
        if str( train[c].dtype )=='float64':
            train[c] = train[c].astype('float32')
            
    # PAD ROWS SO EACH CUSTOMER HAS 13 ROWS
    if PAD_CUSTOMER_TO_13_ROWS:
        # tmp: number of rows per customer
        tmp = train[['customer_ID']].groupby('customer_ID').customer_ID.agg('count')
        # more: one entry per missing row, holding the id of the customer it pads
        more = cupy.array([],dtype='int64') 
        for j in range(1,13):
            i = tmp.loc[tmp==j].index.values
            more = cupy.concatenate([more,cupy.repeat(i,13-j)])
        # Borrow the first len(more) rows as a template for the padding rows.
        # NOTE(review): assumes the chunk has at least len(more) rows - confirm.
        df = train.iloc[:len(more)].copy().fillna(0)
        # year/month/day also become -1 here
        df = df * 0 - 1 #pad numerical columns with -1
        df[CATS] = (df[CATS] * 0).astype('int8') #pad categorical columns with 0
        df['customer_ID'] = more
        train = cudf.concat([train,df],axis=0,ignore_index=True)
        
    # ADD TARGETS (and reduce to 1 byte)
    if targets is not None:
        train = train.merge(targets,on='customer_ID',how='left')
        train.target = train.target.astype('int8')
        
    # FILL NAN
    train = train.fillna(-0.5) #this applies to numerical columns
    
    # SORT BY CUSTOMER THEN DATE
    # (padding rows carry year/month/day = -1; presumably they sort before a
    # customer's real rows - confirm)
    train = train.sort_values(['customer_ID','year','month','day']).reset_index(drop=True)
    train = train.drop(['year','month','day'],axis=1)
    
    # REARRANGE COLUMNS WITH 11 CATS FIRST
    # NOTE(review): columns[1:] assumes customer_ID is the first column - confirm.
    COLS = list(train.columns[1:])
    COLS = ['customer_ID'] + CATS + [c for c in COLS if c not in CATS]
    train = train[COLS]
    
    return train

In [7]:
if PROCESS_DATA:
    # Create the output directory once, before the loop. exist_ok=True
    # replaces the per-iteration "check, then create" pattern and is safe
    # when the directory already exists.
    os.makedirs(PATH_TO_DATA, exist_ok=True)

    # CREATE PROCESSED TRAIN FILES AND SAVE TO DISK
    for k in range(NUM_FILES):

        # READ CHUNK OF TRAIN CSV FILE
        # rows[:k] are the data rows that belong to earlier chunks
        skip = int(np.sum( rows[:k] ) + 1) #the plus one is for skipping header
        train = cudf.read_csv('../input/amex-default-prediction/train_data.csv', nrows=rows[k],
                              skiprows=skip, header=None, names=T_COLS)

        # FEATURE ENGINEER DATAFRAME
        train = feature_engineering(train, targets = targets)

        # SAVE FILES
        print(f'Train_File_{k+1} has {train.customer_ID.nunique()} customers and shape',train.shape)
        # one (customer_ID, target) row per customer, in the frame's row order
        tar = train[['customer_ID','target']].drop_duplicates().sort_index()
        tar.to_parquet(f'{PATH_TO_DATA}targets_{k+1}.pqt',index=False)
        # drop the first (customer_ID) and last (target) columns, then view
        # the rows as blocks of 13 rows x 188 features
        data = train.iloc[:,1:-1].values.reshape((-1,13,188))
        cupy.save(f'{PATH_TO_DATA}data_{k+1}',data.astype('float32'))

    # CLEAN MEMORY
    del train, tar, data
    del targets
    gc.collect()

Train_File_1 has 45891 customers and shape (596583, 190)
Train_File_2 has 45891 customers and shape (596583, 190)
Train_File_3 has 45891 customers and shape (596583, 190)
Train_File_4 has 45891 customers and shape (596583, 190)
Train_File_5 has 45891 customers and shape (596583, 190)
Train_File_6 has 45891 customers and shape (596583, 190)
Train_File_7 has 45891 customers and shape (596583, 190)
Train_File_8 has 45891 customers and shape (596583, 190)
Train_File_9 has 45891 customers and shape (596583, 190)
Train_File_10 has 45894 customers and shape (596622, 190)


## AMEX metric code for Model Evaluation

In [8]:
# COMPETITION METRIC FROM Konstantin Yakovlev
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Compute the competition metric for labels `y_true` and scores `y_pred`.

    The result is the mean of two parts:
    * the share of all positives found among the highest-scored rows whose
      cumulative weight stays within 4% of the total weight, and
    * the weighted Gini of the score ordering divided by the weighted Gini
      of the label ordering.
    Rows with label 0 weigh 20, all other rows weigh 1.

    Parameters
    ----------
    y_true : 1-D sequence of labels
    y_pred : 1-D sequence of scores, same length as `y_true`

    Returns
    -------
    0.5 * (gini ratio + top-4% capture rate)
    """

    def _ranked(col):
        """Return the (label, score) rows sorted descending by `col`, plus their weights."""
        pairs = np.array([y_true, y_pred]).T
        ordered = pairs[pairs[:, col].argsort()[::-1]]
        return ordered, np.where(ordered[:, 0] == 0, 20, 1)

    def _weighted_gini(col):
        """Weighted Gini of the labels when rows are ranked by column `col`."""
        ordered, w = _ranked(col)
        random_share = np.cumsum(w / np.sum(w))
        weighted_hits = ordered[:, 0] * w
        lorentz = np.cumsum(weighted_hits) / np.sum(weighted_hits)
        return np.sum((lorentz - random_share) * w)

    # capture rate within the top 4% of cumulative weight (ranked by score)
    by_score, w_score = _ranked(1)
    within_cut = np.cumsum(w_score) <= int(0.04 * np.sum(w_score))
    top_four = np.sum(by_score[within_cut][:, 0]) / np.sum(by_score[:, 0])

    # Gini of the score ranking (column 1), normalised by the label ranking (column 0)
    gini_model = _weighted_gini(1)
    gini_best = _weighted_gini(0)

    return 0.5 * (gini_model / gini_best + top_four)

## Build Model

In [9]:
# Gated Recurrent Unit Model, Find more on the link below
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRU

def build_model():
    """Build and compile the GRU classifier.

    The input has shape (13, 188). The first 11 feature columns are each
    passed through their own Embedding(10, 4) layer, and the remaining
    columns are used as they are. Everything is concatenated along the
    feature axis and fed to a GRU with 128 units that returns only its
    last state, followed by Dense(64, relu), Dense(32, relu) and a single
    sigmoid output.

    Returns
    -------
    tf.keras.Model compiled with binary cross-entropy and
    Adam(learning_rate=0.001)
    """

    # INPUT - FIRST 11 COLUMNS ARE CAT, NEXT 177 ARE NUMERIC
    inp = tf.keras.Input(shape=(13,188))

    # ONE EMBEDDING LAYER PER CATEGORICAL COLUMN
    embeddings = [
        tf.keras.layers.Embedding(
            10, 4,
            embeddings_initializer='uniform',
            embeddings_regularizer=None,
            activity_regularizer=None,
            embeddings_constraint=None,
            mask_zero=False,
            input_length=None,
        )(inp[:,:,k])
        for k in range(11)
    ]

    # NUMERIC COLUMNS FIRST, THEN THE 11 EMBEDDINGS
    x = tf.keras.layers.Concatenate()([inp[:,:,11:]] + embeddings)

    # SIMPLE RNN BACKBONE
    x = tf.keras.layers.GRU(units=128, return_sequences=False)(x)
    for width in (64, 32):
        x = tf.keras.layers.Dense(width, activation='relu')(x)

    # OUTPUT
    out = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    # COMPILE MODEL
    model = tf.keras.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.BinaryCrossentropy(),
    )

    return model

## Train Model

In [10]:
if TRAIN_MODEL:

    def _load_files(file_ids):
        """Load the saved arrays and targets of the given file numbers.

        Returns (X, y): the per-file arrays concatenated along axis 0 and
        the matching `target` values, both in the order of `file_ids`.
        """
        arrays = []; frames = []
        for k in file_ids:
            arrays.append( np.load(f'{PATH_TO_DATA}data_{k}.npy') )
            frames.append( pd.read_parquet(f'{PATH_TO_DATA}targets_{k}.pqt') )
        X = np.concatenate(arrays, axis=0)
        del arrays  # drop the per-file copies before building y
        y = pd.concat(frames).target.values
        return X, y

    # Labels and out-of-fold predictions, accumulated across folds.
    true = np.array([])
    oof = np.array([])
    VERBOSE = 2 # use 1 for interactive

    for fold in range(5):

        # Fold f validates on files 2f+1 and 2f+2 and trains on the other eight.
        valid_idx = [2*fold+1, 2*fold+2]
        train_idx = [x for x in range(1, 11) if x not in valid_idx]

        print('#'*25)
        print(f'### Fold {fold+1} with valid files', valid_idx)

        # READ TRAIN DATA FROM DISK
        X_train, y_train = _load_files(train_idx)
        print('### Training data shapes', X_train.shape, y_train.shape)

        # READ VALID DATA FROM DISK
        X_valid, y_valid = _load_files(valid_idx)
        print('### Validation data shapes', X_valid.shape, y_valid.shape)
        print('#'*25)

        # BUILD AND TRAIN MODEL (fresh Keras session and model for every fold)
        K.clear_session()
        model = build_model()
        h = model.fit(
            X_train, y_train,
            validation_data=(X_valid, y_valid),
            batch_size=512,
            epochs=8,
            verbose=VERBOSE,
        )
        if not os.path.exists(PATH_TO_MODEL):
            os.makedirs(PATH_TO_MODEL)
        model.save_weights(f'{PATH_TO_MODEL}gru_fold_{fold+1}.h5')

        # INFER VALID DATA
        print('Inferring validation data...')
        p = model.predict(X_valid, batch_size=512, verbose=VERBOSE).flatten()

        print()
        print(f'Fold {fold+1} CV=', amex_metric_mod(y_valid, p) )
        print()
        true = np.concatenate([true, y_valid])
        oof = np.concatenate([oof, p])

        # CLEAN MEMORY
        del model, X_train, y_train, X_valid, y_valid, p
        gc.collect()

    # PRINT OVERALL RESULTS
    print('#'*25)
    print('Overall CV =', amex_metric_mod(true, oof) )
    K.clear_session()

#########################
### Fold 1 with valid files [1, 2]
### Training data shapes (367131, 13, 188) (367131,)
### Validation data shapes (91782, 13, 188) (91782,)
#########################


2022-08-09 11:20:41.735131: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3589072656 exceeds 10% of free system memory.
2022-08-09 11:20:46.007281: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3589072656 exceeds 10% of free system memory.
2022-08-09 11:20:49.081957: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/8


2022-08-09 11:20:52.598127: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005
2022-08-09 11:21:02.928017: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 897260832 exceeds 10% of free system memory.
2022-08-09 11:21:04.221465: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 897260832 exceeds 10% of free system memory.


718/718 - 17s - loss: 0.2406 - val_loss: 0.2331
Epoch 2/8
718/718 - 9s - loss: 0.2271 - val_loss: 0.2292
Epoch 3/8
718/718 - 9s - loss: 0.2239 - val_loss: 0.2298
Epoch 4/8
718/718 - 9s - loss: 0.2213 - val_loss: 0.2267
Epoch 5/8
718/718 - 10s - loss: 0.2191 - val_loss: 0.2255
Epoch 6/8
718/718 - 9s - loss: 0.2172 - val_loss: 0.2268
Epoch 7/8
718/718 - 9s - loss: 0.2152 - val_loss: 0.2259
Epoch 8/8
718/718 - 9s - loss: 0.2126 - val_loss: 0.2262
Inferring validation data...


2022-08-09 11:22:11.044883: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 897260832 exceeds 10% of free system memory.


180/180 - 1s

Fold 1 CV= 0.7841543630128613

#########################
### Fold 2 with valid files [3, 4]
### Training data shapes (367131, 13, 188) (367131,)
### Validation data shapes (91782, 13, 188) (91782,)
#########################
Epoch 1/8
718/718 - 13s - loss: 0.2378 - val_loss: 0.2366
Epoch 2/8
718/718 - 9s - loss: 0.2266 - val_loss: 0.2379
Epoch 3/8
718/718 - 10s - loss: 0.2236 - val_loss: 0.2271
Epoch 4/8
718/718 - 9s - loss: 0.2206 - val_loss: 0.2298
Epoch 5/8
718/718 - 9s - loss: 0.2187 - val_loss: 0.2248
Epoch 6/8
718/718 - 9s - loss: 0.2164 - val_loss: 0.2248
Epoch 7/8
718/718 - 9s - loss: 0.2144 - val_loss: 0.2243
Epoch 8/8
718/718 - 10s - loss: 0.2118 - val_loss: 0.2261
Inferring validation data...
180/180 - 1s

Fold 2 CV= 0.7792243451201806

#########################
### Fold 3 with valid files [5, 6]
### Training data shapes (367131, 13, 188) (367131,)
### Validation data shapes (91782, 13, 188) (91782,)
#########################
Epoch 1/8
718/718 - 14s - loss: 0.24

## Process Test Data
We process the test data the same way as we processed the train data.

In [11]:
if PROCESS_DATA:
    # COLUMN NAMES: one data row is enough to learn the header.
    T_COLS = cudf.read_csv('../input/amex-default-prediction/test_data.csv', nrows=1).columns
    print(f'There are {len(T_COLS)} test dataframe columns')

    # CUSTOMER IDS: read only the id column, with pandas instead of cudf
    # to avoid a memory error, and turn the last 16 hex digits of each id
    # into an int64.
    test = pd.read_csv('../input/amex-default-prediction/test_data.csv', usecols=['customer_ID'])
    test['customer_ID'] = (
        test['customer_ID']
        .map(lambda cid: int(cid[-16:], 16))
        .astype('int64')
    )

    # De-duplicated ids, flattened to a 1-D array.
    customers = test.drop_duplicates().sort_index().values.flatten()
    print(f'There are {len(customers)} unique customers in test.')

There are 190 test dataframe columns
There are 924621 unique customers in test.


In [12]:
# Number of chunks the test CSV is split into. It is set outside the
# PROCESS_DATA guard because the inference loop also iterates over
# NUM_FILES when the processed files are loaded from disk.
NUM_FILES = 20
if PROCESS_DATA:
    # Calculate size of each separate file
    # rows[k] is the number of CSV rows that belong to the customers of chunk k.
    rows = get_rows(customers, test, NUM_FILES = NUM_FILES, verbose = 'test')

We will split test data into 20 separate files.
There will be 46231 customers in each file (except the last file).
Below are number of rows in each file:
[567933, 568482, 569369, 567886, 567539, 568041, 568138, 567596, 568543, 567539, 568421, 568745, 568279, 568333, 568327, 568901, 568300, 568001, 567372, 568017]


In [13]:
if PROCESS_DATA:
    # SAVE TEST CUSTOMERS INDEX
    # Running list of customer hashes, in the order they are written to the test files.
    test_customer_hashes = cupy.array([],dtype='int64')
    
    # CREATE PROCESSED TEST FILES AND SAVE TO DISK
    # NOTE(review): PATH_TO_DATA is not created in this cell; it presumably
    # already exists from the train-processing cell - confirm.
    for k in range(NUM_FILES):

        # READ CHUNK OF TEST CSV FILE
        # rows[:k] are the data rows that belong to earlier chunks
        skip = int(np.sum( rows[:k] ) + 1) #the plus one is for skipping header
        test = cudf.read_csv('/kaggle/input/amex-default-prediction/test_data.csv', nrows=rows[k], 
                              skiprows=skip, header=None, names=T_COLS)

        # FEATURE ENGINEER DATAFRAME
        test = feature_engineering(test, targets = None)
        
        # SAVE TEST CUSTOMERS INDEX
        # one hash per customer, in the frame's row order
        cust = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
        test_customer_hashes = cupy.concatenate([test_customer_hashes,cust])

        # SAVE FILES
        print(f'Test_File_{k+1} has {test.customer_ID.nunique()} customers and shape',test.shape)
        # drop the first (customer_ID) column, then view the rows as blocks
        # of 13 rows x 188 features
        data = test.iloc[:,1:].values.reshape((-1,13,188))
        cupy.save(f'{PATH_TO_DATA}test_data_{k+1}',data.astype('float32'))
        
    # SAVE CUSTOMER INDEX OF ALL TEST FILES
    # The inference cell loads this file to reorder the submission rows.
    cupy.save(f'{PATH_TO_DATA}test_hashes_data', test_customer_hashes)

    # CLEAN MEMORY
    del test, data
    gc.collect()

Test_File_1 has 46231 customers and shape (601003, 189)
Test_File_2 has 46231 customers and shape (601003, 189)
Test_File_3 has 46231 customers and shape (601003, 189)
Test_File_4 has 46231 customers and shape (601003, 189)
Test_File_5 has 46231 customers and shape (601003, 189)
Test_File_6 has 46231 customers and shape (601003, 189)
Test_File_7 has 46231 customers and shape (601003, 189)
Test_File_8 has 46231 customers and shape (601003, 189)
Test_File_9 has 46231 customers and shape (601003, 189)
Test_File_10 has 46231 customers and shape (601003, 189)
Test_File_11 has 46231 customers and shape (601003, 189)
Test_File_12 has 46231 customers and shape (601003, 189)
Test_File_13 has 46231 customers and shape (601003, 189)
Test_File_14 has 46231 customers and shape (601003, 189)
Test_File_15 has 46231 customers and shape (601003, 189)
Test_File_16 has 46231 customers and shape (601003, 189)
Test_File_17 has 46231 customers and shape (601003, 189)
Test_File_18 has 46231 customers and sha

## Infer test data

In [14]:
if INFER_TEST:
    # SUBMISSION TEMPLATE
    sub = cudf.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')

    # REARRANGE SUB ROWS TO MATCH PROCESSED TEST FILES
    # (hash the ids like the test files, then select rows in saved hash order)
    sub['hash'] = sub['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    test_hash_index = cupy.load(f'{PATH_TO_DATA}test_hashes_data.npy')
    sub = sub.set_index('hash').loc[test_hash_index].reset_index(drop=True)

    # [start, end) is the row range of `sub` covered by the current test file
    start = 0
    end = 0

    for k in range(NUM_FILES):
        # BUILD MODEL
        K.clear_session()
        model = build_model()

        # LOAD TEST DATA
        print(f'Inferring Test_File_{k+1}')
        X_test = np.load(f'{PATH_TO_DATA}test_data_{k+1}.npy')
        end = start + len(X_test)

        # INFER 5 FOLD MODELS AND AVERAGE THEIR PREDICTIONS
        p = None
        for j in range(5):
            model.load_weights(f'{PATH_TO_MODEL}gru_fold_{j+1}.h5')
            fold_pred = model.predict(X_test, batch_size=512, verbose=0).flatten()
            p = fold_pred if p is None else p + fold_pred
        p = p / 5.0

        # SAVE TEST PREDICTIONS (label slice is inclusive, hence end-1)
        sub.loc[start:end-1,'prediction'] = p
        start = end

        # CLEAN MEMORY
        del model, X_test, p, fold_pred
        gc.collect()

Inferring Test_File_1
Inferring Test_File_2
Inferring Test_File_3
Inferring Test_File_4
Inferring Test_File_5
Inferring Test_File_6
Inferring Test_File_7
Inferring Test_File_8
Inferring Test_File_9
Inferring Test_File_10
Inferring Test_File_11
Inferring Test_File_12
Inferring Test_File_13
Inferring Test_File_14
Inferring Test_File_15
Inferring Test_File_16
Inferring Test_File_17
Inferring Test_File_18
Inferring Test_File_19
Inferring Test_File_20


## Submission File

In [15]:
if INFER_TEST:
    # Write the predictions without the index column, then show the frame's
    # shape and first rows as a quick sanity check.
    sub.to_csv('submission.csv',index=False)
    print('Submission file shape is', sub.shape )
    display(sub.head())

Submission file shape is (924621, 2)


Unnamed: 0,customer_ID,prediction
0,038be0571bd6b3776cb8512731968f4de302c811030124...,0.003108
1,0074a0233ef766b52884608cc8cf9098f59d885b5d59fc...,0.000215
2,060b8b7f30f795a0e93995d45b29461ffa6ece0eeb5c3d...,0.093942
3,03a1d125bdd776000bf0b28238d0bea240ad581d332e70...,0.15348
4,0290f245dd35ba899af52316ccc62b2627e7ae18cd76a2...,0.277354
