### 0. Imports and requirements

In [1]:
import os
import sys
import pickle

import numpy as np
import pandas as pd


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model, layers as L

from sklearn.model_selection import train_test_split
from tqdm import tqdm

os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# let's add the root folder, it contains all the necessary useful functions for data processing
sys.path.append('../../')
sys.path.append('../')

2022-07-25 14:33:19.572863: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-25 14:33:19.701491: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-07-25 14:33:19.736599: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-07-25 14:33:21.204037: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural N

1 Physical GPUs, 1 Logical GPUs


In [2]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### 1. Data Preprocessing

* The basic neural network solution provides all data processing and all stages of preprocessing. In this notebook, we will omit this section and use ready-made data.

In [3]:
path_to_dataset = '../../../val_buckets'
dir_with_datasets = os.listdir(path_to_dataset)
dataset_val = sorted([os.path.join(path_to_dataset, x) for x in dir_with_datasets])
dataset_val

['../../../val_buckets/processed_chunk_000.pkl',
 '../../../val_buckets/processed_chunk_001.pkl',
 '../../../val_buckets/processed_chunk_002.pkl',
 '../../../val_buckets/processed_chunk_003.pkl',
 '../../../val_buckets/processed_chunk_004.pkl']

In [4]:
path_to_dataset = '../../../train_buckets'
dir_with_datasets = os.listdir(path_to_dataset)
dataset_train = sorted([os.path.join(path_to_dataset, x) for x in dir_with_datasets])
dataset_train

['../../../train_buckets/processed_chunk_000.pkl',
 '../../../train_buckets/processed_chunk_001.pkl',
 '../../../train_buckets/processed_chunk_002.pkl',
 '../../../train_buckets/processed_chunk_003.pkl',
 '../../../train_buckets/processed_chunk_004.pkl',
 '../../../train_buckets/processed_chunk_005.pkl',
 '../../../train_buckets/processed_chunk_006.pkl',
 '../../../train_buckets/processed_chunk_007.pkl',
 '../../../train_buckets/processed_chunk_008.pkl',
 '../../../train_buckets/processed_chunk_009.pkl']

### 2. Modeling

In [5]:
from data_generators import batches_generator, transaction_features
from tf_training import train_epoch, eval_model, inference
from training_aux import EarlyStopping

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
transaction_features

['currency',
 'operation_kind',
 'card_type',
 'operation_type',
 'operation_type_group',
 'ecommerce_flag',
 'payment_system',
 'income_flag',
 'mcc',
 'country',
 'city',
 'mcc_category',
 'day_of_week',
 'hour',
 'weekofyear',
 'amnt',
 'days_before',
 'hour_diff']

* All features in our model will be categorical. To represent them in the model, we use categorical embeddings. To do this, you need to set the dimension of the latent space for each categorical feature. We use [formula](https://forums.fast.ai/t/sizeof-embedding-for-categorical-variables/42608 ) from the library `fast.ai `. All mappings are stored in the `embedding_projections.pkl` file

In [7]:
with open('../constants/embedding_projections.pkl', 'rb') as f:
    embedding_projections = pickle.load(f)

* Implementing the model. We will present all input features in the form of embeddings, we will configure them to get a vector representation of the transaction. We use SpatialDropout to regularize embeddings. Let's imagine the `product` attribute as a separate embedding.
Let's feed the sequences into the `BiGRU` recurrent network and the `Conv1D` convolutional neural network. 
For a recurrent network, we use all the hidden states of the network to get an aggregated view of the transaction history - we skip all the hidden states of `BiGRU` through `AvgPooling` and through `MaxPooling'.
For a convolutional network, we will skip all embedding through 3 different branches of the convolutional network with different parameters, for each branch we will apply `GlobalMaxPooling'.
Concatenate all the results. Based on this input, we will build a small `MLP` that acts as a classifier for the target task.

In [9]:
def build_transactions_rnn(transactions_cat_features, embedding_projections, product_col_name='product', 
                          rnn_units=128, classifier_units=32, optimizer=None):
    if not optimizer:
        optimizer = keras.optimizers.Adam(lr=1e-3)
        
    inputs = []
    cat_embeds = []
    # Categorical features embedding layers
    for feature_name in transactions_cat_features:
        inp = L.Input(shape=(None, ), dtype='uint32', name=f'input_{feature_name}')
        inputs.append(inp)
        source_size, projection = embedding_projections[feature_name]
        emb = L.Embedding(source_size+1, projection, trainable=True, mask_zero=False, name=f'embedding_{feature_name}')(inp)
        cat_embeds.append(emb)
    
    # product feature
    inp = L.Input(shape=(1, ), dtype='uint32', name=f'input_product')
    inputs.append(inp)
    source_size, projection = embedding_projections['product']
    product_emb = L.Embedding(source_size+1, projection, trainable=True, mask_zero=False, name=f'embedding_product')(inp)
    product_emb_reshape = L.Reshape((projection, ))(product_emb)
    
    concated_cat_embeds = L.concatenate(cat_embeds)
    dropout_embeds = L.SpatialDropout1D(0.1)(concated_cat_embeds)
    # RNN
    sequences = L.Bidirectional(L.GRU(units=rnn_units, return_sequences=True))(dropout_embeds)
    pooled_avg_sequences = L.GlobalAveragePooling1D()(sequences)
    pooled_max_sequences = L.GlobalMaxPooling1D()(sequences)
    
    # Convolutoion NN
    # Convolution layers architecture
    n_conv_1 = n_conv_2 = n_conv_3 = 256 
    k_conv_1 = 3
    k_conv_2 = 2
    k_conv_3 = 4
    # Conv layers
    conv_1 = L.Conv1D(n_conv_1, k_conv_1, activation='relu', name='conv_1')(dropout_embeds) 
    conv_1_1 = L.Conv1D(n_conv_1, k_conv_1, activation='relu', name='conv_1_1')(conv_1)
    maxp_1 = L.GlobalMaxPooling1D(name='maxp_1')(conv_1)

    conv_2 = L.Conv1D(n_conv_2, k_conv_2, activation='relu', name='conv_2')(dropout_embeds)
    conv_2_1 = L.Conv1D(n_conv_2, k_conv_2, activation='relu', name='conv_2_1')(conv_2)
    maxp_2 = L.GlobalMaxPooling1D(name='maxp_2')(conv_2)

    conv_3 = L.Conv1D(n_conv_3, k_conv_3, activation='relu', name='conv_3')(dropout_embeds)
    conv_3_1 = L.Conv1D(n_conv_3, k_conv_3, activation='relu', name='conv_3_1')(conv_3)
    maxp_3 = L.GlobalMaxPooling1D(name='maxp_3')(conv_3)

    # concatenate RNN and CNN
    concated = L.concatenate([pooled_avg_sequences, pooled_max_sequences, product_emb_reshape, maxp_1, maxp_2, maxp_3])
    # MLP
    dense_intermediate = L.Dense(classifier_units, activation='relu', 
                                 kernel_regularizer=keras.regularizers.L1L2(1e-7, 1e-5))(concated)
    proba = L.Dense(1, activation='sigmoid')(dense_intermediate)
    
    model = Model(inputs=inputs, outputs=proba)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)
    return model

### 3. Training

In [12]:
! rm -r ../../rnn_baseline/checkpoints/tf_advanced_baseline
! mkdir ../../rnn_baseline/checkpoints/tf_advanced_baseline

* In order to detect retraining, we use Early Stopping

In [12]:
path_to_checkpoints = '../../rnn_baseline/models/commit_2/'
es = EarlyStopping(patience=3, mode='max', verbose=True, save_path=os.path.join(path_to_checkpoints, 'best_checkpoint.pt'), 
                   metric_name='ROC-AUC', save_format='tf')

In [10]:
tf.keras.backend.clear_session()
model = build_transactions_rnn(transaction_features, embedding_projections, classifier_units=128)

  super().__init__(name, **kwargs)


* Let's start the training cycle, we will log the loss every epoch, as well as the roc-auc for validation and training. We will save the weights after each epoch, as well as the best ones using early_stopping.

In [22]:
num_epochs = 50
train_batch_size = 128
val_batch_szie = 128

In [None]:
%%time
training_histories =[]
best_roc_auc = 0
for epoch in range(num_epochs):
    print(f'Starting epoch {epoch+1}')
    training_history = train_epoch(model, dataset_train, batch_size=train_batch_size, shuffle=True, cur_epoch=epoch, 
                steps_per_epoch=7270) #default steps_per_epoch=7270
    training_histories.append(training_history)
    
    val_roc_auc = eval_model(model, dataset_val, batch_size=val_batch_szie)
    if val_roc_auc>best_roc_auc:
        best_roc_auc = val_roc_auc
        model.save_weights(os.path.join(path_to_checkpoints, f'epoch_{epoch+1}_val_{val_roc_auc:.3f}.hdf5'))
        print(f'Saved model with val_roc_auc: {val_roc_auc}')
        train_roc_auc = eval_model(model, dataset_train, batch_size=val_batch_szie)
    print(f'Epoch {epoch+1} completed. Val roc-auc: {val_roc_auc}')
    
    es(val_roc_auc, model)
    
    if es.early_stop:
        print('Early stopping reached. Stop training...')
        break

Starting epoch 1
1080/7270 [===>..........................] - ETA: 13:53 - loss: 0.1391

In [27]:
print(f'Saved model with val_roc_auc: {best_roc_auc}')

Saved model with val_roc_auc: 0.7948809484933049


In [59]:
print(f'Epoch {epoch+1} completed. Train roc-auc: {train_roc_auc}, Val roc-auc: {val_roc_auc}')

Epoch 5 completed. Train roc-auc: 0.7986922694571151, Val roc-auc: 0.7801614959015882


### 4. Submission

* Everything is ready to make predictions for the test sample. You only need to prepare the data in the same format as for train.

In [21]:
test_frame = pd.read_csv('../../../test_target_contest.csv')
test_frame.head()

Unnamed: 0,app_id,product
0,1063620,0
1,1063621,0
2,1063622,1
3,1063623,1
4,1063624,2


In [22]:
path_to_test_dataset = '../../../test_buckets/'
dir_with_test_datasets = os.listdir(path_to_test_dataset)
dataset_test = sorted([os.path.join(path_to_test_dataset, x) for x in dir_with_test_datasets])

dataset_test

['../../../test_buckets/processed_chunk_000.pkl',
 '../../../test_buckets/processed_chunk_001.pkl',
 '../../../test_buckets/processed_chunk_002.pkl',
 '../../../test_buckets/processed_chunk_003.pkl',
 '../../../test_buckets/processed_chunk_004.pkl']

* A separate question is which of the constructed models to use in order to make predictions for the test. You can choose the best one by early_stopping. In this case, there is a risk that we will fit the validation sample, especially if it is not very representative, but this is the most basic option (we use it). You can make different versions of the ensembling using weights from different eras. This approach requires additional code. Finally, you can choose a model that shows good results on validation and at the same time is not too retrained for train sampling.

In [23]:
! ls $path_to_checkpoints

best_checkpoint.pt.data-00000-of-00001	epoch_2_val_0.779.hdf5
best_checkpoint.pt.index		epoch_3_val_0.774.hdf5
checkpoint				epoch_3_val_0.783.hdf5
epoch_1_val_0.500.hdf5			epoch_4_val_0.777.hdf5
epoch_1_val_0.563.hdf5			epoch_4_val_0.792.hdf5
epoch_1_val_0.757.hdf5			epoch_5_val_0.780.hdf5
epoch_1_val_0.764.hdf5			epoch_5_val_0.794.hdf5
epoch_1_val_0.774.hdf5			epoch_6_val_0.795.hdf5
epoch_1_val_0.776.hdf5			epoch_7_val_0.796.hdf5
epoch_2_val_0.770.hdf5			epoch_8_val_0.800.hdf5


In [28]:
tf.keras.backend.clear_session()
model = build_transactions_rnn(transaction_features, embedding_projections, classifier_units=128)

In [12]:
path_to_checkpoints = '../../rnn_baseline/checkpoints/tf_advanced_baseline/'
model.load_weights(os.path.join(path_to_checkpoints, 'best_checkpoint.pt'))

test_preds = inference(model, dataset_test, batch_size=128)


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7fb449a2ba30>

In [26]:
test_preds.head()

Unnamed: 0,app_id,score
0,1063655,0.014757
1,1063672,0.033401
2,1063694,0.007221
3,1063709,0.036303
4,1063715,0.020206
