In [None]:
from IPython.core.display import display, HTML, clear_output

from scipy.spatial import distance_matrix
import pandas as pd
import numpy as np
import random
import shutil
import os
import gc

from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, minmax_scale
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.layers import Activation
from tensorflow.keras.backend import sigmoid

from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric

import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts

import lightgbm as lgb

from joblib import Parallel, delayed

from tqdm.notebook import tqdm

import warnings
warnings.simplefilter(action='ignore')


In [3]:
# Input locations: the competition data directory and an extra directory
# that holds the CSV with the per-row fold assignment.
DATA_PATH = '../input/optiver-realized-volatility-prediction'
DATA_EXTRA_PATH = '../input/optiver'
TRAIN_PATH = f'{DATA_PATH}/train.csv'
TEST_PATH = f'{DATA_PATH}/test.csv'
TRAIN_FOLD_PATH = f'{DATA_EXTRA_PATH}/train_fold.csv'


# Function to process features as input to FFNN model
def process_nn_data(train_nn, test_nn):
    """Prepare train/test feature frames for the feed-forward network.

    Every column in the notebook-level ``colNames`` list is mapped to a normal
    distribution by a per-column ``QuantileTransformer`` that is fitted on the
    training rows and then applied to the test rows. Remaining NaNs in both
    frames are filled with the training-set column means. Finally the id
    columns (and, for train, the target) are re-attached from the
    notebook-level ``train_master`` / ``test_master`` frames; that assignment
    is index-aligned.

    Parameters
    ----------
    train_nn, test_nn : pandas.DataFrame
        Frames containing at least the ``colNames`` columns. They are not
        modified; the function works on its own copies.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_nn, test_nn)`` with the transformed feature columns plus
        ``stock_id``/``time_id`` (and ``target`` for train).
    """
    # Select the feature columns and neutralise infinities on fresh frames:
    # the caller's data is left untouched, and the later column assignments
    # operate on an independent frame rather than on a slice of the input.
    train_nn = train_nn[colNames].replace([np.inf, -np.inf], np.nan)
    test_nn = test_nn[colNames].replace([np.inf, -np.inf], np.nan)

    for col in colNames:
        # One transformer per column, fitted on train only and reused for test.
        qt = QuantileTransformer(random_state=21, n_quantiles=2000, output_distribution='normal')
        train_nn[col] = qt.fit_transform(train_nn[[col]])
        test_nn[col] = qt.transform(test_nn[[col]])

    # Both frames are imputed with the *training* means, computed once.
    train_means = train_nn[colNames].mean()
    train_nn[colNames] = train_nn[colNames].fillna(train_means)
    test_nn[colNames] = test_nn[colNames].fillna(train_means)

    train_nn[['stock_id','time_id','target']] = train_master[['stock_id','time_id','target']]
    test_nn[['stock_id','time_id']] = test_master[['stock_id','time_id']]

    return train_nn, test_nn

# Helper: missing-value handling shared by the TabNet train and test frames
def _fill_tabnet_missing(frame, first_feature_pos):
    """Mean-fill the first feature column, then zero-fill everything else.

    ``first_feature_pos`` is the positional index of the first column treated
    as a feature. If there is no such column the frame is returned unchanged.

    NOTE(review): the previous implementation ran ``fillna(0)`` on the whole
    frame inside its per-column loop, so only the first feature column ever
    received mean imputation and every later column was zero-filled. That
    effective result is reproduced here (without re-copying the frame once per
    column). Presumably the pre-trained TabNet weights loaded by
    ``get_tabnet_preds`` were produced with this preprocessing - confirm
    before switching to true per-column mean imputation.
    """
    feature_cols = frame.columns.to_list()[first_feature_pos:]
    if not feature_cols:
        return frame

    lead_col = feature_cols[0]
    frame[lead_col] = frame[lead_col].fillna(frame[lead_col].mean())
    return frame.fillna(0)


# Function to process features as input to TabNet model
def process_tabnet_data(train, test):
    """Build the TabNet design matrices from the train and test frames.

    Steps: replace +/-inf with NaN, impute missing values (see
    ``_fill_tabnet_missing``), drop id/target columns, label-encode
    ``stock_id`` and standard-scale every other column. Encoders and scalers
    are fitted on train and applied to test.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``row_id``, ``time_id``, ``target`` and ``stock_id``;
        columns from position 5 onward are treated as features for imputation.
    test : pandas.DataFrame
        Must contain ``row_id``, ``time_id`` and ``stock_id``; columns from
        position 3 onward are treated as features for imputation.

    Returns
    -------
    X : pandas.DataFrame
        Encoded/scaled training features.
    y : pandas.Series
        Training target.
    X_test : pandas.DataFrame
        Encoded/scaled test features, in the same column order as ``X``.
    cat_idxs : list of int
        Positions of the categorical columns in ``X``.
    cat_dims : list of int
        Number of classes for each categorical column, in the same order.
    """
    # Non in-place replace: the caller's frames are left untouched.
    train = train.replace([np.inf, -np.inf], np.nan)
    test = test.replace([np.inf, -np.inf], np.nan)

    train = _fill_tabnet_missing(train, 5)
    test = _fill_tabnet_missing(test, 3)

    X = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']

    X_test = test.drop(['time_id','row_id'], axis=1)
    # TabNet consumes raw arrays, so the test columns must line up with the
    # training columns position by position.
    X_test = X_test[X.columns]

    categorical_columns = []
    categorical_dims =  {}

    for col in X.columns:
        if col == 'stock_id':
            l_enc = LabelEncoder()
            X[col] = l_enc.fit_transform(X[col].values)
            X_test[col] = l_enc.transform(X_test[col].values)
            categorical_columns.append(col)
            categorical_dims[col] = len(l_enc.classes_)
        else:
            scaler = StandardScaler()
            X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1))
            X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))

    cat_idxs = [i for i, f in enumerate(X.columns) if f in categorical_columns]
    cat_dims = [categorical_dims[f] for f in X.columns if f in categorical_columns]

    return X, y, X_test, cat_idxs, cat_dims



### Models

In [7]:
# FFNN base model
def base_model(input_shape, hidden_units, stock_embedding_size, n_stock_ids=None):
    """Build the (uncompiled) feed-forward network.

    The model has two inputs - a single stock id (``stock_id``) and a vector of
    numeric features (``num_data``). The stock id is embedded, flattened and
    concatenated with the numeric features, then passed through a stack of
    swish-activated dense layers to a single linear output.

    Parameters
    ----------
    input_shape : int
        Length of the numeric feature vector.
    hidden_units : iterable of int
        Width of each hidden dense layer, in order.
    stock_embedding_size : int
        Dimension of the stock-id embedding.
    n_stock_ids : int, optional
        Size of the embedding vocabulary. When omitted it is derived as
        ``max(cat_data) + 1`` from the notebook-level ``cat_data`` variable,
        which therefore has to exist before this function is called.

    Returns
    -------
    keras.Model
    """
    if n_stock_ids is None:
        # Fallback to the notebook-level stock-id column.
        n_stock_ids = max(cat_data) + 1

    # Each instance consists of two inputs: one stock id and its numeric features
    stock_id_input = keras.Input(shape=(1,), name='stock_id')
    num_input = keras.Input(shape=(input_shape,), name='num_data')

    # Embed the stock id, flatten it and concatenate with the numeric features
    stock_embedded = keras.layers.Embedding(n_stock_ids, stock_embedding_size,
                                           input_length=1, name='stock_embedding')(stock_id_input)
    stock_flattened = keras.layers.Flatten()(stock_embedded)
    out = keras.layers.Concatenate()([stock_flattened, num_input])

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = keras.layers.Dense(n_hidden, activation='swish')(out)

    # A single linear output: the predicted target
    out = keras.layers.Dense(1, activation='linear', name='prediction')(out)

    model = keras.Model(
        inputs = [stock_id_input, num_input],
        outputs = out,
    )

    return model

# Function to train FFNN
def train_and_evaluate_nn(train_nn, test_nn, n_folds=5, n_models_per_fold=3):
    """Train the FFNN with K-fold CV; return averaged test and OOF predictions.

    For every fold, ``n_models_per_fold`` networks are built with
    ``base_model`` and trained with the RMSPE loss. Their validation
    predictions are averaged into the out-of-fold vector; their test
    predictions (clipped to ``[0, 1e10]``) are averaged over all folds and
    models.

    Relies on notebook-level names: ``colNames``, ``kfolds``, ``input_shape``,
    ``hidden_units``, ``stock_embedding_size``, ``base_model``, ``rmspe`` and
    ``root_mean_squared_per_error``.

    Parameters
    ----------
    train_nn : pandas.DataFrame
        Training frame with the ``colNames`` features, ``stock_id`` and
        ``target``. Its index labels are used as positions into the OOF
        array, so it needs a 0..n-1 index.
    test_nn : pandas.DataFrame
        Test frame with the ``colNames`` features and ``stock_id``.
    n_folds : int, default 5
        Number of folds; fold ids ``0 .. n_folds-1`` are looked up in ``kfolds``.
    n_models_per_fold : int, default 3
        Number of networks trained on each fold.

    Returns
    -------
    tuple of numpy.ndarray
        ``(test_predictions_nn, oof_predictions_nn)``.
    """
    oof_predictions_nn = np.zeros(train_nn.shape[0])
    test_predictions_nn = np.zeros(test_nn.shape[0])

    # The same callback objects are passed to every fit call below.
    es = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, verbose=0,
        mode='min', restore_best_weights=True)

    plateau = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.2, patience=7, verbose=0,
        mode='min')

    # The scaler is fitted once on all training rows and only applied
    # (never refitted) inside the fold loop.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(train_nn[colNames].values)

    # Test inputs do not depend on the fold or the model, so build them once.
    test_inputs = [test_nn['stock_id'], scaler.transform(test_nn[colNames].values)]

    for fold in range(n_folds):
        print('CV {}/{}'.format(fold+1, n_folds))

        trn_ind = train_nn[kfolds!=fold].index
        val_ind = train_nn[kfolds==fold].index

        y_train = train_nn.loc[trn_ind, 'target']
        y_test = train_nn.loc[val_ind, 'target']

        num_data = scaler.transform(train_nn.loc[trn_ind, colNames].values)
        num_data_test = scaler.transform(train_nn.loc[val_ind, colNames].values)

        cat_data = train_nn['stock_id'][trn_ind]
        cat_data_test = train_nn['stock_id'][val_ind]

        # Several NN models per fold
        for ff in range(n_models_per_fold):

            model = base_model(input_shape, hidden_units, stock_embedding_size)

            model.compile(
                keras.optimizers.Adam(learning_rate=0.006),
                loss=root_mean_squared_per_error
            )

            model.fit([cat_data, num_data],
                      y_train,
                      batch_size=2048,
                      epochs=1000,
                      validation_data=([cat_data_test, num_data_test], y_test),
                      callbacks=[es, plateau],
                      validation_batch_size=len(y_test),
                      shuffle=True,
                      verbose = 0)

            preds = model.predict([cat_data_test, num_data_test]).ravel()
            # Summed here, divided by n_models_per_fold after the loops.
            oof_predictions_nn[val_ind] += preds

            score = round(rmspe(y_true=y_test, y_pred=preds),5)
            print('Fold {}/{}: {}'.format(fold, ff, score))

            test_predictions_nn += model.predict(test_inputs).ravel().clip(0,1e10)
            gc.collect()

        del num_data, num_data_test, cat_data, cat_data_test, y_train, y_test
        gc.collect()

    # Each test row was predicted by every model of every fold; each training
    # row only by the models of the fold that held it out.
    test_predictions_nn = test_predictions_nn / (n_folds * n_models_per_fold)
    oof_predictions_nn = oof_predictions_nn / n_models_per_fold
    rmspe_score = rmspe(train_nn['target'], oof_predictions_nn)
    print(f'Our out of folds RMSPE is {rmspe_score}')

    return test_predictions_nn, oof_predictions_nn

# Function to train TabNet model
def train_tabnet(X, y, X_test, tabnet_params):
    """Train one TabNetRegressor per fold and collect OOF / test predictions.

    Fold membership comes from the notebook-level ``kfolds`` series (fold ids
    0..4). Each fitted model is saved under ``BASE_PATH`` and its test
    predictions contribute one fifth to the returned test vector.

    Parameters
    ----------
    X, y : training features and target (indexed with boolean fold masks).
    X_test : test features.
    tabnet_params : dict of keyword arguments for ``TabNetRegressor``.

    Returns
    -------
    (test_predictions, oof_predictions) - the OOF array has shape (n_rows, 1).
    """
    n_rows = X.shape[0]
    oof_predictions = np.zeros((n_rows, 1))
    test_predictions = np.zeros(X_test.shape[0])

    for fold in range(5):

        print(f'Training fold {fold + 1}')

        # Boolean masks selecting the training and held-out rows of this fold
        is_trn = kfolds != fold
        is_val = kfolds == fold

        X_trn, y_trn = X[is_trn].values, y[is_trn].values.reshape(-1, 1)
        X_val, y_val = X[is_val].values, y[is_val].values.reshape(-1, 1)

        clf = TabNetRegressor(**tabnet_params)
        clf.fit(
            X_trn, y_trn,
            eval_set=[(X_val, y_val)],
            max_epochs=200,
            patience=50,
            batch_size=1024*10,
            virtual_batch_size=128*10,
            num_workers=4,
            drop_last=False,
            eval_metric=[RMSPE],
            loss_fn=RMSPELoss,
        )

        # Persist the fitted model for this fold
        saving_path_name = BASE_PATH + '/models/tabnet_latest_2/model_fold_{}'.format(fold)
        clf.save_model(saving_path_name)

        oof_predictions[is_val] = clf.predict(X_val)
        test_predictions += clf.predict(X_test.values).flatten() / 5

    print(f'OOF score across folds: {rmspe(y, oof_predictions.flatten())}')

    return test_predictions, oof_predictions

# Function to get TabNet predictions from pre-trained models
def get_tabnet_preds(X_test, tabnet_params, model_dir='../input/optiver-tabnet', n_folds=5):
    """Average test predictions from the stored per-fold TabNet models.

    Each fold is stored as a directory ``model_fold_<k>`` under ``model_dir``.
    The directory is zipped into the current working directory and the
    resulting archive is handed to ``TabNetRegressor.load_model``.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test features; ``X_test.values`` is passed to ``predict``.
    tabnet_params : dict
        Keyword arguments used to construct the ``TabNetRegressor``.
    model_dir : str, default '../input/optiver-tabnet'
        Directory containing the ``model_fold_<k>`` sub-directories.
    n_folds : int, default 5
        Number of fold models to load (``k = 0 .. n_folds-1``).

    Returns
    -------
    numpy.ndarray
        Mean prediction over the fold models, clipped to ``[0, 1e10]``.
    """
    clf = TabNetRegressor(**tabnet_params)

    fold_preds = []
    for fold in range(n_folds):
        fold_name = f'model_fold_{fold}'

        # Writes ./model_fold_<k>.zip from the stored model directory.
        shutil.make_archive(fold_name, 'zip', f'{model_dir}/{fold_name}')

        clf.load_model(f'./{fold_name}.zip')
        fold_preds.append(clf.predict(X_test.values).squeeze(-1))

    preds = np.clip(np.mean(fold_preds, axis=0), 0, 1e10)

    return preds

#https://bignerdranch.com/blog/implementing-swish-activation-function-in-keras/
def swish(x, beta = 1):
    """Swish activation: ``x`` multiplied by ``sigmoid(beta * x)``."""
    gate = sigmoid(beta * x)
    return x * gate

# Register the custom activation under the name 'swish' so that layers can
# request it by name (base_model builds its Dense layers with activation='swish').
get_custom_objects().update({'swish': Activation(swish)})


### Metrics

In [8]:
# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    No guard is applied for zeros in ``y_true``.
    """
    pct_err = (y_true - y_pred) / y_true
    mean_sq = np.mean(np.square(pct_err))
    return np.sqrt(mean_sq)

# Function to early stop with root mean squared percentage error in LGBM
def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation callback: RMSPE of ``y_pred`` against the dataset labels.

    Returns the tuple ``('RMSPE', value, False)``; the trailing ``False`` is
    the "higher is better" flag.
    """
    labels = lgb_train.get_label()
    metric_value = rmspe(labels, y_pred)
    return 'RMSPE', metric_value, False

# Function to calculate the root mean squared percentage error in TF
def root_mean_squared_per_error(y_true, y_pred):
    """RMSPE written with Keras backend ops so it can be used as a model loss."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = K.mean(K.square(relative_error))
    return K.sqrt(mean_squared)

# Class to be able to use custom RMSPE loss in TabNet
class RMSPE(Metric):
    """RMSPE evaluation metric for TabNet (named "rmspe", lower is better)."""

    def __init__(self):
        self._maximize = False
        self._name = "rmspe"

    def __call__(self, y_true, y_score):
        # Same formula as the module-level rmspe helper, on NumPy inputs.
        pct_err = (y_true - y_score) / y_true
        return np.sqrt(np.mean(np.square(pct_err)))

# Function to calculate the root mean squared percentage error in Torch
def RMSPELoss(y_pred, y_true):
    """RMSPE as a torch loss; note the argument order is (prediction, target)."""
    pct_err = (y_true - y_pred) / y_true
    mean_sq = torch.mean(pct_err ** 2)
    return torch.sqrt(mean_sq).clone()


## Calculate Features

In [9]:
# NOTE(review): calc_distance_matrix, read_train_test, calc_features_from_raw_data,
# get_time_agg, calc_taus, calc_tick_features and calc_cluster_features are not
# defined in this part of the notebook - presumably in earlier cells; confirm
# they run before this cell on a fresh kernel.

# Calculate distance matrix from reconstructed stock prices
d_df = calc_distance_matrix()

# Kfolds are in csv file
train, test = read_train_test()
# Fold id per training row; the training functions select rows with `kfolds == fold`.
kfolds = pd.read_csv(TRAIN_FOLD_PATH)['kfold']

# Feature pipeline: each step replaces `train` / `test` with its output,
# so the statements below must run in this order.
train, test = calc_features_from_raw_data(train, test)

train = get_time_agg(train)
test = get_time_agg(test)

train = calc_taus(train)
test = calc_taus(test)

train, test = calc_tick_features(train, test)
train, test = calc_cluster_features(train, test)

# Feature columns: every column of `train` except the identifiers and the target.
colNames = [col for col in list(train.columns)
            if col not in {"stock_id", "time_id", "target", "row_id"}]

# Snapshots of the fully featurised frames; each model section starts from
# a fresh copy of these before applying its own preprocessing.
train_master = train.copy()
test_master = test.copy()

## FFNN

In [None]:
# Start from fresh copies so the FFNN preprocessing does not touch the master frames.
train = train_master.copy()
test = test_master.copy()

train, test = process_nn_data(train, test)
gc.collect()

tf.random.set_seed(42)
# The names below are read as notebook-level globals by the model code:
# train_and_evaluate_nn passes input_shape, hidden_units and
# stock_embedding_size to base_model, and base_model sizes the stock
# embedding from max(cat_data) + 1. They must be set before training starts.
hidden_units = (128,64,32)
stock_embedding_size = 24
cat_data = train['stock_id']
input_shape = len(colNames)

predictions_nn, oof_predictions_nn = train_and_evaluate_nn(train, test)

# Release the processed frames; later sections copy from the masters again.
del train, test
gc.collect()


## TabNet

In [None]:
# random_seed is not defined in this part of the notebook; presumably it seeds
# the random number generators - confirm it is defined before this cell runs.
random_seed(42)

# Fresh copies of the master frames for the TabNet-specific preprocessing
train = train_master.copy()
test = test_master.copy()

X, y, X_test, cat_idxs, cat_dims = process_tabnet_data(train, test)

# Keyword arguments used to construct every TabNetRegressor
tabnet_params = {
    'cat_idxs': cat_idxs,
    'cat_dims': cat_dims,
    'cat_emb_dim': 8,
    'n_d': 16,
    'n_a': 16,
    'n_steps': 2,
    'gamma': 1.3,
    'n_independent': 2,
    'n_shared': 2,
    'lambda_sparse': 0,
    'optimizer_fn': Adam,
    'optimizer_params': {'lr': 2e-2, 'weight_decay': 1e-5},
    'mask_type': "entmax",
    'scheduler_params': {'T_0': 200, 'T_mult': 1, 'eta_min': 1e-4, 'last_epoch': -1, 'verbose': False},
    'scheduler_fn': CosineAnnealingWarmRestarts,
    'seed': 23,
    'verbose': 10,
}

# Inference only: predictions come from the stored fold models
tabnet_predictions = get_tabnet_preds(X_test, tabnet_params)


## Final Predictions

In [None]:
# Reload the raw frames; only `test` (for its row_id column) is used below.
train, test = read_train_test()

# Equal-weight average of the three models' test predictions.
# NOTE(review): predictions_lgb is not defined in the cells visible here -
# presumably produced by an LGBM section elsewhere in the notebook; confirm
# it exists before this cell on a fresh Restart & Run All.
test['target'] = (tabnet_predictions + predictions_nn+predictions_lgb)/3.0
test[['row_id', 'target']].to_csv('submission.csv',index = False)
# Last expression of the cell: preview of the submitted rows.
test[['row_id', 'target']].head()
