# Volatility prediction with VLSTMs

(c) Damien Challet and Vincent Ragel (2023)

This notebook, together with the VLSTM.py file, contains the code used to produce the results of the following paper:

Damien Challet and Vincent Ragel, Recurrent neural networks with long and flexible memory: application to price volatility prediction (2023)

In [1]:
import os

import pdb

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Input, Dense, LSTM,  Activation, Reshape, Flatten, Dropout, Lambda, RepeatVector


from tensorflow.keras.callbacks import EarlyStopping

# Early-stopping callback passed to model.fit in the training loop: it watches the
# validation loss (to be minimised) with a patience of 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)


# Enable memory growth on every GPU that TensorFlow lists (no-op when the list is empty).
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

import VLSTM

2023-07-18 12:34:54.214055: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-18 12:34:58.523045: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:66] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda-11/lib64
2023-07-18 12:34:58.523086: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-07-18 12:34:58.523562: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device i

In [2]:
import warnings; warnings.simplefilter('ignore')
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

import scipy.special as scsp
import statsmodels.api as sm
import math
import seaborn as sns; sns.set()


In [3]:
import os
from pathlib import Path
import pickle

# Output locations: plots, pickled training histories and saved model weights
# all live under a common results directory.
dirResults="results"
dirPlots=dirResults+"/plots"
dirHistory=dirResults+"/history"
dirModels=dirResults+"/models"

# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of a separate isdir() test (e.g. when several notebooks start at once).
for _dir in (dirPlots, dirHistory, dirModels):
    os.makedirs(_dir, exist_ok=True)


    

In [4]:
# From Deep Learning for Time Series Forecasting (c) Jason Brownlee

#split a univariate sequence into samples
def split_sequence_univariate(sequence, n_steps):
    """Cut a univariate sequence into (window, next value) samples.

    Each sample pairs the n_steps consecutive elements starting at some
    position with the element that immediately follows them.

    Parameters
    ----------
    sequence : indexable and sliceable sequence
    n_steps : int
        Length of each input window.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        X stacks the windows, y the corresponding following elements.
    """
    total = len(sequence)
    # a window starting at `start` is kept only if its target index exists
    starts = [start for start in range(total) if start + n_steps < total]
    windows = [sequence[start:start + n_steps] for start in starts]
    targets = [sequence[start + n_steps] for start in starts]
    return np.array(windows), np.array(targets)


# Transforms a 2d-X and y to 3d X and y


# split a multivariate sequence into samples
def split_sequences_multivariate(sequences, n_steps):
    """Cut a 2-d array of parallel series into (window, next target) samples.

    Each sample pairs n_steps consecutive rows (all columns) with the value of
    the last column in the row that immediately follows the window.

    Parameters
    ----------
    sequences : np.ndarray indexed as [row, column]
    n_steps : int
        Number of rows in each input window.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        X stacks the windows, y the corresponding last-column targets.
    """
    n_rows = len(sequences)
    # a window starting at `first` is kept only if the row right after it exists
    firsts = [first for first in range(n_rows) if first + n_steps < n_rows]
    windows = [sequences[first:first + n_steps, :] for first in firsts]
    targets = [sequences[first + n_steps, -1] for first in firsts]
    return np.array(windows), np.array(targets)

In [5]:
data_source="Oxford"
Symbol="all"     # "all" keeps every symbol; any other value restricts DF to that symbol

#Oxford-Man Institute data
DF=pd.read_csv("data/oxfordmanrealizedvolatilityindices.csv.gz")

DF["log_rk_twoscale"]=np.log(DF["rk_twoscale"])

# Close-to-close log-returns, computed separately for each symbol.
# transform() returns a Series that carries DF's own index, so every return is
# written back to the row it was computed from, whatever the row order of the
# CSV (a groupby().apply().reset_index() result is re-indexed in group order and
# only lines up with DF when the file is already sorted by Symbol).
DF_ret=DF.groupby("Symbol")["close_price"].transform(lambda prices: np.log(prices).diff()).rename("ret_CC")

DF["ret_CC"]=DF_ret
DF.dropna(inplace=True)
DF.rename(columns={"Unnamed: 0":"date"},inplace=True)
if not Symbol=="all":
    DF=DF[DF['Symbol']==Symbol]

alldates=DF["date"].unique()     # used later to define train / validation / test periods
alldates.sort()

alldates_DF=pd.DataFrame({'date':alldates,'date_idx':range(len(alldates))})     # assign one index per date
alldates_DF.set_index("date",inplace=True)



# attach the global date index to the rows of every symbol (join on the date)
DF2=DF.groupby("Symbol").apply(lambda df: df.set_index("date").join(alldates_DF))

DF=DF2.drop(columns=["Symbol"]).reset_index()
# infinite values (e.g. the log of a zero entry) are turned into NaN and dropped
DF.replace([np.inf, -np.inf], np.nan, inplace=True)
DF.dropna(inplace=True)


In [6]:
def create_sequences_multi_asset(data,T_seq,X_cols,Y_col):
    """Build, for every symbol, the sliding-window samples of length T_seq.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "Symbol" column plus the columns named in X_cols / Y_col.
    T_seq : int
        Window length passed to the split function.
    X_cols : list of str
        Columns placed in the windows. The list is not modified.
    Y_col : str
        Target column; the split functions take the target from the last
        column, so it is appended to the effective column list when missing.

    Returns
    -------
    dict
        Maps each symbol to the (X, y) pair returned by the split function.
    """
    # Work on a copy: list.append returns None (so its result must not be
    # assigned) and the caller's list must not be altered.
    X_cols_eff=list(X_cols)
    if Y_col not in X_cols_eff:
        X_cols_eff.append(Y_col)   # the split functions assume that Y is the last X_cols

    if len(X_cols_eff)==1:
        func_split=split_sequence_univariate
    else:
        func_split=split_sequences_multivariate

    sequs=data.groupby("Symbol").apply(lambda df: func_split(df[X_cols_eff].values,T_seq))

    if Y_col not in X_cols: # Y_col was added above and is still part of the windows
        print("TODO: not Y_col in X_cols")

    return sequs.T.to_dict()
    

import random    
    
def extract_sequences_from_dict(sequs_dict,t0,t1,batch_size,col_idx=0,shuffle=True):
    """Collect, across symbols, the windows lying entirely inside [t0, t1).

    Parameters
    ----------
    sequs_dict : dict
        Maps a symbol to a pair (windows, targets); windows[i] is a 2-d array
        and targets[i] the matching target.
    t0, t1 : numbers
        A window is kept when every value of its column col_idx satisfies
        t0 <= value < t1.
    batch_size : int
        Accepted for backward compatibility; it does not affect the result.
    col_idx : int
        Column holding the index used for the selection; that column is
        removed from the returned windows.
    shuffle : bool
        When True and there is more than one symbol, symbols are visited in a
        random order (module-level `random` generator).

    Returns
    -------
    (X, Y) : tuple of np.ndarray
        Selected windows (without column col_idx) and their targets.
    """
    X=[]
    Y=[]

    symbols=[*sequs_dict]
    if shuffle and len(symbols)>1:
        random.shuffle(symbols)

    for symbol in symbols:
        windows,targets=sequs_dict[symbol][0],sequs_dict[symbol][1]

        for idx in range(windows.shape[0]):  # scans all the sequences
            myseq=np.array(windows[idx])
            index_values=myseq[:,col_idx]

            # keep the window only if it lies completely inside [t0, t1)
            if t0<=index_values.min() and t1>index_values.max():
                X.append(np.delete(myseq,col_idx,axis=1))
                Y.append(targets[idx])

    return np.array(X),np.array(Y)

    
    

In [7]:
# Training hyper-parameters and experiment settings used by the cells below.
batch_size=128     # mini-batch size given to model.fit (also part of the result file names)
num_epochs=1000    # upper bound on epochs given to model.fit
val_split=0.2      # fraction of the time span used for the validation period
test_split=0.2     # fraction of the time span used for the test period

Nruns=20           # number of runs per hyper-parameter combination

forceRecompute=False   # when False, a run whose history file already exists is skipped



In [8]:

# Number of rows per symbol. Computed the same way whether DF holds one symbol
# or many, so that .max() below is always available.
length_symbol=DF.groupby("Symbol").size()

symbols_present=DF["Symbol"].unique()
if len(symbols_present)>1:
    dataTag="Oxf_all"
else:
    # take the single symbol as a plain string (concatenating with the array
    # returned by unique() would yield an array, not a file-name tag)
    dataTag="Oxf_"+str(symbols_present[0])

columns_sel=["open_to_close","ret_CC","log_rk_twoscale"]

# Chronological train / validation / test boundaries, expressed on the date_idx
# scale and sized from the longest per-symbol history.
t0_train=0
t1_train=int(length_symbol.max()*(1-val_split-test_split))

t0_val=t1_train
t1_val=int(length_symbol.max()*(1-test_split))

t0_test=t1_val
t1_test=length_symbol.max()

col_idx=0     # position of date_idx in the windows built by the training loop

In [9]:
# Full grid of experiments: one dict per (window length, hidden size, bias flag,
# model type, alpha-bound flag, run number).
params=[
    {"N_I":seq_len,
     "neurons":n_hidden,
     "with_bias":bias_flag,
     "bound_alpha":alpha_flag,
     "run":run_id,
     "model_type":kind}
    for seq_len in range(10,105,15)
    for n_hidden in range(1,6)
    for bias_flag in (True,False)
    for kind in ("VLSTM","LSTM")
    for alpha_flag in (True,)
    # bound_alpha only concerns VLSTM: LSTM entries are kept for bound_alpha=True only
    if alpha_flag or kind!="LSTM"
    for run_id in range(Nruns)
]

print(len(params))

2800


In [10]:
# Train one model per entry of params. For each run the training history
# (plus the test loss) is pickled and the weights are saved. A history file is
# created empty before training starts, and a run whose history file already
# exists is skipped unless forceRecompute is set.
N_I_prev=None     # window length for which the train/val/test arrays were last built

for p in params:
    N_I=p["N_I"]
    neurons=p["neurons"]
    with_bias=p["with_bias"]
    bound_alpha=p["bound_alpha"]
    run=p["run"]
    model_type=p["model_type"]

    file_root=dataTag+"_"+columns_sel[-1]+"_"+model_type+"_NH:"+str(neurons)+"_bias:"+str(with_bias)
    if model_type=="VLSTM":
        file_root=file_root+"_boundalpha:"+str(bound_alpha)
    file_root=file_root+"_Tseq:"+str(N_I)+"_batchSize:"+str(batch_size)+"_predictors:"+','.join(columns_sel)+"_run:"+str(run)
    print(file_root)

    file_model=dirModels+"/model_"+file_root+".h5"
    file_hist=dirHistory+"/history_"+file_root+".pkl"

    # Decide whether to skip BEFORE building sequences or a model, so that
    # already-computed runs cost nothing.
    if not Path(file_hist).is_file():
        print(" "+file_hist+" does not exist, touching it")
        Path(file_hist).touch()
    else:
        if not forceRecompute:
            print(" "+file_hist+" already exists, skipping")
            continue

    # The arrays only depend on the window length: rebuild them when it changes.
    if N_I!=N_I_prev:
        sequs_dict=create_sequences_multi_asset(DF,N_I,["date_idx"]+columns_sel,columns_sel[-1])
        X_train,Y_train=extract_sequences_from_dict(sequs_dict,t0_train,t1_train,batch_size,col_idx)
        X_val,Y_val=extract_sequences_from_dict(sequs_dict,t0_val,t1_val,batch_size,col_idx)
        X_test,Y_test=extract_sequences_from_dict(sequs_dict,t0_test,t1_test,batch_size,col_idx)

        N_I_prev=N_I

    model = Sequential()

    if model_type=="VLSTM":
        model.add(VLSTM.VLSTM(neurons,use_bias=with_bias,bound_alpha=bound_alpha,input_shape=(X_train.shape[1],X_train.shape[2])))
    elif model_type=="LSTM":
        model.add(LSTM(neurons,use_bias=with_bias,input_shape=(X_train.shape[1],X_train.shape[2])))

    model.add(Dense(neurons,activation="sigmoid"))
    model.add(Dense(1,activation="linear"))
    model.compile(loss='mean_squared_error', optimizer='adam')

    # train model
    try:
        hist=model.fit(X_train,Y_train,batch_size=batch_size,epochs=num_epochs,
                           validation_data=(X_val,Y_val),callbacks=[es],
                           shuffle=False,verbose=1)
        loss_test=model.evaluate(X_test,Y_test,verbose=0)
        hist.history["loss_test"]=loss_test

        with open(file_hist, 'wb') as file:
            pickle.dump(hist.history, file)
    except BaseException:
        # Training was interrupted or failed (BaseException also covers
        # KeyboardInterrupt): remove the history file if it is still empty,
        # otherwise this run would be skipped forever on later executions.
        placeholder=Path(file_hist)
        if placeholder.is_file() and placeholder.stat().st_size==0:
            placeholder.unlink()
        raise

    model.save_weights(file_model)

Oxf_all_log_rk_twoscale_VLSTM_NH:1_bias:True_boundalpha:True_Tseq:10_batchSize:128_predictors:open_to_close,ret_CC,log_rk_twoscale_run:0


2023-07-18 12:35:12.560732: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


 results/history/history_Oxf_all_log_rk_twoscale_VLSTM_NH:1_bias:True_boundalpha:True_Tseq:10_batchSize:128_predictors:open_to_close,ret_CC,log_rk_twoscale_run:0.pkl does not exist, touching it
Epoch 1/1000
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


KeyboardInterrupt: 

In [None]:
# Stop here: sys.exit() raises SystemExit, so nothing placed after this cell is executed
# as part of the same run.
import sys
sys.exit() 