# Dependencies

In [1]:
import os
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm 
from sklearn.metrics import accuracy_score, classification_report

# Dataset

Download data

In [2]:
if not os.path.isfile('data.zip'):
    !wget https://raw.githubusercontent.com/zcakhaa/DeepLOB-Deep-Convolutional-Neural-Networks-for-Limit-Order-Books/master/data/data.zip
    !unzip -n data.zip
    print("Data downloaded.")
else:
    print("Data already downloaded.")

--2022-02-11 17:07:07--  https://raw.githubusercontent.com/zcakhaa/DeepLOB-Deep-Convolutional-Neural-Networks-for-Limit-Order-Books/master/data/data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56278154 (54M) [application/zip]
Saving to: ‘data.zip’


2022-02-11 17:07:10 (180 MB/s) - ‘data.zip’ saved [56278154/56278154]

Archive:  data.zip
  inflating: Test_Dst_NoAuction_DecPre_CF_7.txt  
  inflating: Test_Dst_NoAuction_DecPre_CF_9.txt  
  inflating: Test_Dst_NoAuction_DecPre_CF_8.txt  
  inflating: Train_Dst_NoAuction_DecPre_CF_7.txt  
Data downloaded.


We load data and split them such that:
- The train set is split in 80% train and 20% validation.
- The test set is a combination of multiple test files.

In [3]:
# Training file: the leading 80% of its columns form the training split and
# the remaining 20% form the validation split.
dec_data = np.loadtxt('Train_Dst_NoAuction_DecPre_CF_7.txt')
n_train = int(dec_data.shape[1] * 0.8)  # column index where validation starts
dec_train, dec_val = dec_data[:, :n_train], dec_data[:, n_train:]

# Test set: three separate files stacked horizontally into a single array.
dec_test1 = np.loadtxt('Test_Dst_NoAuction_DecPre_CF_7.txt')
dec_test2 = np.loadtxt('Test_Dst_NoAuction_DecPre_CF_8.txt')
dec_test3 = np.loadtxt('Test_Dst_NoAuction_DecPre_CF_9.txt')
dec_test = np.hstack([dec_test1, dec_test2, dec_test3])

# Sanity check: the splits should differ only in their second dimension.
print(dec_train.shape, dec_val.shape, dec_test.shape)

(149, 203800) (149, 50950) (149, 139587)


The data refer to 7 days: the first 5 days make up the training and validation sets, while the last 2 form the test set.

The first 40 columns of the FI-2010 dataset are 10 levels of ask and bid information for a limit order book; we use only these 40 features in our network. 
The last 5 columns of the FI-2010 dataset are the labels with different prediction horizons.

In [4]:
# Keep only the first 40 rows of each split (the limit-order-book features)
# and transpose, so each resulting row is one sample with 40 feature columns.
x_training_data, x_validation_data, x_test_data = (
    split[:40].T for split in (dec_train, dec_val, dec_test)
)

In [5]:
# Report the (samples, features) shape of every split.
for label, features in (
    ("Train set: ", x_training_data),
    ("Val set: ", x_validation_data),
    ("Test set: ", x_test_data),
):
    print(label, features.shape)

# Show the first two training snapshots as a quick look at the raw feature rows.
print("\n10 levels, each one being (ask-price, ask-volume, bid-price, bid-volume):")
print(f"t0: {x_training_data[0]}")
print(f"t1: {x_training_data[1]}")

Train set:  (203800, 40)
Val set:  (50950, 40)
Test set:  (139587, 40)

10 levels, each one being (ask-price, ask-volume, bid-price, bid-volume):
t0: [0.2615  0.00353 0.2606  0.00326 0.2618  0.002   0.2604  0.00682 0.2619
 0.00164 0.2602  0.00786 0.262   0.00532 0.26    0.00893 0.2621  0.00151
 0.2599  0.00159 0.2623  0.00837 0.2595  0.001   0.2625  0.0015  0.2593
 0.00143 0.2626  0.00787 0.2591  0.00134 0.2629  0.00146 0.2588  0.00123
 0.2633  0.00311 0.2579  0.00128]
t1: [0.2615  0.00211 0.2606  0.00326 0.2619  0.00164 0.2604  0.00682 0.262
 0.00138 0.2602  0.00786 0.2621  0.00545 0.2601  0.00393 0.2625  0.0015
 0.26    0.005   0.2626  0.00787 0.2599  0.00159 0.2629  0.00146 0.2595
 0.001   0.2633  0.00311 0.2593  0.00143 0.2637  0.00165 0.2591  0.00134
 0.2646  0.00138 0.2588  0.00123]
