# Model Training

In [1]:
from __future__ import print_function
from __future__ import division

import pickle
import pandas as pd 
import numpy as np
from sklearn.linear_model import Ridge

from src.data_reader import Dataset
from src.model import Model
from src.utils import get_leaf_file_names

# Configurations

In [2]:
# Global config: input/output locations, relative to the notebook's working directory.
_RAW_DATA_PATH = './data/rawdata'
_DATA_PATH = './data/date_data'
_OUTPUT_DIR = './output/'

# Data configuration
# Horizons that appear as suffixes in the target (Y) and regressor (X) column names.
_TIME_FRAME_SIZE_Y = (1, 5, 15, 30)
_TIME_FRAME_SIZE_X = (1, 5, 15)

# Regressor group 1: bid/ask size deltas and bid-size proportion, one column per X horizon.
_X_1 = (
    ['avg_delta_bid_size_1', 'avg_delta_bid_size_5', 'avg_delta_bid_size_15']
    + ['avg_delta_ask_size_1', 'avg_delta_ask_size_5', 'avg_delta_ask_size_15']
    + ['bid_size_proportion_1', 'bid_size_proportion_5', 'bid_size_proportion_15']
)

# Regressor group 2: buy/sell volumes and volume proportion, one column per X horizon.
_X_2 = (
    ['buy_volume_1', 'buy_volume_5', 'buy_volume_15']
    + ['sell_volume_1', 'sell_volume_5', 'sell_volume_15']
    + ['proportion_volume_1', 'proportion_volume_5', 'proportion_volume_15']
)

# Regressor group 3: lagged returns, one column per X horizon.
_X_3 = ['lag_return_1', 'lag_return_5', 'lag_return_15']

# Target columns, one per Y horizon.
_Y_1 = 'return_1_min'
_Y_2 = 'return_5_min'
_Y_3 = 'return_15_min'
_Y_4 = 'return_30_min'

# Columns to normalize: every size/volume regressor from groups 1 and 2, in order;
# the proportion columns are left out. A generator expression (rather than a list
# comprehension) keeps the loop variable out of the notebook namespace on Python 2.
_COLUMNS_TO_NORMALIZE = list(
    column for column in _X_1 + _X_2 if 'proportion' not in column
)

## Training and model config

In [3]:
# Training configuration
# Number of trailing date files held out as the test set.
_TEST_SET_SIZE = 2
# Passed to Model as `window_size` (the training cells also set days_as_window=True).
_SLIDING_WINDOW_SIZE = 15
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
_CHUNK_SIZE = 2 * 3901
# The estimator class itself (not an instance) and the keyword arguments for it;
# both are handed to Model via `model=` and `params=`.
_MODEL_RIDGE = Ridge
_PARAMS_RIDGE = dict(alpha=1, random_state=100)

## Prepare training set

In [5]:
# Prepare dataset
# NOTE(review): the split below assumes get_leaf_file_names returns the per-date
# files in chronological order — confirm in src.utils.
date_files = get_leaf_file_names(_DATA_PATH)

# Number of date files used for training, taken from immediately before the test files.
_TRAIN_SET_SIZE = 18

# Compute the split with explicit non-negative indices. The negative-index form
# `date_files[-_TEST_SET_SIZE - 18:-_TEST_SET_SIZE]` ends at `-0` (i.e. index 0)
# when _TEST_SET_SIZE is 0 and would silently produce an empty training set.
_train_end = max(len(date_files) - _TEST_SET_SIZE, 0)
data_train = date_files[max(_train_end - _TRAIN_SET_SIZE, 0):_train_end]
print(data_train)

['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl', './data/date_data/20180827.pkl', './data/date_data/20180828.pkl', './data/date_data/20180829.pkl']


## Prepare test set

In [6]:
# Hold out the last _TEST_SET_SIZE date files as the test set.
# An explicit start index is used because `date_files[-_TEST_SET_SIZE:]` becomes
# `date_files[-0:]` when the size is 0, which returns every file (including the
# training files) instead of an empty test set.
data_test = date_files[max(len(date_files) - _TEST_SET_SIZE, 0):]
print(data_test)

['./data/date_data/20180830.pkl', './data/date_data/20180831.pkl']


## Train model with all regressors

In [8]:
# Fit the ridge model on every regressor group at once (_X_1 + _X_2 + _X_3),
# predicting the _Y_1 target.
all_regressors = _X_1 + _X_2 + _X_3
model = Model(
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    X=all_regressors,
    y=_Y_1,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
    output_model_name='./output/models/models_y1_all_x',
)
# run() returns a pair; neither value is used again in the visible cells.
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.2626562560543392
Corrcoef on validation set: 0.27946164731

## Validate model on test set

In [22]:
# Unpickle the estimator saved for the all-regressor model.
all_x_model_path = _OUTPUT_DIR + 'models/models_y1_all_x_1.pickle'
with open(all_x_model_path, 'rb') as model_file:
    _model = pickle.load(model_file)

In [23]:
# Evaluate on the held-out test files, passing the estimator unpickled in the previous cell.
# NOTE(review): this uses whichever `model` object is currently bound. The saved output
# below lists only the lag_return_* features, so it looks like this cell was last executed
# after the lag-return model had been trained — re-run the notebook in order to confirm.
result = model.test(data_test, _model)

Reading file ./data/date_data/20180830.pkl
[-269.4180054407369, -96.89480199894531, -9.591284412670374]
{   'R_square': 0.07788204876829219,
    'stats':          Feature      Coef     T-value
0   lag_return_1 -0.188591 -269.418005
1   lag_return_5 -0.042169  -96.894802
2  lag_return_15 -0.002285   -9.591284}
Reading file ./data/date_data/20180831.pkl
[-268.3779176156266, -99.36015629445369, -13.071794927632688]
{   'R_square': 0.0698260839972209,
    'stats':          Feature      Coef     T-value
0   lag_return_1 -0.188591 -268.377918
1   lag_return_5 -0.042169  -99.360156
2  lag_return_15 -0.002285  -13.071795}


## Train model with x1 (average delta of bid/ask size and bid-size proportion)

In [10]:
# Fit the ridge model on regressor group 1 only (_X_1), predicting the _Y_1 target.
model = Model(
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    X=_X_1,
    y=_Y_1,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
    output_model_name='./output/models/models_y1_x1',
)
# run() returns a pair; neither value is used again in the visible cells.
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.06324227825423444
Corrcoef on validation set: 0.0703060624

## Validate model on test set

In [None]:
# Unpickle a previously saved x1 model.
# NOTE(review): the training cell above writes under './output/models/models_y1_x1',
# which does not match the file name read here, and this cell has no execution count —
# confirm the path. The loaded object is also not passed to model.test() below.
x1_model_path = _OUTPUT_DIR + 'model_y1_x1.pickle'
with open(x1_model_path, 'rb') as model_file:
    model_x1_y1 = pickle.load(model_file)

In [11]:
# Evaluate the currently bound `model` on the held-out test files.
# No estimator is passed here, so the object unpickled in the previous cell is not used;
# presumably Model.test falls back to its own fitted estimator — confirm in src.model.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[36.82104581519272, 0.8543425932428242, 5.46147589847421, -34.99809735597944, -12.082243811815218, -4.676640287432064, 204.892057836754, -156.7039479603852, -39.09309245396656]
{   'R_square': 0.0026351978975839296,
    'stats':                   Feature      Coef     T-value
0    avg_delta_bid_size_1  0.000060   36.821046
1    avg_delta_bid_size_5  0.000001    0.854343
2   avg_delta_bid_size_15  0.000009    5.461476
3    avg_delta_ask_size_1 -0.000065  -34.998097
4    avg_delta_ask_size_5 -0.000024  -12.082244
5   avg_delta_ask_size_15 -0.000010   -4.676640
6   bid_size_proportion_1  0.001382  204.892058
7   bid_size_proportion_5 -0.001085 -156.703948
8  bid_size_proportion_15 -0.000280  -39.093092}
Reading file ./data/date_data/20180831.pkl
[35.36475206196729, 0.8528830779469975, 5.870844173536239, -36.51201933827114, -14.31582677493167, -5.783476593034704, 180.39809698278802, -137.0118069754777, -33.8381097730865]
{   'R_square': 0.00387800

## Train model with x2 (buy-/sell-initiated volume and volume proportion)

In [12]:
# Fit the ridge model on regressor group 2 only (_X_2), predicting the _Y_1 target.
model = Model(
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    X=_X_2,
    y=_Y_1,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
    output_model_name='./output/models/models_y1_x2',
)
# run() returns a pair; neither value is used again in the visible cells.
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.05751386122192162
Corrcoef on validation set: 0.0409691502

## Validate model on test set

In [None]:
# Unpickle a previously saved x2 model.
# NOTE(review): the training cell above writes under './output/models/models_y1_x2',
# which does not match the file name read here, and this cell has no execution count —
# confirm the path. The loaded object is also not passed to model.test() below.
x2_model_path = _OUTPUT_DIR + 'model_x2_y1.pickle'
with open(x2_model_path, 'rb') as model_file:
    model_x2_y1 = pickle.load(model_file)

In [13]:
# Evaluate the currently bound `model` on the held-out test files.
# No estimator is passed here, so the object unpickled in the previous cell is not used;
# presumably Model.test falls back to its own fitted estimator — confirm in src.model.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[-101.68159391036434, 63.78408131338767, 24.476150462138968, 281.31447170685493, -190.48141018638813, -69.9805569774854, 69.12757572725177, -18.16976531624415, -27.69231905366637]
{   'R_square': 0.002623160092696364,
    'stats':                 Feature      Coef     T-value
0          buy_volume_1 -0.000367 -101.681594
1          buy_volume_5  0.000225   63.784081
2         buy_volume_15  0.000088   24.476150
3         sell_volume_1  0.000702  281.314472
4         sell_volume_5 -0.000459 -190.481410
5        sell_volume_15 -0.000165  -69.980557
6   proportion_volume_1  0.000764   69.127576
7   proportion_volume_5 -0.000273  -18.169765
8  proportion_volume_15 -0.000486  -27.692319}
Reading file ./data/date_data/20180831.pkl
[-109.37015336032546, 69.31265364343412, 28.058130367796657, 223.01640232595682, -146.99744864634044, -53.077666793316254, 63.012680075821876, -16.184692541208147, -24.163115965444046]
{   'R_square': 0.004019112219266874,

## Train model with x3 (lagged returns)

In [16]:
# Fit the ridge model on regressor group 3 only (_X_3), predicting the _Y_1 target.
model = Model(
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    X=_X_3,
    y=_Y_1,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
    output_model_name='./output/models/models_y1_x3',
)
# run() returns a pair; neither value is used again in the visible cells.
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.2252146108501102
Corrcoef on validation set: 0.26624487681

## Validate model on test set

In [None]:
# Unpickle a previously saved x3 (lagged-return) model.
# The result is bound to `model_x3_y1`; the original cell assigned it to `model_x2_y1`,
# a copy-paste slip that would overwrite the x2 model loaded earlier.
# NOTE(review): the training cell above writes under './output/models/models_y1_x3',
# which does not match the file name read here, and this cell has no execution count —
# confirm the path. The loaded object is also not passed to model.test() below.
with open(_OUTPUT_DIR + 'model_x3_y1.pickle', 'rb') as file:
    model_x3_y1 = pickle.load(file)

In [17]:
# Evaluate the currently bound `model` on the held-out test files.
# No estimator is passed here, so the object unpickled in the previous cell is not used;
# presumably Model.test falls back to its own fitted estimator — confirm in src.model.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[-269.4180054407369, -96.89480199894531, -9.591284412670374]
{   'R_square': 0.07788204876829219,
    'stats':          Feature      Coef     T-value
0   lag_return_1 -0.188591 -269.418005
1   lag_return_5 -0.042169  -96.894802
2  lag_return_15 -0.002285   -9.591284}
Reading file ./data/date_data/20180831.pkl
[-268.3779176156266, -99.36015629445369, -13.071794927632688]
{   'R_square': 0.0698260839972209,
    'stats':          Feature      Coef     T-value
0   lag_return_1 -0.188591 -268.377918
1   lag_return_5 -0.042169  -99.360156
2  lag_return_15 -0.002285  -13.071795}
