# Model Training

In [1]:
from __future__ import print_function
from __future__ import division

import pickle
import pandas as pd 
import numpy as np
from sklearn.linear_model import Ridge

from src.data_reader import Dataset
from src.model import Model
from src.utils import get_leaf_file_names

# Configurations

In [2]:
# Global config: input and output locations, relative to the working directory.
_RAW_DATA_PATH = './data/rawdata'
_DATA_PATH = './data/date_data'
_OUTPUT_DIR = './output/'

# Data configuration
# Horizons that appear as suffixes in the target (Y) and regressor (X) column names.
_TIME_FRAME_SIZE_Y = (1, 5, 15, 30)
_TIME_FRAME_SIZE_X = (1, 5, 15)

# Regressor group 1: average bid/ask size deltas and bid-size proportion.
_X_1 = [
    'avg_delta_bid_size_1',
    'avg_delta_bid_size_5',
    'avg_delta_bid_size_15',
    'avg_delta_ask_size_1',
    'avg_delta_ask_size_5',
    'avg_delta_ask_size_15',
    'bid_size_proportion_1',
    'bid_size_proportion_5',
    'bid_size_proportion_15',
]

# Regressor group 2: buy/sell volume and volume proportion.
_X_2 = [
    'buy_volume_1',
    'buy_volume_5',
    'buy_volume_15',
    'sell_volume_1',
    'sell_volume_5',
    'sell_volume_15',
    'proportion_volume_1',
    'proportion_volume_5',
    'proportion_volume_15',
]

# Regressor group 3: lagged returns.
_X_3 = ['lag_return_1', 'lag_return_5', 'lag_return_15']

# Target columns, one per horizon in _TIME_FRAME_SIZE_Y.
_Y_1 = 'return_1_min'
_Y_2 = 'return_5_min'
_Y_3 = 'return_15_min'
_Y_4 = 'return_30_min'

# Columns handed to Model as `columns_to_normalize`: every regressor from
# _X_1 and _X_2 except the "proportion" columns, in their original order.
_COLUMNS_TO_NORMALIZE = [
    column for column in _X_1 + _X_2 if 'proportion' not in column
]

## Training and model config

In [3]:
# Training configuration

# Number of trailing date files held out as the test set.
_TEST_SET_SIZE = 2

# Passed to Model as `window_size` (together with days_as_window=True).
_SLIDING_WINDOW_SIZE = 15

# NOTE(review): not referenced by any cell visible in this notebook; 3901 is
# presumably the number of rows per date file — confirm or remove.
_CHUNK_SIZE = 2 * 3901

# Estimator class (not an instance) and the keyword arguments given to Model
# as `model` and `params`.
_MODEL_RIDGE = Ridge
_PARAMS_RIDGE = dict(alpha=1, random_state=100)

## Prepare training set

In [4]:
# Prepare dataset
# Number of date files used for fitting/validation; they are the files that
# immediately precede the held-out test files.
_TRAIN_SET_SIZE = 18

date_files = get_leaf_file_names(_DATA_PATH)

# NOTE(review): the split is purely positional, so it assumes
# get_leaf_file_names returns the files in chronological order — confirm.
# Explicit, clamped indices are used instead of negative slicing because
# `date_files[-18:-_TEST_SET_SIZE]` silently becomes `date_files[-18:0]`
# (always empty) when _TEST_SET_SIZE is 0.
_train_end = max(len(date_files) - _TEST_SET_SIZE, 0)
_train_start = max(_train_end - _TRAIN_SET_SIZE, 0)
data_train = date_files[_train_start:_train_end]
print(data_train)

['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl', './data/date_data/20180827.pkl', './data/date_data/20180828.pkl', './data/date_data/20180829.pkl']


## Prepare test set

In [5]:
# Hold out the last _TEST_SET_SIZE date files as the test set.
# A clamped start index is used instead of `date_files[-_TEST_SET_SIZE:]`
# because `date_files[-0:]` returns *every* file, which would put the
# training files into the test set when _TEST_SET_SIZE is 0.
_test_start = max(len(date_files) - _TEST_SET_SIZE, 0)
data_test = date_files[_test_start:]
print(data_test)

['./data/date_data/20180830.pkl', './data/date_data/20180831.pkl']


## Train model with all regressors

In [6]:
# Fit a Ridge model on all three regressor groups against the 5-minute
# return (_Y_2). As the captured output shows, run() prints the files it
# reads and the correlation coefficients on the training and validation sets.
model = Model(
    X=_X_1 + _X_2 + _X_3,
    y=_Y_2,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
    output_model_name='./output/models/models_y2_all_x',
)
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.1918182264064436
Corrcoef on validation set: 0.20472006032

## Validate model on test set

In [None]:
# Reload a previously saved all-regressor model from disk.
# NOTE(review): this cell was never executed (In [None]) and the loaded object
# is not used by the following cell, which tests the in-memory `model`.
# NOTE(review): the training cell passes output_model_name
# './output/models/models_y2_all_x' ("models_" prefix) while this reads
# 'models/model_y2_all_x.pickle', and the variable says y1 although the file
# name says y2 — confirm the path and the name.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open(_OUTPUT_DIR + 'models/model_y2_all_x.pickle', 'rb') as model_file:
    model_y1_all_x = pickle.load(model_file)

In [7]:
# Evaluate the in-memory `model` (fitted on all regressors) on the held-out
# files in data_test. Per the captured output, test() reads each file and
# prints the t-values plus a dict with 'R_square' and a per-feature 'stats' table.
# `result` is reassigned by every later "Validate model on test set" cell.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[18.20600436830905, 6.647390600716641, 7.935325695201242, -19.460758026818294, -11.624790321388199, -10.426663686777369, 177.12365104186762, -102.88720098789791, -87.59309255294475, -31.378872424245372, -17.33860706958371, 44.09377661453797, 139.89841900161338, -1.1077157449024102, -117.16532081710731, 141.1347474284809, 57.86954931696641, -129.23110343728652, -237.2045654124787, -86.38978847713545, 14.010755304789317]
{   'R_square': 0.0480268725485864,
    'stats':                    Feature      Coef     T-value
0     avg_delta_bid_size_1  0.000046   18.206004
1     avg_delta_bid_size_5  0.000017    6.647391
2    avg_delta_bid_size_15  0.000021    7.935326
3     avg_delta_ask_size_1 -0.000056  -19.460758
4     avg_delta_ask_size_5 -0.000036  -11.624790
5    avg_delta_ask_size_15 -0.000035  -10.426664
6    bid_size_proportion_1  0.001846  177.123651
7    bid_size_proportion_5 -0.001100 -102.887201
8   bid_size_proportion_15 -0.000970  -87.59

## Train model with x1 (average delta of bid/ask size and bid-size proportion)

In [10]:
# Fit a Ridge model on regressor group _X_1 only, against the 5-minute
# return (_Y_2). This rebinds `model`, replacing the previously fitted one.
model = Model(
    X=_X_1,
    y=_Y_2,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
    output_model_name='./output/models/models_y2_x1',
)
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.060040243779887814
Corrcoef on validation set: 0.069486953

## Validate model on test set

In [None]:
# Reload a previously saved x1-only model from disk.
# NOTE(review): this cell was never executed (In [None]) and the loaded object
# is not used by the following cell, which tests the in-memory `model`.
# NOTE(review): the training cell targets _Y_2 and passes output_model_name
# './output/models/models_y2_x1', whereas this reads 'model_y1_x1.pickle'
# directly under _OUTPUT_DIR — confirm the path and the y1/y2 naming.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open(_OUTPUT_DIR + 'model_y1_x1.pickle', 'rb') as model_file:
    model_x1_y1 = pickle.load(model_file)

In [11]:
# Evaluate the in-memory `model` (fitted on _X_1 only) on the held-out files
# in data_test. Per the captured output, test() reads each file and prints the
# t-values plus a dict with 'R_square' and a per-feature 'stats' table.
# This overwrites the `result` of the previous test cell.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[33.33173057814046, 7.576369026302669, 7.477582450173023, -40.6073135191686, -14.78033274719093, -8.233474087987986, 193.6053000316705, -136.1790408740334, -54.6787430729969]
{   'R_square': -0.003232727540683822,
    'stats':                   Feature      Coef     T-value
0    avg_delta_bid_size_1  0.000086   33.331731
1    avg_delta_bid_size_5  0.000019    7.576369
2   avg_delta_bid_size_15  0.000020    7.477582
3    avg_delta_ask_size_1 -0.000119  -40.607314
4    avg_delta_ask_size_5 -0.000047  -14.780333
5   avg_delta_ask_size_15 -0.000029   -8.233474
6   bid_size_proportion_1  0.002071  193.605300
7   bid_size_proportion_5 -0.001495 -136.179041
8  bid_size_proportion_15 -0.000622  -54.678743}
Reading file ./data/date_data/20180831.pkl
[30.885843967940993, 7.297022260193393, 7.754947363906132, -40.87170815440426, -16.895854579737392, -9.823477411276334, 164.45655248186088, -114.87233239206404, -45.66166076513637]
{   'R_square': 0.0037834

## Train model with x2 (buy-/sell-initiated volume and volume proportion)

In [12]:
# Fit a Ridge model on regressor group _X_2 only, against the 5-minute
# return (_Y_2). This rebinds `model`, replacing the previously fitted one.
model = Model(
    X=_X_2,
    y=_Y_2,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
    output_model_name='./output/models/models_y2_x2',
)
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.048220324470507225
Corrcoef on validation set: 0.001346946

## Validate model on test set

In [None]:
# Reload a previously saved x2-only model from disk.
# NOTE(review): this cell was never executed (In [None]) and the loaded object
# is not used by the following cell, which tests the in-memory `model`.
# NOTE(review): the variable says y1 but the file name (and the training
# target _Y_2) say y2; the training cell also passes output_model_name
# './output/models/models_y2_x2', which does not match this path — confirm.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open(_OUTPUT_DIR + 'model_x2_y2.pickle', 'rb') as model_file:
    model_x2_y1 = pickle.load(model_file)

In [13]:
# Evaluate the in-memory `model` (fitted on _X_2 only) on the held-out files
# in data_test. Per the captured output, test() reads each file and prints the
# t-values plus a dict with 'R_square' and a per-feature 'stats' table.
# This overwrites the `result` of the previous test cell.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[-49.15059430501186, 13.299945987618017, 33.548365493011126, 184.31622324365944, -93.06683947011118, -79.12563162693729, 39.23661981969996, 11.464494419700898, -34.85079943383979]
{   'R_square': -0.006776926971935993,
    'stats':                 Feature      Coef     T-value
0          buy_volume_1 -0.000282  -49.150594
1          buy_volume_5  0.000075   13.299946
2         buy_volume_15  0.000192   33.548365
3         sell_volume_1  0.000730  184.316223
4         sell_volume_5 -0.000356  -93.066839
5        sell_volume_15 -0.000296  -79.125632
6   proportion_volume_1  0.000689   39.236620
7   proportion_volume_5  0.000274   11.464494
8  proportion_volume_15 -0.000971  -34.850799}
Reading file ./data/date_data/20180831.pkl
[-51.027187573155594, 13.949751842221241, 37.119606479374745, 141.03427726533846, -69.32159522834947, -57.92526135579795, 34.52109750702231, 9.85658396855287, -29.350988983663836]
{   'R_square': 0.0012886483434143559,
  

## Train model with x3 (lag return)

In [14]:
# Fit a Ridge model on regressor group _X_3 (lagged returns) only, against the
# 5-minute return (_Y_2). This rebinds `model`, replacing the previous one.
model = Model(
    X=_X_3,
    y=_Y_2,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
    output_model_name='./output/models/models_y2_x3',
)
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.1581498928577902
Corrcoef on validation set: 0.20553911934

## Validate model on test set

In [None]:
# Reload a previously saved x3-only (lagged-return) model from disk.
# The result is bound to `model_x3_y2`, matching the file name: the original
# cell assigned it to `model_x2_y1`, a copy-paste slip that would overwrite the
# x2 model loaded by the earlier cell with the x3 model.
# NOTE(review): this cell was never executed (In [None]); the training cell
# passes output_model_name './output/models/models_y2_x3', which does not
# match this path — confirm where Model actually writes the pickle.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open(_OUTPUT_DIR + 'model_x3_y2.pickle', 'rb') as model_file:
    model_x3_y2 = pickle.load(model_file)

In [15]:
# Evaluate the in-memory `model` (fitted on _X_3 only) on the held-out files
# in data_test. Per the captured output, test() reads each file and prints the
# t-values plus a dict with 'R_square' and a per-feature 'stats' table.
# This overwrites the `result` of the previous test cell.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[-204.44959292818288, -70.87660459016698, -1.0096021183399433]
{   'R_square': 0.03491731787460384,
    'stats':          Feature      Coef     T-value
0   lag_return_1 -0.231504 -204.449593
1   lag_return_5 -0.049897  -70.876605
2  lag_return_15 -0.000389   -1.009602}
Reading file ./data/date_data/20180831.pkl
[-197.50519836254094, -70.48339405743833, -1.3343840068206565]
{   'R_square': 0.042043609602636534,
    'stats':          Feature      Coef     T-value
0   lag_return_1 -0.231504 -197.505198
1   lag_return_5 -0.049897  -70.483394
2  lag_return_15 -0.000389   -1.334384}
