# Model Training

In [2]:
from __future__ import print_function
from __future__ import division

import pickle
import pandas as pd 
import numpy as np
from sklearn.linear_model import Ridge

from src.data_reader import Dataset
from src.model import Model
from src.utils import get_leaf_file_names

# Configurations

In [3]:
# Global config: input and output locations used throughout the notebook
_RAW_DATA_PATH = './data/rawdata'
_DATA_PATH = './data/date_data'
_OUTPUT_DIR = './output/'

# Data configuration: time-frame sizes that appear as suffixes in the
# target (Y) and regressor (X) column names built below
_TIME_FRAME_SIZE_Y = (1, 5, 15, 30)
_TIME_FRAME_SIZE_X = (1, 5, 15)

# Regressor group 1: bid/ask size features, one column per X time frame
_X_1 = [
    '{}_{}'.format(feature, frame)
    for feature in ('avg_delta_bid_size', 'avg_delta_ask_size', 'bid_size_proportion')
    for frame in _TIME_FRAME_SIZE_X
]
# Regressor group 2: buy/sell volume features, one column per X time frame
_X_2 = [
    '{}_{}'.format(feature, frame)
    for feature in ('buy_volume', 'sell_volume', 'proportion_volume')
    for frame in _TIME_FRAME_SIZE_X
]
# Regressor group 3: lagged returns, one column per X time frame
_X_3 = ['lag_return_{}'.format(frame) for frame in _TIME_FRAME_SIZE_X]

# Target columns, one per Y time frame
_Y_1, _Y_2, _Y_3, _Y_4 = ['return_{}_min'.format(frame) for frame in _TIME_FRAME_SIZE_Y]

# Columns handed to Model as `columns_to_normalize`: the size and volume
# columns only; the proportion and lag-return columns are not listed.
_COLUMNS_TO_NORMALIZE = [
    '{}_{}'.format(feature, frame)
    for feature in ('avg_delta_bid_size', 'avg_delta_ask_size', 'buy_volume', 'sell_volume')
    for frame in _TIME_FRAME_SIZE_X
]

## Data and model config

In [4]:
# Training configuration
# Number of trailing date files held out of training and used as the test set.
_TEST_SET_SIZE = 2
# Passed to Model as `window_size` by the training cells.
_SLIDING_WINDOW_SIZE = 15
# NOTE(review): 3901 is presumably the number of rows in one date file, making
# this two days' worth of rows — confirm; no cell visible here reads it.
_CHUNK_SIZE = 2 * 3901

# Estimator class and its constructor keyword arguments, handed to Model
# as `model` and `params` respectively.
_MODEL_RIDGE = Ridge
_PARAMS_RIDGE = dict(alpha=1, random_state=100)

## Prepare training set

In [5]:
# Prepare dataset
# Number of date files that make up the training split (previously a bare
# literal inside the slice expression).
_TRAIN_SET_SIZE = 18

date_files = get_leaf_file_names(_DATA_PATH)
# NOTE(review): the split below assumes date_files is in chronological order —
# confirm that get_leaf_file_names returns a sorted list.

# Use an explicit split index instead of negative slicing: with
# _TEST_SET_SIZE == 0 the old `[-0 - 18:-0]` slice ends at index 0 and is
# therefore always empty. The max(..., 0) guards keep the previous clamping
# behaviour when fewer files exist than the requested sizes.
_split_index = max(len(date_files) - _TEST_SET_SIZE, 0)
data_train = date_files[max(_split_index - _TRAIN_SET_SIZE, 0):_split_index]
print(data_train)

['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl', './data/date_data/20180827.pkl', './data/date_data/20180828.pkl', './data/date_data/20180829.pkl']


## Prepare test set

In [6]:
# Hold out the last _TEST_SET_SIZE date files as the test set.
# The start index is computed explicitly because `date_files[-0:]` is the
# whole list: with _TEST_SET_SIZE == 0 the old slice would have put every
# file (including the training ones) into the test set instead of none.
data_test = date_files[max(len(date_files) - _TEST_SET_SIZE, 0):]
print(data_test)

['./data/date_data/20180830.pkl', './data/date_data/20180831.pkl']


## Train model with all regressors

In [7]:
# Train model with different parameters
# Fits the Ridge configuration on the 15-minute return target (_Y_3) using all
# three regressor groups. NOTE: `model` is re-bound by every training cell, so
# the evaluation cell that follows must run before the next training cell.
model = Model(
    X=_X_1 + _X_2 + _X_3,
    y=_Y_3,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    # Built from _OUTPUT_DIR rather than a hard-coded './output/' prefix so the
    # save location follows the same constant the reload cells use.
    output_model_name=_OUTPUT_DIR + 'models/models_y3_all_x',
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
)
# run() returns a pair; the training/validation correlation coefficients are
# also logged to the cell output.
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.13771960882386655
Corrcoef on validation set: 0.1279012060

## Validate model on test set

In [None]:
# Reload a previously pickled all-regressor model from the output directory.
# Only unpickle files produced by this project: pickle.load can execute
# arbitrary code from an untrusted file.
# NOTE(review): the variable says "y1" but the file name says "y3", and the
# training cell passes output_model_name './output/models/models_y3_all_x'
# ("models_", no extension) — confirm which file Model actually writes.
with open(_OUTPUT_DIR + 'models/model_y3_all_x.pickle', mode='rb') as pickle_file:
    model_y1_all_x = pickle.load(pickle_file)

In [8]:
# Evaluate on the held-out files in data_test. `model` is whatever the most
# recently executed training cell bound, so run this right after that cell.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[13.130988032634354, 3.2935284569871803, 10.935878372655461, -13.378162860632049, -12.278785513727165, -10.881125911736769, 111.20145973563223, -60.794701454444514, -76.89740414993065, -23.186273065765814, 3.340156736951543, 19.25215498913293, 100.7998156466865, 46.86932209275052, -128.96426138254688, 97.32343762760412, 58.09766217594275, -99.63669698826757, -160.8598695988443, -66.73083410723055, 29.222911863179505]
{   'R_square': -0.00889833733613643,
    'stats':                    Feature      Coef     T-value
0     avg_delta_bid_size_1  0.000051   13.130988
1     avg_delta_bid_size_5  0.000013    3.293528
2    avg_delta_bid_size_15  0.000044   10.935878
3     avg_delta_ask_size_1 -0.000059  -13.378163
4     avg_delta_ask_size_5 -0.000058  -12.278786
5    avg_delta_ask_size_15 -0.000057  -10.881126
6    bid_size_proportion_1  0.001780  111.201460
7    bid_size_proportion_5 -0.000998  -60.794701
8   bid_size_proportion_15 -0.001308  -76.89

## Train model with x1 (average delta of bid/ask size and bid size proportion)

In [9]:
# Fit the Ridge configuration on the 15-minute return target (_Y_3) using only
# regressor group 1 (_X_1). NOTE: this re-binds `model`, replacing the model
# trained by the previous training cell.
model = Model(
    X=_X_1,
    y=_Y_3,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    # Built from _OUTPUT_DIR rather than a hard-coded './output/' prefix so the
    # save location follows the same constant the reload cells use.
    output_model_name=_OUTPUT_DIR + 'models/models_y3_x1',
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
)
# run() returns a pair; the training/validation correlation coefficients are
# also logged to the cell output.
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.041060476754640585
Corrcoef on validation set: 0.061590925

## Validate model on test set

In [None]:
# Reload a previously pickled x1-only model from the output directory.
# Only unpickle files produced by this project: pickle.load can execute
# arbitrary code from an untrusted file.
# NOTE(review): the variable says "y1" but the file name says "y3", and the
# training cell passes output_model_name './output/models/models_y3_x1'
# ("models_", no extension) — confirm which file Model actually writes.
with open(_OUTPUT_DIR + 'models/model_y3_x1.pickle', mode='rb') as pickle_file:
    model_x1_y1 = pickle.load(pickle_file)

In [10]:
# Evaluate on the held-out files in data_test. `model` is whatever the most
# recently executed training cell bound, so run this right after that cell.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[22.813005734052954, 4.023221929643897, 12.42619764797382, -27.808147985186856, -14.821653456710967, -7.5838064930661, 123.867653277201, -95.7080929985346, -37.91484585171789]
{   'R_square': -0.035072073211620314,
    'stats':                   Feature      Coef     T-value
0    avg_delta_bid_size_1  0.000089   22.813006
1    avg_delta_bid_size_5  0.000016    4.023222
2   avg_delta_bid_size_15  0.000050   12.426198
3    avg_delta_ask_size_1 -0.000124  -27.808148
4    avg_delta_ask_size_5 -0.000071  -14.821653
5   avg_delta_ask_size_15 -0.000040   -7.583806
6   bid_size_proportion_1  0.002008  123.867653
7   bid_size_proportion_5 -0.001592  -95.708093
8  bid_size_proportion_15 -0.000653  -37.914846}
Reading file ./data/date_data/20180831.pkl
[20.481412449661327, 3.7543462573254525, 12.486240178029453, -27.118544678547444, -16.416039553054482, -8.766881862086349, 101.94539968302894, -78.22212580634641, -30.67737947686876]
{   'R_square': 0.0002

## Train model with x2 (buy/sell-initiated volume and volume proportion)

In [11]:
# Fit the Ridge configuration on the 15-minute return target (_Y_3) using only
# regressor group 2 (_X_2). NOTE: this re-binds `model`, replacing the model
# trained by the previous training cell.
model = Model(
    X=_X_2,
    y=_Y_3,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    # Built from _OUTPUT_DIR rather than a hard-coded './output/' prefix so the
    # save location follows the same constant the reload cells use.
    output_model_name=_OUTPUT_DIR + 'models/models_y3_x2',
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
)
# run() returns a pair; the training/validation correlation coefficients are
# also logged to the cell output.
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.06273694783231523
Corrcoef on validation set: -0.007731893

## Validate model on test set

In [None]:
# Reload a previously pickled x2-only model from the output directory.
# Only unpickle files produced by this project: pickle.load can execute
# arbitrary code from an untrusted file.
# NOTE(review): unlike the other reload cells this path has no 'models/'
# sub-directory, and both the file and variable say "y1" while the training
# cell targets _Y_3 and saves under './output/models/models_y3_x2' — confirm
# the intended file before relying on this cell.
with open(_OUTPUT_DIR + 'model_x2_y1.pickle', mode='rb') as pickle_file:
    model_x2_y1 = pickle.load(pickle_file)

In [12]:
# Evaluate on the held-out files in data_test. `model` is whatever the most
# recently executed training cell bound, so run this right after that cell.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[-36.26484229853772, 26.43567827820745, 15.576524622135473, 131.57655961749586, -20.291669102681656, -105.60168848231987, 27.54522047060263, 20.754333670947904, -35.358153438645864]
{   'R_square': -0.03613958878715229,
    'stats':                 Feature      Coef     T-value
0          buy_volume_1 -0.000315  -36.264842
1          buy_volume_5  0.000225   26.435678
2         buy_volume_15  0.000135   15.576525
3         sell_volume_1  0.000789  131.576560
4         sell_volume_5 -0.000117  -20.291669
5        sell_volume_15 -0.000598 -105.601688
6   proportion_volume_1  0.000732   27.545220
7   proportion_volume_5  0.000750   20.754334
8  proportion_volume_15 -0.001491  -35.358153}
Reading file ./data/date_data/20180831.pkl
[-36.46523084913684, 26.855137335480343, 16.692557418647826, 97.5124248523938, -14.639008716684707, -74.87588862544747, 23.47251234100046, 17.28226374708871, -28.841635639193616]
{   'R_square': -0.0004574491280704418,
 

## Train model with x3 (lag return)

In [13]:
# Fit the Ridge configuration on the 15-minute return target (_Y_3) using only
# regressor group 3 (_X_3). NOTE: this re-binds `model`, replacing the model
# trained by the previous training cell.
model = Model(
    X=_X_3,
    y=_Y_3,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    # Built from _OUTPUT_DIR rather than a hard-coded './output/' prefix so the
    # save location follows the same constant the reload cells use.
    output_model_name=_OUTPUT_DIR + 'models/models_y3_x3',
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
)
# run() returns a pair; the training/validation correlation coefficients are
# also logged to the cell output.
stats, corrcoef = model.run()

Reading file ['./data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl']
Reading file ./data/date_data/20180827.pkl
------------------------------------------------------------
Training phrase 1...
Training on ['20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl']
Validating on 20180827.pkl
Corrcoef on training set: 0.09949646218580516
Corrcoef on validation set: 0.1464254699

## Validate model on test set

In [None]:
# Reload a previously pickled x3-only model from the output directory.
# Only unpickle files produced by this project: pickle.load can execute
# arbitrary code from an untrusted file.
# The result is bound to `model_x3_y1` to match the file being read; it was
# previously assigned to `model_x2_y1`, silently overwriting the x2 model.
# NOTE(review): the file name says "y1" while the training cell targets _Y_3
# and saves under './output/models/models_y3_x3' — confirm the intended file.
with open(_OUTPUT_DIR + 'models/model_x3_y1.pickle', 'rb') as pickle_file:
    model_x3_y1 = pickle.load(pickle_file)

In [14]:
# Evaluate on the held-out files in data_test. `model` is whatever the most
# recently executed training cell bound, so run this right after that cell.
result = model.test(data_test)

Reading file ./data/date_data/20180830.pkl
[-139.3285775097863, -51.904182364259896, 18.574687016428225]
{   'R_square': -0.02186021965911733,
    'stats':          Feature      Coef     T-value
0   lag_return_1 -0.242170 -139.328578
1   lag_return_5 -0.056089  -51.904182
2  lag_return_15  0.010986   18.574687}
Reading file ./data/date_data/20180831.pkl
[-130.46497386949508, -50.031981897061975, 23.79652421506053]
{   'R_square': 0.014256172730596428,
    'stats':          Feature      Coef     T-value
0   lag_return_1 -0.242170 -130.464974
1   lag_return_5 -0.056089  -50.031982
2  lag_return_15  0.010986   23.796524}
