# Model Training

In [6]:
from __future__ import print_function
from __future__ import division

import pickle
import pandas as pd 
import numpy as np
from sklearn.linear_model import Ridge

from src.data_reader import Dataset
from src.model import Model
from src.utils import get_leaf_file_names

# Configurations

In [2]:
# ---------------------------------------------------------------------------
# Paths. Only _DATA_PATH and _OUTPUT_DIR are referenced by the cells visible
# in this notebook.
# ---------------------------------------------------------------------------
_RAW_DATA_PATH = './data/rawdata'
_DATA_PATH = './data/date_data'
_OUTPUT_DIR = './output/'

# ---------------------------------------------------------------------------
# Feature / target definitions.
# The numeric suffix of every regressor name matches an entry of
# _TIME_FRAME_SIZE_X; the target names match _TIME_FRAME_SIZE_Y.
# ---------------------------------------------------------------------------
_TIME_FRAME_SIZE_Y = (1, 5, 15, 30)
_TIME_FRAME_SIZE_X = (1, 5, 15)

# Group 1: bid/ask size deltas and bid-size proportion.
_X_1 = [
    'avg_delta_bid_size_1',
    'avg_delta_bid_size_5',
    'avg_delta_bid_size_15',
    'avg_delta_ask_size_1',
    'avg_delta_ask_size_5',
    'avg_delta_ask_size_15',
    'bid_size_proportion_1',
    'bid_size_proportion_5',
    'bid_size_proportion_15',
]

# Group 2: buy/sell volumes and volume proportion.
_X_2 = [
    'buy_volume_1',
    'buy_volume_5',
    'buy_volume_15',
    'sell_volume_1',
    'sell_volume_5',
    'sell_volume_15',
    'proportion_volume_1',
    'proportion_volume_5',
    'proportion_volume_15',
]

# Group 3: lagged returns.
_X_3 = [
    'lag_return_1',
    'lag_return_5',
    'lag_return_15',
]

# Target column names, one per horizon in _TIME_FRAME_SIZE_Y.
_Y_1 = 'return_1_min'
_Y_2 = 'return_5_min'
_Y_3 = 'return_15_min'
_Y_4 = 'return_30_min'

# Columns handed to Model as `columns_to_normalize`: the size-delta columns of
# _X_1 and the buy/sell volume columns of _X_2. The proportion and lag-return
# columns are deliberately not listed here.
_COLUMNS_TO_NORMALIZE = [
    'avg_delta_bid_size_1',
    'avg_delta_bid_size_5',
    'avg_delta_bid_size_15',
    'avg_delta_ask_size_1',
    'avg_delta_ask_size_5',
    'avg_delta_ask_size_15',
    'buy_volume_1',
    'buy_volume_5',
    'buy_volume_15',
    'sell_volume_1',
    'sell_volume_5',
    'sell_volume_15',
]

## Data and model config

In [3]:
# Training configuration.
# _TEST_SET_SIZE: number of trailing entries of `date_files` held out for testing.
_TEST_SET_SIZE = 1
# _SLIDING_WINDOW_SIZE: passed to Model as `window_size` (with days_as_window=True).
_SLIDING_WINDOW_SIZE = 18
# NOTE(review): _CHUNK_SIZE is not referenced by any cell visible in this
# notebook, and the meaning of 3901 is not documented here -- confirm or remove.
_CHUNK_SIZE = 2 * 3901

# Estimator class and the keyword arguments for it; both are handed to Model.
_MODEL_RIDGE = Ridge
_PARAMS_RIDGE = dict(alpha=1, random_state=100)

## Prepare training set

In [4]:
# Prepare dataset: every file except the last _TEST_SET_SIZE ones is used for
# training.
# NOTE(review): the split assumes get_leaf_file_names returns the files in
# chronological order (the recorded output is date-sorted) -- confirm.
date_files = get_leaf_file_names(_DATA_PATH)

# Slice by an explicit, clamped length instead of `[:-_TEST_SET_SIZE]`:
# with _TEST_SET_SIZE == 0 the negative form becomes `[:0]` (because -0 == 0)
# and would silently produce an EMPTY training set.
data_train = date_files[:max(len(date_files) - _TEST_SET_SIZE, 0)]
print(data_train)

['./data/date_data/20180802.pkl', './data/date_data/20180803.pkl', './data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl', './data/date_data/20180827.pkl', './data/date_data/20180828.pkl', './data/date_data/20180829.pkl', './data/date_data/20180830.pkl']


## Prepare test set

In [5]:
# Held-out test files: the last _TEST_SET_SIZE entries of date_files.
# An explicit, clamped start index is used instead of `[-_TEST_SET_SIZE:]`,
# which for _TEST_SET_SIZE == 0 becomes `[0:]` (because -0 == 0) and would
# silently put EVERY file in the test set.
x = date_files[max(len(date_files) - _TEST_SET_SIZE, 0):]

## Train model with all regressors

In [6]:
# Fit a Ridge-based Model on the concatenation of all three regressor groups,
# predicting the _Y_1 target.
model_all_x_y1 = Model(
    X=_X_1 + _X_2 + _X_3,
    y=_Y_1,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
)
# run() yields three values; only the first is kept.
# NOTE(review): presumably the training correlation coefficient(s) -- confirm in src.model.
corrcoef_all_x_y1, _, _ = model_all_x_y1.run()

Reading file ['./data/date_data/20180802.pkl', './data/date_data/20180803.pkl', './data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl', './data/date_data/20180827.pkl']
Reading file ./data/date_data/20180828.pkl
------------------------------------------
Training phrase 1...
Training on ['20180802.pkl', '20180803.pkl', '20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl', 

In [11]:
# Persist the trained all-regressor model so it can be restored without retraining.
with open(_OUTPUT_DIR + 'model_all_x_y1.pickle', 'wb') as pickle_out:
    pickle.dump(model_all_x_y1, pickle_out)

In [None]:
# Restore the all-regressor model written by the dump cell.
# Only unpickle files this notebook produced: pickle.load can execute arbitrary code.
with open(_OUTPUT_DIR + 'model_all_x_y1.pickle', 'rb') as pickle_in:
    model_all_x_y1 = pickle.load(pickle_in)

## Validate model on test set

In [7]:
# Score the all-regressor model on the held-out files in `x`.
# test() yields the predictions, the true targets and a correlation matrix
# (the recorded output shows a 2x2 matrix).
y_hat_all, y, corr_all = model_all_x_y1.test(x)
print('Regression with all regressors: corr_all = {}'.format(corr_all))
# Write next to the pickled models via _OUTPUT_DIR rather than a hard-coded
# './output/' so that changing the config moves every artifact together.
np.save(_OUTPUT_DIR + 'y_hat_all', y_hat_all)

Reading file ['./data/date_data/20180831.pkl']
Regression with all regressors: corr_all = [[1.         0.29236617]
 [0.29236617 1.        ]]


## Train model with x1 (average delta of bid/ask size and bid-size proportion)

In [12]:
# Fit a Ridge-based Model using only the _X_1 regressor group, predicting _Y_1.
model_x1_y1 = Model(
    X=_X_1,
    y=_Y_1,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
)
# run() yields three values; only the first is kept.
# NOTE(review): presumably the training correlation coefficient(s) -- confirm in src.model.
corrcoef_x1_y1, _, _ = model_x1_y1.run()

Reading file ['./data/date_data/20180802.pkl', './data/date_data/20180803.pkl', './data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl', './data/date_data/20180827.pkl']
Reading file ./data/date_data/20180828.pkl
------------------------------------------
Training phrase 1...
Training on ['20180802.pkl', '20180803.pkl', '20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl', 

In [13]:
# Persist the trained x1-only model so it can be restored without retraining.
with open(_OUTPUT_DIR + 'model_x1_y1.pickle', 'wb') as pickle_out:
    pickle.dump(model_x1_y1, pickle_out)

In [22]:
# Restore the x1-only model written by the dump cell.
# Only unpickle files this notebook produced: pickle.load can execute arbitrary code.
with open(_OUTPUT_DIR + 'model_x1_y1.pickle', 'rb') as pickle_in:
    model_x1_y1 = pickle.load(pickle_in)

## Validate model on test set

In [23]:
# Score the x1-only model on the held-out files in `x`.
y_hat_x1, y, corr_x1 = model_x1_y1.test(x)
# NOTE: the message label reads 'corr_all' although the value is the x1-only
# correlation; the text is kept so it matches the recorded output.
print('Regression with all x1 regressors {}:\n corr_all = {}'.format(_X_1, corr_x1))
# Write next to the pickled models via _OUTPUT_DIR rather than a hard-coded
# './output/' so that changing the config moves every artifact together.
np.save(_OUTPUT_DIR + 'y_hat_x1', y_hat_x1)

Reading file ['./data/date_data/20180831.pkl']
Regression with all x1 regressors ['avg_delta_bid_size_1', 'avg_delta_bid_size_5', 'avg_delta_bid_size_15', 'avg_delta_ask_size_1', 'avg_delta_ask_size_5', 'avg_delta_ask_size_15', 'bid_size_proportion_1', 'bid_size_proportion_5', 'bid_size_proportion_15']:
 corr_all = [[1.         0.06301515]
 [0.06301515 1.        ]]


In [9]:
# Save the true targets returned by the most recent test() call. Every test
# cell in this notebook passes the same hold-out list `x`.
# Uses _OUTPUT_DIR (not a hard-coded './output/') so all artifacts stay together.
np.save(_OUTPUT_DIR + 'y', y)

## Train model with x2 (buy-/sell-initiated volume and volume proportion)

In [11]:
# Fit a Ridge-based Model using only the _X_2 regressor group, predicting _Y_1.
model_x2_y1 = Model(
    X=_X_2,
    y=_Y_1,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
)
# run() yields three values; only the first is kept.
# NOTE(review): presumably the training correlation coefficient(s) -- confirm in src.model.
corrcoef_x2_y1, _, _ = model_x2_y1.run()

Reading file ['./data/date_data/20180802.pkl', './data/date_data/20180803.pkl', './data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl', './data/date_data/20180827.pkl']
Reading file ./data/date_data/20180828.pkl
------------------------------------------
Training phrase 1...
Training on ['20180802.pkl', '20180803.pkl', '20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl', 

In [12]:
# Persist the trained x2-only model so it can be restored without retraining.
with open(_OUTPUT_DIR + 'model_x2_y1.pickle', 'wb') as pickle_out:
    pickle.dump(model_x2_y1, pickle_out)

In [None]:
# Restore the x2-only model written by the dump cell.
# Only unpickle files this notebook produced: pickle.load can execute arbitrary code.
with open(_OUTPUT_DIR + 'model_x2_y1.pickle', 'rb') as pickle_in:
    model_x2_y1 = pickle.load(pickle_in)

## Validate model on test set

In [20]:
# Score the x2-only model on the held-out files in `x`.
y_hat_x2, y, corr_x2 = model_x2_y1.test(x)
# NOTE: the message label reads 'corr_all' although the value is the x2-only
# correlation; the text is kept so it matches the recorded output.
print('Regression with all x2 regressors {}:\n corr_all = {}'.format(_X_2, corr_x2))
# Write next to the pickled models via _OUTPUT_DIR rather than a hard-coded
# './output/' so that changing the config moves every artifact together.
np.save(_OUTPUT_DIR + 'y_hat_x2', y_hat_x2)

Reading file ['./data/date_data/20180831.pkl']
Regression with all x2 regressors ['buy_volume_1', 'buy_volume_5', 'buy_volume_15', 'sell_volume_1', 'sell_volume_5', 'sell_volume_15', 'proportion_volume_1', 'proportion_volume_5', 'proportion_volume_15']:
 corr_all = [[1.         0.06708809]
 [0.06708809 1.        ]]


## Train model with x3 (lagged returns)

In [25]:
# Fit a Ridge-based Model using only the _X_3 regressor group, predicting _Y_1.
model_x3_y1 = Model(
    X=_X_3,
    y=_Y_1,
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True,
)
# run() yields three values; only the first is kept.
# NOTE(review): presumably the training correlation coefficient(s) -- confirm in src.model.
corrcoef_x3_y1, _, _ = model_x3_y1.run()

Reading file ['./data/date_data/20180802.pkl', './data/date_data/20180803.pkl', './data/date_data/20180806.pkl', './data/date_data/20180807.pkl', './data/date_data/20180808.pkl', './data/date_data/20180809.pkl', './data/date_data/20180810.pkl', './data/date_data/20180813.pkl', './data/date_data/20180814.pkl', './data/date_data/20180815.pkl', './data/date_data/20180816.pkl', './data/date_data/20180817.pkl', './data/date_data/20180820.pkl', './data/date_data/20180821.pkl', './data/date_data/20180822.pkl', './data/date_data/20180823.pkl', './data/date_data/20180824.pkl', './data/date_data/20180827.pkl']
Reading file ./data/date_data/20180828.pkl
------------------------------------------
Training phrase 1...
Training on ['20180802.pkl', '20180803.pkl', '20180806.pkl', '20180807.pkl', '20180808.pkl', '20180809.pkl', '20180810.pkl', '20180813.pkl', '20180814.pkl', '20180815.pkl', '20180816.pkl', '20180817.pkl', '20180820.pkl', '20180821.pkl', '20180822.pkl', '20180823.pkl', '20180824.pkl', 

In [26]:
# Persist the trained x3-only model so it can be restored without retraining.
with open(_OUTPUT_DIR + 'model_x3_y1.pickle', 'wb') as pickle_out:
    pickle.dump(model_x3_y1, pickle_out)

In [None]:
# Restore the x3-only model written by the dump cell.
# The result must be bound to model_x3_y1: binding it to model_x2_y1 (as the
# cell previously did) overwrites the x2 model with the x3 one and leaves
# model_x3_y1 unrestored for the validation cell that follows.
# Only unpickle files this notebook produced: pickle.load can execute arbitrary code.
with open(_OUTPUT_DIR + 'model_x3_y1.pickle', 'rb') as file:
    model_x3_y1 = pickle.load(file)

## Validate model on test set

In [27]:
# Score the x3-only model on the held-out files in `x`.
y_hat_x3, y, corr_x3 = model_x3_y1.test(x)
# NOTE: the message label reads 'corr_all' although the value is the x3-only
# correlation; the text is kept so it matches the recorded output.
print('Regression with all x3 regressors {}:\n corr_all = {}'.format(_X_3, corr_x3))
# Write next to the pickled models via _OUTPUT_DIR rather than a hard-coded
# './output/' so that changing the config moves every artifact together.
np.save(_OUTPUT_DIR + 'y_hat_x3', y_hat_x3)

Reading file ['./data/date_data/20180831.pkl']
Regression with all x3 regressors ['lag_return_1', 'lag_return_5', 'lag_return_15']:
 corr_all = [[1.        0.2668826]
 [0.2668826 1.       ]]


# Result

In [32]:
# Summary: reload each persisted model from _OUTPUT_DIR, re-score it on the
# held-out file list `x`, and print only the off-diagonal entry [0][1] of the
# correlation matrix returned by test().
# Each file handle is held only for the duration of the load.
# Only unpickle files this notebook produced: pickle.load can execute arbitrary code.

# --- all regressors ---
with open(_OUTPUT_DIR + 'model_all_x_y1.pickle', 'rb') as pickled:
    model_all_x = pickle.load(pickled)
y_hat_all, y, corr_all = model_all_x.test(x)
print('Regression with all regressors: corr_all = {}'.format(corr_all[0][1]))
print()

# --- x1 only ---
with open(_OUTPUT_DIR + 'model_x1_y1.pickle', 'rb') as pickled:
    model_x1 = pickle.load(pickled)
y_hat_x1, y, corr_x1 = model_x1.test(x)
print('Regression with all x1 regressors {}:\n corr_all = {}'.format(_X_1, corr_x1[0][1]))
print()

# --- x2 only ---
with open(_OUTPUT_DIR + 'model_x2_y1.pickle', 'rb') as pickled:
    model_x2 = pickle.load(pickled)
y_hat_x2, y, corr_x2 = model_x2.test(x)
print('Regression with all x2 regressors {}:\n corr_all = {}'.format(_X_2, corr_x2[0][1]))
print()

# --- x3 only (no trailing blank line after the last report) ---
with open(_OUTPUT_DIR + 'model_x3_y1.pickle', 'rb') as pickled:
    model_x3 = pickle.load(pickled)
y_hat_x3, y, corr_x3 = model_x3.test(x)
print('Regression with all x3 regressors {}:\n corr_all = {}'.format(_X_3, corr_x3[0][1]))

Reading file ['./data/date_data/20180831.pkl']
Regression with all regressors: corr_all = 0.2923661671592335

Reading file ['./data/date_data/20180831.pkl']
Regression with all x1 regressors ['avg_delta_bid_size_1', 'avg_delta_bid_size_5', 'avg_delta_bid_size_15', 'avg_delta_ask_size_1', 'avg_delta_ask_size_5', 'avg_delta_ask_size_15', 'bid_size_proportion_1', 'bid_size_proportion_5', 'bid_size_proportion_15']:
 corr_all = 0.06301514702098966

Reading file ['./data/date_data/20180831.pkl']
Regression with all x2 regressors ['buy_volume_1', 'buy_volume_5', 'buy_volume_15', 'sell_volume_1', 'sell_volume_5', 'sell_volume_15', 'proportion_volume_1', 'proportion_volume_5', 'proportion_volume_15']:
 corr_all = 0.06708809012273853

Reading file ['./data/date_data/20180831.pkl']
Regression with all x3 regressors ['lag_return_1', 'lag_return_5', 'lag_return_15']:
 corr_all = 0.2668826007132174
