In [1]:
from __future__ import print_function
from __future__ import division

from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from src.data_reader import Dataset
from src.model import Model
from src.utils import timeit
from src.utils import get_leaf_file_names

In [2]:
# Global config
# Filesystem locations, all written relative to the current working directory.
# Not referenced in the visible cells. Note: no trailing slash, unlike the two paths below.
_RAW_DATA_PATH = './data/rawdata'
# Directory scanned by get_leaf_file_names() to build the train/test file lists.
_DATA_PATH = './data/date_data/'
# Not referenced in the visible cells.
_OUTPUT_DIR = './data/output/'

# Training configuration
_TEST_SET_SIZE = .4
_SLIDING_WINDOW_SIZE = 4
_CHUNK_SIZE = 3901 * 2
# Estimator classes (not instances). _MODEL_RIDGE is passed to Model via `model=`.
_MODEL_RIDGE = Ridge
# NOTE(review): "KKR" is presumably a typo for "KRR" (kernel ridge regression);
# the name is left unchanged so that any existing references still resolve.
_MODEL_KKR = KernelRidge
_PARAMS_RIDGE = {
    'alpha': 1e-3,
    'random_state': 100
}
_PARAMS_KERNEL_RIDGE = {
    'alpha': 2.5,
    'random_state': 100
}

# Data configuration
_TIME_FRAME_SIZE_Y = (1, 5, 15, 30)
_TIME_FRAME_SIZE_X = (1, 5, 15)
_X_1 = [
    'avg_delta_bid_size_1', 'avg_delta_bid_size_5', 'avg_delta_bid_size_15',
    'avg_delta_ask_size_1', 'avg_delta_ask_size_5', 'avg_delta_ask_size_15',
    'bid_size_proportion_1', 'bid_size_proportion_5', 'bid_size_proportion_15'
]
_X_2 = [
    'buy_volume_1', 'buy_volume_5', 'buy_volume_15', 'sell_volume_1', 'sell_volume_5',
    'sell_volume_15', 'proportion_volume_1', 'proportion_volume_5', 'proportion_volume_15'
]
_Y_1, _Y_2, _Y_3, _Y_4 = ['return_1_min', 'return_5_min', 'return_15_min', 'return_30_min']

_COLUMNS_TO_NORMALIZE = [
    'avg_delta_bid_size_1', 'avg_delta_bid_size_5', 'avg_delta_bid_size_15',
    'avg_delta_ask_size_1', 'avg_delta_ask_size_5', 'avg_delta_ask_size_15',
    'buy_volume_1', 'buy_volume_5', 'buy_volume_15', 'sell_volume_1', 'sell_volume_5', 'sell_volume_15'
]

In [3]:
# Prepare dataset
# Split the date files into a leading training part and a trailing test part.
# NOTE(review): assumes get_leaf_file_names() returns the files in chronological
# order -- confirm, since the split is purely positional.
date_files = get_leaf_file_names(_DATA_PATH)

# Number of files held out for testing (rounded down).
n_test_files = int(len(date_files) * _TEST_SET_SIZE)

# Use an explicit split index instead of negative slicing: when n_test_files is 0,
# `date_files[:-0]` is `date_files[:0]`, i.e. an EMPTY training set rather than all files.
split_index = max(len(date_files) - n_test_files, 0)
data_train = date_files[:split_index]

# The test list starts (_SLIDING_WINDOW_SIZE - 1) files before the split, so it
# overlaps the tail of the training list; clamp at 0 so short lists do not wrap around.
data_test = date_files[max(split_index - _SLIDING_WINDOW_SIZE + 1, 0):]

In [6]:
# Build the model wrapper: Ridge estimator, volume features (_X_2) as inputs,
# first target (_Y_1) as output, using the training files only.
model = Model(
    model=_MODEL_RIDGE,
    params=_PARAMS_RIDGE,
    X=_X_2,
    y=_Y_1,
    data_files=data_train,
    columns_to_normalize=_COLUMNS_TO_NORMALIZE,
    window_size=_SLIDING_WINDOW_SIZE,
    days_as_window=True
)

In [14]:
# Run the model and unpack its three results. Per the printed log, each phase
# trains on a group of date files and validates on the following one.
# NOTE(review): the score list printed further down holds the same 9 values twice,
# which suggests results accumulated over repeated runs -- confirm by re-running
# from a fresh kernel.
corrcoef, y_preds, y_truth = model.run()

Training phrase 1...
Training on ['date_data20180802.pkl', 'date_data20180803.pkl', 'date_data20180806.pkl', 'date_data20180807.pkl', 'date_data20180808.pkl']
Validating on date_data20180809.pkl
Score on training set: [[1.         0.07215098]
 [0.07215098 1.        ]]
Score on validation set: [[1.         0.08452256]
 [0.08452256 1.        ]]
Training phrase 2...
Training on ['date_data20180803.pkl', 'date_data20180806.pkl', 'date_data20180807.pkl', 'date_data20180808.pkl', 'date_data20180809.pkl']
Validating on date_data20180810.pkl
Score on training set: [[1.         0.06148356]
 [0.06148356 1.        ]]
Score on validation set: [[1.        0.0683065]
 [0.0683065 1.       ]]
Training phrase 3...
Training on ['date_data20180806.pkl', 'date_data20180807.pkl', 'date_data20180808.pkl', 'date_data20180809.pkl', 'date_data20180810.pkl']
Validating on date_data20180813.pkl
Score on training set: [[1.         0.06341071]
 [0.06341071 1.        ]]
Score on validation set: [[1.         0.07411

In [15]:
# Keep only the off-diagonal entry [0][1] of each score matrix returned by model.run().
# NOTE(review): this rebinds `corrcoef` from matrices to scalars, so re-running this
# cell without re-running model.run() first will fail when indexing the scalars.
corrcoef = [score_matrix[0][1] for score_matrix in corrcoef]
print(corrcoef)

[0.08452256300905822, 0.0683064994816764, 0.07411157781158899, 0.04401068728440605, 0.07327787226405831, 0.0710482080214819, 0.06177323839587649, 0.0551595072666944, 0.0492013419391362, 0.08452256300905822, 0.0683064994816764, 0.07411157781158899, 0.04401068728440605, 0.07327787226405831, 0.0710482080214819, 0.06177323839587649, 0.0551595072666944, 0.0492013419391362]
