In [11]:
#

In [12]:
import json
import time
import numpy
import pandas
import datetime
from scipy import stats
from data.flat import load
from matplotlib import pyplot
from matplotlib import lines as mlines

In [13]:
# from mpydge.wrap.models.linear import LBR
from lightgbm import LGBMRegressor as LBR

In [14]:
from m_utils.measures import r2_adj
from m_utils.sampling import ts_sampler
from m_utils.feature_selection import recursive_batch

In [15]:
# Model hyper-parameter grids.
# './models_params.json' is parsed into `models_params`; a later cell looks up
# its 'LGBMRegressor' entry and expands it into individual parameter sets.
# (Presumably the file maps each model name to the candidate values of each
# parameter -- TODO confirm against the JSON file.)

import json  # NOTE(review): json is already imported in the first import cell

with open('./models_params.json') as f:
    models_params = json.load(f)

In [16]:
def multiply_params(params):
    """Expand a dict of candidate values into every parameter combination.

    Parameters
    ----------
    params : dict
        Maps each parameter name to an iterable (typically a list) of
        candidate values for that parameter.

    Returns
    -------
    list of dict
        One dict per combination, with keys in the same order as ``params``
        and kept as the original key objects (no conversion through numpy).
        The first key cycles fastest and the last key slowest. An empty
        ``params`` yields ``[{}]``; a parameter without any candidate value
        yields ``[]``.
    """
    combos = [{}]
    for key, values in params.items():
        # The new key's values drive the outer loop so that keys added
        # earlier keep cycling fastest in the resulting list.
        combos = [{**combo, key: value} for value in values for combo in combos]
    return combos


In [17]:
# NOTE(review): dim_models is not read by any cell visible here -- confirm it is still needed.
dim_models = 0
# Expand the 'LGBMRegressor' entry of models_params into one keyword dict per
# parameter combination; golags() reads this list as a global.
multiple_model_args = multiply_params(models_params['LGBMRegressor'])
# Displaying the bare name prints every combination.
# NOTE(review): the recorded output includes 'class_weight': 'balanced'; LightGBM
# documents class_weight for classification tasks -- confirm it is intended for
# a regressor.
multiple_model_args

[{'max_depth': -1,
  'learning_rate': 0.01,
  'n_estimators': 10,
  'class_weight': 'balanced',
  'subsample': 1.0,
  'reg_alpha': 0.0,
  'reg_lambda': 0.0,
  'n_jobs': -1},
 {'max_depth': 10,
  'learning_rate': 0.01,
  'n_estimators': 10,
  'class_weight': 'balanced',
  'subsample': 1.0,
  'reg_alpha': 0.0,
  'reg_lambda': 0.0,
  'n_jobs': -1},
 {'max_depth': 5,
  'learning_rate': 0.01,
  'n_estimators': 10,
  'class_weight': 'balanced',
  'subsample': 1.0,
  'reg_alpha': 0.0,
  'reg_lambda': 0.0,
  'n_jobs': -1},
 {'max_depth': 2,
  'learning_rate': 0.01,
  'n_estimators': 10,
  'class_weight': 'balanced',
  'subsample': 1.0,
  'reg_alpha': 0.0,
  'reg_lambda': 0.0,
  'n_jobs': -1},
 {'max_depth': -1,
  'learning_rate': 0.1,
  'n_estimators': 10,
  'class_weight': 'balanced',
  'subsample': 1.0,
  'reg_alpha': 0.0,
  'reg_lambda': 0.0,
  'n_jobs': -1},
 {'max_depth': 10,
  'learning_rate': 0.1,
  'n_estimators': 10,
  'class_weight': 'balanced',
  'subsample': 1.0,
  'reg_alpha': 0.0

In [18]:
def load_data(data, tsi_names, y_names, removes, test_rate, n_folds):
    """Split ``data`` into training and test arrays of features and targets.

    Feature columns are all columns of ``data`` that are neither listed in
    ``y_names`` or ``removes`` nor contain the substring 'LAG0'.

    ``tsi_names`` is accepted for interface compatibility but is not read.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test, X_)`` where ``X_train`` and
        ``Y_train`` are lists holding one array per training item produced by
        ``ts_sampler``, ``X_test`` and ``Y_test`` are arrays taken from its
        test output, and ``X_`` is the numpy array of feature column names.
    """
    # Work out which columns serve as features.
    excluded = y_names + removes
    feature_names = []
    for column in data.columns.values:
        if column in excluded or 'LAG0' in column:
            continue
        feature_names.append(column)

    # Sampling is delegated to ts_sampler (n_folds and test_rate are passed through).
    folds_train, frame_test = ts_sampler(data, n_folds, test_rate)

    X_ = numpy.array(feature_names)
    Y_ = numpy.array(y_names)

    X_train = [fold[X_].values for fold in folds_train]
    Y_train = [fold[Y_].values for fold in folds_train]
    X_test = frame_test[X_].values
    Y_test = frame_test[Y_].values

    print('Fold shape: {0}'.format(X_train[0].shape))
    print('Test shape: {0}'.format(X_test.shape))

    return X_train, Y_train, X_test, Y_test, X_
    

In [19]:
def golags(tsi_names, y_names, removes, test_rate=0.2, n_folds=1):
    """Fit one model per (parameter set, training fold) and report adjusted R2.

    Relies on names defined elsewhere in the notebook: the module-level
    ``data`` frame, the ``multiple_model_args`` list of keyword dicts, the
    ``LBR`` model class, and the ``load_data`` / ``r2_adj`` helpers.

    Parameters
    ----------
    tsi_names, y_names, removes, test_rate, n_folds
        Forwarded unchanged to ``load_data``.

    Returns
    -------
    pandas.DataFrame
        One row per (parameter set, fold) with columns ``Np`` (index of the
        parameter set), ``Nf`` (index of the fold), ``R2_adj_train``,
        ``R2_adj_test``, ``d1`` (number of feature columns), ``params`` (the
        keyword dict used) and ``X_adj_`` (feature names).
    """
    # 'd1' used to be declared twice here, producing a duplicated column label.
    columns = ['Np', 'Nf', 'R2_adj_train', 'R2_adj_test', 'd1', 'params', 'X_adj_']
    X_train, Y_train, X_test, Y_test, X_ = load_data(data, tsi_names, y_names, removes, test_rate, n_folds)
    model = LBR

    verbose_step = 100
    n_iters = len(multiple_model_args)
    print('N of expected iters = {0}'.format(n_iters))
    print('Started search: {0}'.format(datetime.datetime.now().isoformat()))

    # Rows are gathered in a plain list and turned into a frame once at the
    # end: DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas >= 2.0.
    records = []

    for i, params in enumerate(multiple_model_args):

        if i % verbose_step == 0:
            print('{0} / {1}'.format(i, n_iters))

        for j, (X_fold, Y_fold) in enumerate(zip(X_train, Y_train)):

            model_ = model(**params)
            model_.fit(X_fold, Y_fold.ravel())
            Y_hat_train = model_.predict(X_fold)
            Y_hat_test = model_.predict(X_test)
            # NOTE(review): the targets are passed to r2_adj as produced by
            # load_data (2-D, rows x targets) while fit() receives them
            # flattened; confirm r2_adj copes with that shape next to the
            # predictions without unintended broadcasting.
            records.append({
                'Np': i,
                'Nf': j,
                'R2_adj_train': r2_adj(Y_fold, Y_hat_train, X_fold.shape[0], X_fold.shape[1]),
                'R2_adj_test': r2_adj(Y_test, Y_hat_test, X_test.shape[0], X_test.shape[1]),
                'd1': X_fold.shape[1],
                'params': params,
                'X_adj_': X_,
            })

    print('Finished search: {0}'.format(datetime.datetime.now().isoformat()))

    return pandas.DataFrame(records, columns=columns)

In [20]:
# Read the dataset, expose the 'lag' column as 'news_horizon', and index the
# rows by (ticker, time, news_horizon) while keeping those columns available
# as regular columns too (drop=False). golags() reads `data` as a global.
d = './dataset.csv'
data = (
    pandas.read_csv(d)
    .rename(columns={'lag': 'news_horizon'})
    .set_index(['ticker', 'time', 'news_horizon'], drop=False)
    .sort_index()
)
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ticker,time,open_LAG0,close_LAG0,high_LAG0,low_LAG0,volume_LAG0,open_LAG1,close_LAG1,high_LAG1,...,volume_LAG99,open_LAG100,close_LAG100,high_LAG100,low_LAG100,volume_LAG100,id,title,news_horizon,news_time
ticker,time,news_horizon,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
MSFT,2019-08-30 15:08:00+00:00,3,MSFT,2019-08-30 15:08:00+00:00,-0.000292,-0.000438,-0.000365,-0.000511,0.539497,-0.000146,-0.000219,-0.000292,...,9.611399,-0.000359,0.000647,-0.000072,0.00036,18.300000,165358,From Jedis to astronauts: How NASA tapped a vi...,3,2019-08-30 15:05:00+00:00
MSFT,2019-08-30 15:08:00+00:00,3,MSFT,2019-08-30 15:08:00+00:00,-0.000292,-0.000438,-0.000365,-0.000511,0.539497,-0.000146,-0.000219,-0.000292,...,9.611399,-0.000359,0.000647,-0.000072,0.00036,18.300000,165350,How About a Mash Up of Green Bonds and Century...,3,2019-08-30 15:05:00+00:00
MSFT,2019-08-30 15:08:00+00:00,6,MSFT,2019-08-30 15:08:00+00:00,-0.000292,-0.000438,-0.000365,-0.000511,0.539497,-0.000146,-0.000219,-0.000292,...,9.611399,-0.000359,0.000647,-0.000072,0.00036,18.300000,165205,Brexit endgame: It's PM Johnson vs parliament ...,6,2019-08-30 15:02:00+00:00
MSFT,2019-08-30 15:08:00+00:00,6,MSFT,2019-08-30 15:08:00+00:00,-0.000292,-0.000438,-0.000365,-0.000511,0.539497,-0.000146,-0.000219,-0.000292,...,9.611399,-0.000359,0.000647,-0.000072,0.00036,18.300000,165253,The Latest: Scottish court moves up case on UK...,6,2019-08-30 15:02:00+00:00
MSFT,2019-08-30 15:08:00+00:00,9,MSFT,2019-08-30 15:08:00+00:00,-0.000292,-0.000438,-0.000365,-0.000511,0.539497,-0.000146,-0.000219,-0.000292,...,9.611399,-0.000359,0.000647,-0.000072,0.00036,18.300000,165103,Deutsche Post sees interest from potential Str...,9,2019-08-30 14:59:00+00:00
MSFT,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MSFT,2019-09-26 20:00:00+00:00,94,MSFT,2019-09-26 20:00:00+00:00,-0.000072,0.000716,-0.000215,0.000933,4.762202,0.000932,0.000143,0.001218,...,-0.407206,0.000644,-0.000143,0.000215,0.00043,0.827525,383553,Timeline: The alarming pattern of actions by T...,94,2019-09-26 18:26:00+00:00
MSFT,2019-09-26 20:00:00+00:00,94,MSFT,2019-09-26 20:00:00+00:00,-0.000072,0.000716,-0.000215,0.000933,4.762202,0.000932,0.000143,0.001218,...,-0.407206,0.000644,-0.000143,0.000215,0.00043,0.827525,383482,UK royals give Boaty McBoatface polar ship its...,94,2019-09-26 18:26:00+00:00
MSFT,2019-09-26 20:00:00+00:00,94,MSFT,2019-09-26 20:00:00+00:00,-0.000072,0.000716,-0.000215,0.000933,4.762202,0.000932,0.000143,0.001218,...,-0.407206,0.000644,-0.000143,0.000215,0.00043,0.827525,383461,UPDATE 1-Unipec replaces ship charters after U...,94,2019-09-26 18:26:00+00:00
MSFT,2019-09-26 20:00:00+00:00,94,MSFT,2019-09-26 20:00:00+00:00,-0.000072,0.000716,-0.000215,0.000933,4.762202,0.000932,0.000143,0.001218,...,-0.407206,0.000644,-0.000143,0.000215,0.00043,0.827525,383481,Ukrainian leader's rivals use Trump call to ki...,94,2019-09-26 18:26:00+00:00


In [21]:
# NOTE(review): tsi_names is forwarded by golags() to load_data(), which never
# reads it -- confirm whether it is still needed.
tsi_names = ['news_time']
# Target column(s) the models are fitted on.
y_names = ['open_LAG0']
# Columns kept out of the feature set, in addition to the targets and to any
# other column whose name contains 'LAG0'.
removes = ['ticker', 'time', 'id', 'title', 'news_time']

# Run the search over every entry of multiple_model_args; n_folds=5 is passed
# through to ts_sampler, test_rate keeps its default of 0.2.
report = golags(tsi_names, y_names, removes, n_folds=5)
report

Fold shape: (21483, 501)
Test shape: (36978, 501)


KeyboardInterrupt: 