In [1]:
#

In [2]:
import json
import time
import numpy
import pandas
from scipy import stats
from data.flat import load
from matplotlib import pyplot
from matplotlib import lines as mlines

  from pandas.util.testing import assert_frame_equal


In [3]:
from m_utils.measures import r2_adj
from m_utils.sampling import ts_sampler
from m_utils.summarization import get_ols_summary
from m_utils.feature_selection import recursive_batch

In [4]:
def load_data(data, tsi_names, y_names, removes, test_rate, n_folds):
    """Split ``data`` into per-fold training arrays and a single test array.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; its columns supply both the features and the targets.
    tsi_names : list
        Accepted for interface compatibility; not used inside this function.
    y_names : list
        Target column names. They are excluded from the feature set.
    removes : list
        Additional column names to exclude from the feature set.
    test_rate, n_folds
        Forwarded unchanged to ``ts_sampler`` (as ``ts_sampler(data, n_folds, test_rate)``).

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test, X_)`` where ``X_train`` / ``Y_train`` are
        lists with one array per training fold, ``X_test`` / ``Y_test`` are arrays taken
        from the test frame, and ``X_`` is a numpy array of the selected feature names.
    """
    # Feature columns: everything that is neither a target nor explicitly removed,
    # and whose name does not contain 'LAG0'.
    excluded = y_names + removes
    feature_names = []
    for column in data.columns.values:
        if column in excluded or 'LAG0' in column:
            continue
        feature_names.append(column)

    # ts_sampler's result is unpacked as (iterable of training frames, test frame).
    # NOTE(review): the exact splitting strategy lives in m_utils.sampling - confirm there.
    folds, holdout = ts_sampler(data, n_folds, test_rate)

    X_ = numpy.array(feature_names)
    Y_ = numpy.array(y_names)

    # Two separate passes over the folds, features first and then targets.
    X_train = [fold[X_].values for fold in folds]
    Y_train = [fold[Y_].values for fold in folds]

    X_test = holdout[X_].values
    Y_test = holdout[Y_].values

    print('Fold shape: {0}'.format(X_train[0].shape))
    print('Test shape: {0}'.format(X_test.shape))

    return X_train, Y_train, X_test, Y_test, X_
    

In [5]:
def censory(model, XX, YY):
    """Return a boolean mask marking which coefficients have p-value below 0.05.

    Parameters
    ----------
    model
        Passed through to ``get_ols_summary`` untouched.
    XX : array-like with a ``shape`` attribute
        Feature matrix; ``XX.shape[1]`` is the number of feature columns.
    YY
        Target values, passed through to ``get_ols_summary``.

    Returns
    -------
    numpy.ndarray
        ``numpy.array([False])`` when ``XX`` has no columns; otherwise the element-wise
        comparison ``p_values < 0.05``.
    """
    n_features = XX.shape[1]

    # Nothing to evaluate: return a single-element all-False mask without fitting.
    if n_features == 0:
        return numpy.array([False])

    summary = get_ols_summary(model, XX, YY, list(range(n_features)))
    # The first entry of 'Probabilities' is skipped.
    # NOTE(review): presumably that row is a constant/intercept term - confirm
    # against get_ols_summary.
    p_values = summary['Probabilities'].values[1:]
    return p_values < 0.05

In [6]:
def golags(tsi_names, y_names, removes, test_rate=0.2, n_folds=1):
    """Run ``recursive_batch`` feature selection on every training fold and tabulate the results.

    Reads the module-level ``data`` frame (it is not a parameter), so ``data`` must be
    defined before this function is called.

    Parameters
    ----------
    tsi_names, y_names, removes : list
        Forwarded to ``load_data`` together with the module-level ``data``.
    test_rate : float, default 0.2
        Forwarded to ``load_data``.
    n_folds : int, default 1
        Forwarded to ``load_data``.

    Returns
    -------
    pandas.DataFrame
        One row per training fold with columns ``'R2_adj'`` (value returned by ``r2_adj``
        on the training predictions), ``'d1'`` (number of columns left in the adjusted
        training matrix) and ``'X_adj_'`` (the sixth value returned by ``recursive_batch``).
        An empty frame with the same columns is returned when there are no folds.
    """
    columns = ['R2_adj', 'd1', 'X_adj_']
    X_train, Y_train, X_test, Y_test, X_ = load_data(data, tsi_names, y_names, removes, test_rate, n_folds)
    params = {'n_jobs': -1, 'fit_intercept': False}
    # NOTE(review): LinearRegression is not imported in any visible cell; presumably
    # sklearn.linear_model.LinearRegression - confirm and add it to the imports cell,
    # otherwise this line raises NameError on a fresh kernel.
    model = LinearRegression

    # Collect one record per fold and build the frame once at the end. DataFrame.append
    # inside a loop copies the frame every iteration and no longer exists in pandas >= 2.0.
    records = []
    for X_fold, Y_fold in zip(X_train, Y_train):
        model_adj, X_train_adj, Y_train_adj, X_test_adj, Y_test_adj, X_adj_ = recursive_batch(
            model, params, X_fold, Y_fold, censory, X_test, Y_test, X_)
        Y_hat_train_adj = model_adj.predict(X_train_adj)
        # NOTE(review): the test-set prediction is computed but not reported; kept so the
        # sequence of calls made on model_adj is unchanged.
        Y_hat_test_adj = model_adj.predict(X_test_adj)

        n_rows, n_cols = X_train_adj.shape[0], X_train_adj.shape[1]
        records.append({
            'R2_adj': r2_adj(Y_train_adj, Y_hat_train_adj, n_rows, n_cols),
            'd1': n_cols,
            'X_adj_': X_adj_,
        })

    return pandas.DataFrame(records, columns=columns)

In [None]:
# Load the dataset, rename 'lag' to 'news_horizon', and index/sort the rows by
# (ticker, time, news_horizon) while keeping those columns in the frame (drop=False).
d = './dataset.csv'
data = (
    pandas.read_csv(d)
    .rename(columns={'lag': 'news_horizon'})
    .set_index(['ticker', 'time', 'news_horizon'], drop=False)
    .sort_index()
)

# Column roles handed to golags; y_names and removes are excluded from the features.
tsi_names = ['news_time']
y_names = ['open_LAG0']
removes = ['ticker', 'time', 'id', 'title', 'news_time']

# golags reads the module-level `data` defined above.
report = golags(tsi_names, y_names, removes, n_folds=5)
report