Replicate [Dynamic Return Dependencies Across Industries: A Machine Learning Approach](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3120110&download=yes) by David Rapach, Jack Strauss, Jun Tu and Guofu Zhou.

1) Use Keras NNs instead of linear regression

2) Add additional variables, 3 and 12-month MA, interest rate change, yield curve, Mkt-RF, seasonal dummy variables. With cross-validation and regularization we hope to do that without overfitting and possibly produce a better result.


In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import pandas_datareader.data as datareader

import time 
import datetime
import copy
import random
from itertools import product

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #Hide messy TensorFlow warnings
warnings.filterwarnings("ignore") #Hide messy numpy warnings

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
from sklearn.linear_model import LinearRegression, Lasso, lasso_path, lars_path, LassoLarsIC
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler


import tensorflow as tf
tf.set_random_seed(1764)
print(tf.__version__)
# confirm GPU is in use
with tf.device('/gpu:0'):
    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
    c = tf.matmul(a, b)

with tf.Session() as sess:
    print (sess.run(c))
    
import keras
from keras.layers.core import Dense, Activation
from keras.layers import Input
from keras.models import Model

from keras.layers.recurrent import LSTM, GRU
from keras.regularizers import l1
from keras.models import Sequential
from keras.models import load_model

import ffn
%matplotlib inline

import plotly as py
# print (py.__version__) # requires version >= 1.9.0
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff

init_notebook_mode(connected=True)

random.seed(1764)
np.random.seed(1764)


1.6.0
[[22. 28.]
 [49. 64.]]


Using TensorFlow backend.


In [2]:
print("Loading data...")
data = pd.read_csv("30_Industry_Portfolios.csv")
data = data.set_index('yyyymm')
industries = list(data.columns)
# map industry names to col nums
ind_reverse_dict = dict([(industries[i], i) for i in range(len(industries))])

rfdata = pd.read_csv("F-F_Research_Data_Factors.csv")
rfdata = rfdata.set_index('yyyymm')
data['rf'] = rfdata['RF']

# subtract risk-free rate
# create a response variable led by 1 period to predict
for ind in industries:
    data[ind] = data[ind] - data['rf']

    
# add rates data from FRED
start_date = datetime.datetime(1926, 9, 1)
end_date = datetime.datetime(2017, 12, 1)
TB3MS = datareader.DataReader("TB3MS", "fred", start_date, end_date)
TB3MS['yyyymm'] = TB3MS.index.strftime('%Y%m')
TB3MS['yyyymm'] = [int(datestr) for datestr in TB3MS['yyyymm']]
TB3MS=TB3MS.set_index(['yyyymm'])
data['3month']=TB3MS['TB3MS']

GS10 =  datareader.DataReader("GS10", "fred", start_date, end_date)
GS10['yyyymm'] = GS10.index.strftime('%Y%m')
GS10['yyyymm'] = [int(datestr) for datestr in GS10['yyyymm']]
GS10=GS10.set_index(['yyyymm'])
data['10year']=GS10['GS10']

data['curve'] = data['10year'] - data['3month']
data['10year'] = data['10year'].diff() # first difference 10-year yield
data['3month'] = data['3month'].diff() # first difference 3-month

data['month'] = data.index  % 100 # for possible seasonality
month_dummy = keras.utils.to_categorical(data['month'])
for i in range(month_dummy.shape[1]):
    data['month_dummy_%02d' % i] = month_dummy[:,i]
data = data.drop(columns=['month'])    
#data[['month_dummy_%02d' % i for i in range(12)]]

data['Mkt-RF'] = rfdata['Mkt-RF']

for ind in industries + ['3month', '10year', 'curve', 'Mkt-RF',]:
    data[ind+".3m"] = pd.rolling_mean(data[ind],3)
    
#for ind in industries + ['3month', '10year', 'curve', 'Mkt-RF',]:
#    data[ind+".6m"] = pd.rolling_mean(data[ind],6)

for ind in industries + ['3month', '10year', 'curve', 'Mkt-RF',]:
    data[ind+".12m"] = pd.rolling_mean(data[ind],12)

for ind in industries:
    data[ind+".lead"] = data[ind].shift(-1)

data = data.loc[data.index[data.index > 195911]]
data = data.drop(columns=['rf'])    
data = data.dropna(axis=0, how='any')

nresponses = len(industries)
npredictors = data.shape[1]-nresponses

predictors = list(data.columns[:npredictors])
predictor_reverse_dict = dict([(predictors[i], i) for i in range(len(predictors))])

responses = list(data.columns[-nresponses:])
response_reverse_dict = dict([(responses[i], i) for i in range(len(responses))])

print(data.shape)
print(list(data.columns))
data[['3month', '10year', 'curve', 'Mkt-RF',]]


Loading data...
(697, 145)
['Food', 'Beer', 'Smoke', 'Games', 'Books', 'Hshld', 'Clths', 'Hlth', 'Chems', 'Txtls', 'Cnstr', 'Steel', 'FabPr', 'ElcEq', 'Autos', 'Carry', 'Mines', 'Coal', 'Oil', 'Util', 'Telcm', 'Servs', 'BusEq', 'Paper', 'Trans', 'Whlsl', 'Rtail', 'Meals', 'Fin', 'Other', '3month', '10year', 'curve', 'month_dummy_00', 'month_dummy_01', 'month_dummy_02', 'month_dummy_03', 'month_dummy_04', 'month_dummy_05', 'month_dummy_06', 'month_dummy_07', 'month_dummy_08', 'month_dummy_09', 'month_dummy_10', 'month_dummy_11', 'month_dummy_12', 'Mkt-RF', 'Food.3m', 'Beer.3m', 'Smoke.3m', 'Games.3m', 'Books.3m', 'Hshld.3m', 'Clths.3m', 'Hlth.3m', 'Chems.3m', 'Txtls.3m', 'Cnstr.3m', 'Steel.3m', 'FabPr.3m', 'ElcEq.3m', 'Autos.3m', 'Carry.3m', 'Mines.3m', 'Coal.3m', 'Oil.3m', 'Util.3m', 'Telcm.3m', 'Servs.3m', 'BusEq.3m', 'Paper.3m', 'Trans.3m', 'Whlsl.3m', 'Rtail.3m', 'Meals.3m', 'Fin.3m', 'Other.3m', '3month.3m', '10year.3m', 'curve.3m', 'Mkt-RF.3m', 'Food.12m', 'Beer.12m', 'Smoke.12m',

Unnamed: 0_level_0,3month,10year,curve,Mkt-RF
yyyymm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
195912,0.34,0.16,0.20,2.45
196001,-0.14,0.03,0.37,-6.98
196002,-0.39,-0.23,0.53,1.17
196003,-0.65,-0.24,0.94,-1.63
196004,-0.08,0.03,1.05,-1.71
196005,0.06,0.07,1.06,3.12
196006,-0.83,-0.20,1.69,2.08
196007,-0.16,-0.25,1.60,-2.37
196008,0.00,-0.10,1.50,3.01
196009,0.18,0.00,1.32,-5.99


In [3]:
# exclude 2017 and later to tie to paper
#data = data.loc[data.index[data.index < 201701]]
data = data.loc[data.index[data.index > 195911]]
data


Unnamed: 0_level_0,Food,Beer,Smoke,Games,Books,Hshld,Clths,Hlth,Chems,Txtls,...,Telcm.lead,Servs.lead,BusEq.lead,Paper.lead,Trans.lead,Whlsl.lead,Rtail.lead,Meals.lead,Fin.lead,Other.lead
yyyymm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195912,2.01,0.35,-3.02,1.64,7.29,0.67,1.87,-1.97,3.08,0.74,...,0.62,-6.18,-7.93,-9.41,-4.31,-5.33,-6.09,-10.08,-4.68,-3.98
196001,-4.49,-5.71,-2.05,1.21,-5.47,-7.84,-8.53,-6.68,-10.03,-4.77,...,8.07,9.13,5.09,3.00,-0.94,1.42,4.00,1.81,-0.98,6.32
196002,3.35,-2.14,2.27,4.23,2.39,9.31,1.44,-0.02,-0.74,0.32,...,-0.21,-0.31,3.34,-2.43,-4.99,-1.37,-0.13,-3.88,0.05,-2.43
196003,-1.67,-2.94,-0.18,-0.65,2.18,-0.56,-2.59,1.26,-2.75,-6.79,...,-1.24,7.14,1.77,0.41,-2.13,0.45,-0.53,8.86,-0.64,0.55
196004,1.17,-2.16,1.35,6.46,-1.17,-1.27,0.21,1.49,-5.53,-1.10,...,3.05,-1.75,11.90,2.85,0.90,1.65,3.11,0.80,-0.45,1.02
196005,8.20,-0.52,2.44,7.28,11.67,7.74,1.74,13.50,3.40,2.10,...,-0.58,-8.07,2.39,3.50,2.17,5.96,3.41,1.03,3.72,6.41
196006,5.39,0.47,4.73,2.24,0.02,6.38,-1.59,-0.40,0.45,4.04,...,-0.03,2.84,-2.02,-4.10,-3.11,-6.16,-2.99,-1.25,0.09,-5.95
196007,-2.11,-0.79,4.60,-4.72,0.23,-0.60,-1.10,-3.99,-6.80,-3.14,...,6.94,5.69,2.71,1.18,1.98,4.51,2.85,2.05,3.47,3.48
196008,4.57,3.24,5.20,7.16,3.63,5.09,3.34,2.29,1.17,-0.84,...,-6.07,-3.53,-7.61,-7.37,-7.07,-8.44,-8.57,-1.90,-5.78,-4.21
196009,-3.88,-5.00,-2.09,-2.33,-6.20,-9.18,-4.23,-8.87,-6.70,-5.25,...,-0.08,4.62,-3.40,-1.85,-1.02,-4.22,0.31,-4.54,-0.40,0.38


In [4]:
data.to_csv("data.csv")
desc = data.describe()
desc
# min, max line up with Table 1

Unnamed: 0,Food,Beer,Smoke,Games,Books,Hshld,Clths,Hlth,Chems,Txtls,...,Telcm.lead,Servs.lead,BusEq.lead,Paper.lead,Trans.lead,Whlsl.lead,Rtail.lead,Meals.lead,Fin.lead,Other.lead
count,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,...,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0
mean,0.688666,0.72703,0.985079,0.732095,0.532253,0.564333,0.690387,0.665825,0.552367,0.687145,...,0.515968,0.729928,0.62297,0.534806,0.60109,0.631076,0.698235,0.728766,0.637547,0.396628
std,4.30866,5.058992,6.032324,7.12817,5.780362,4.728,6.355251,4.897557,5.482363,6.970961,...,4.607931,6.486956,6.698787,5.021876,5.707154,5.57104,5.334178,6.065564,5.381389,5.771655
min,-18.15,-20.19,-25.32,-33.4,-26.56,-22.24,-31.5,-21.06,-28.6,-33.11,...,-16.44,-28.67,-32.07,-27.74,-28.5,-29.25,-29.74,-31.89,-22.53,-28.09
25%,-1.63,-2.08,-2.74,-3.39,-2.6,-2.03,-2.8,-2.23,-2.75,-3.17,...,-2.11,-3.05,-3.22,-2.4,-2.78,-2.56,-2.38,-2.84,-2.4,-2.93
50%,0.74,0.75,1.27,0.94,0.51,0.75,0.7,0.76,0.72,0.64,...,0.59,1.01,0.67,0.71,0.9,0.94,0.54,1.08,0.87,0.54
75%,3.07,3.69,4.66,5.26,3.64,3.54,4.31,3.55,3.76,4.48,...,3.36,4.26,4.63,3.46,4.04,3.88,3.98,4.3,4.0,4.2
max,19.89,25.51,32.38,34.52,33.13,18.22,31.79,29.01,21.68,59.03,...,21.22,23.38,24.66,21.0,18.5,17.53,26.49,27.38,20.59,19.96


In [5]:
# annualized returns don't match Table 1, oddly
# geometric mean, annualized
pd.DataFrame((np.prod(data/100 + 1)**(12.0/len(data))-1)[:30], columns=['Mean Ann. Return'])

Unnamed: 0,Mean Ann. Return
Food,0.073929
Beer,0.074309
Smoke,0.100741
Games,0.058342
Books,0.044662
Hshld,0.055568
Clths,0.060067
Hlth,0.067552
Chems,0.049242
Txtls,0.054817


In [6]:
# try this way, arithmetic mean then annualize (not very correct)
#print(pd.DataFrame(((desc.loc['mean']/100+1)**12-1)[:30]))
#nope

# same
pd.DataFrame(((1 + np.mean(data, axis=0)/100)**12 -1)[:30], columns=['Mean Ann. Return'])

Unnamed: 0,Mean Ann. Return
Food,0.085843
Beer,0.090818
Smoke,0.124829
Games,0.091476
Books,0.065774
Hshld,0.069862
Clths,0.086066
Hlth,0.082891
Chems,0.068335
Txtls,0.085646


In [7]:
#annualized volatility 
pd.DataFrame((desc.loc['std']*np.sqrt(12))[:30].round(2))
# lines up with table 1

Unnamed: 0,std
Food,14.93
Beer,17.52
Smoke,20.9
Games,24.69
Books,20.02
Hshld,16.38
Clths,22.02
Hlth,16.97
Chems,18.99
Txtls,24.15


In [8]:
# Run LASSO, then OLS on selected variables

# skip last row to better match published r-squared
# looks like they forecast actuals 1960-2016 using 1959m12 to 2016m11
# not exact matches to Table 2 R-squared but almost within rounding error 
X = data.values[:,:npredictors]
Y = data.values[:,-nresponses:]
nrows = X.shape[0]
X.shape

(697, 115)

In [9]:
predictors

['Food',
 'Beer',
 'Smoke',
 'Games',
 'Books',
 'Hshld',
 'Clths',
 'Hlth',
 'Chems',
 'Txtls',
 'Cnstr',
 'Steel',
 'FabPr',
 'ElcEq',
 'Autos',
 'Carry',
 'Mines',
 'Coal',
 'Oil',
 'Util',
 'Telcm',
 'Servs',
 'BusEq',
 'Paper',
 'Trans',
 'Whlsl',
 'Rtail',
 'Meals',
 'Fin',
 'Other',
 '3month',
 '10year',
 'curve',
 'month_dummy_00',
 'month_dummy_01',
 'month_dummy_02',
 'month_dummy_03',
 'month_dummy_04',
 'month_dummy_05',
 'month_dummy_06',
 'month_dummy_07',
 'month_dummy_08',
 'month_dummy_09',
 'month_dummy_10',
 'month_dummy_11',
 'month_dummy_12',
 'Mkt-RF',
 'Food.3m',
 'Beer.3m',
 'Smoke.3m',
 'Games.3m',
 'Books.3m',
 'Hshld.3m',
 'Clths.3m',
 'Hlth.3m',
 'Chems.3m',
 'Txtls.3m',
 'Cnstr.3m',
 'Steel.3m',
 'FabPr.3m',
 'ElcEq.3m',
 'Autos.3m',
 'Carry.3m',
 'Mines.3m',
 'Coal.3m',
 'Oil.3m',
 'Util.3m',
 'Telcm.3m',
 'Servs.3m',
 'BusEq.3m',
 'Paper.3m',
 'Trans.3m',
 'Whlsl.3m',
 'Rtail.3m',
 'Meals.3m',
 'Fin.3m',
 'Other.3m',
 '3month.3m',
 '10year.3m',
 'curve.3m

In [10]:
def subset_selection(X, Y, model_aic, verbose=False, responses=responses, predictors=predictors):
    
    nrows, npreds = X.shape
    nows, nresps = Y.shape
    coef_dict = []
    
    for response_index in range(nresps):
        y = Y[:,response_index]
        model_aic.fit(X, y)
        predcols = [i for i in range(npreds) if model_aic.coef_[i] !=0]

        #y_response = model_aic.predict(X)
        # print ("In-sample LASSO R-squared: %.6f" % r2_score(y, y_response))
        if verbose and responses:
            print("LASSO variables selected for %s: " % responses[response_index])
            print([predictors[i] for i in predcols])
        
        if not predcols:
            if verbose and responses:
                print("No coefs selected for " + responses[response_index] + ", using all")
                print("---")
            predcols = list(range(npreds))    
            
        # fit OLS vs. selected vars, better fit w/o LASSO penalties
        # in-sample R-squared using LASSO coeffs
        coef_dict.append(predcols)
        if verbose and responses and predictors:
            print("Running OLS for " + responses[response_index] + " against " + str([predictors[i] for i in predcols]))
            # col nums of selected responses
            model_ols = LinearRegression()
            model_ols.fit(X[:, predcols], y)
            y_pred = model_ols.predict(X[:, predcols])
            print ("In-sample OLS R-squared: %.2f%%" % (100 * r2_score(y, y_pred)))
            print("---")
            
    return coef_dict

#coef_dict = subset_selection(X, Y, LassoLarsIC(criterion='aic'))
coef_dict = subset_selection(X, Y, LassoLarsIC(criterion='aic'), verbose=True, responses=responses, predictors=predictors)
print(coef_dict)
# These subsets line up closely with Table 2
# except Clths, Whlsl, we get different responses

LASSO variables selected for Food.lead: 
['10year', 'Mines.12m']
Running OLS for Food.lead against ['10year', 'Mines.12m']
In-sample OLS R-squared: 2.25%
---
LASSO variables selected for Beer.lead: 
['Food', 'Clths', 'Coal', '3month', '10year', 'month_dummy_02', 'month_dummy_04', 'month_dummy_07', 'month_dummy_08', 'Beer.3m', 'Hlth.3m', 'Util.3m', 'Mines.12m', 'Coal.12m', 'Servs.12m', '3month.12m']
Running OLS for Beer.lead against ['Food', 'Clths', 'Coal', '3month', '10year', 'month_dummy_02', 'month_dummy_04', 'month_dummy_07', 'month_dummy_08', 'Beer.3m', 'Hlth.3m', 'Util.3m', 'Mines.12m', 'Coal.12m', 'Servs.12m', '3month.12m']
In-sample OLS R-squared: 8.42%
---
LASSO variables selected for Smoke.lead: 
['Txtls', 'Carry', 'Coal', 'Oil', 'Util', 'Telcm', 'Servs', '3month', 'month_dummy_05', 'month_dummy_07', 'month_dummy_08', 'month_dummy_09', 'month_dummy_10', 'month_dummy_11', 'Food.3m', 'Beer.3m', 'Chems.3m', 'ElcEq.3m', 'Mines.3m', 'Util.3m', 'Servs.3m', 'Paper.3m', 'Other.3m', '

LASSO variables selected for Telcm.lead: 
['Fin', 'month_dummy_11', 'Beer.3m', 'Carry.3m', 'Mines.12m', 'Coal.12m', 'Telcm.12m', 'curve.12m']
Running OLS for Telcm.lead against ['Fin', 'month_dummy_11', 'Beer.3m', 'Carry.3m', 'Mines.12m', 'Coal.12m', 'Telcm.12m', 'curve.12m']
In-sample OLS R-squared: 5.98%
---
LASSO variables selected for Servs.lead: 
['3month', '10year']
Running OLS for Servs.lead against ['3month', '10year']
In-sample OLS R-squared: 1.93%
---
LASSO variables selected for BusEq.lead: 
['Books', 'Util', '3month', 'month_dummy_05', 'month_dummy_08', 'month_dummy_10', 'month_dummy_12', '10year.3m', 'Smoke.12m', 'Txtls.12m', 'Steel.12m', 'Telcm.12m', '3month.12m']
Running OLS for BusEq.lead against ['Books', 'Util', '3month', 'month_dummy_05', 'month_dummy_08', 'month_dummy_10', 'month_dummy_12', '10year.3m', 'Smoke.12m', 'Txtls.12m', 'Steel.12m', 'Telcm.12m', '3month.12m']
In-sample OLS R-squared: 8.90%
---
LASSO variables selected for Paper.lead: 
['Clths', 'ElcEq', 'Ca

In [11]:
def predict_with_subsets(X, Y, create_model, coef_dict, verbose=False):
    """evaluate subset selection, pass a model function and subsets, compute avg R-squared"""
    global responses

    nrows, ncols = Y.shape
    model = create_model()
    
    scores = []
    for response_col in range(ncols):
        y = Y[:,response_col]

#        print("LASSO variables selected for %s: " % pred)
#        print(coef_dict[pred])
        
        if not coef_dict[response_col]:
            if verbose:
                print("No coefs selected for " + responses[response_col])
 #           print("---")
            continue
        # fit model vs. selected vars, better fit w/o LASSO penalties
        # in-sample R-squared using LASSO coeffs
        #print("Running model for " + pred + " against " + str(coef_dict[pred]))
        # col nums of selected predictors
        predcols = coef_dict[response_col]
        model.fit(X[:, predcols], y)
        y_pred = model.predict(X[:, predcols])
        score = r2_score(y, y_pred)
        scores.append(score)
        if verbose:
            print ("In-sample R-squared: %.2f%% for %s against %s" % (score*100, responses[response_col], 
                                                                      str([predictors[i] for i in coef_dict[response_col]])))
#        print("---")
    
    if verbose:
        print("Mean R-squared: %.2f%%" % (100 * np.mean(np.array(scores))))
    return np.mean(np.array(scores))
    
predict_with_subsets(X, Y, LinearRegression, coef_dict, verbose=True)


In-sample R-squared: 2.25% for Food.lead against ['10year', 'Mines.12m']
In-sample R-squared: 8.42% for Beer.lead against ['Food', 'Clths', 'Coal', '3month', '10year', 'month_dummy_02', 'month_dummy_04', 'month_dummy_07', 'month_dummy_08', 'Beer.3m', 'Hlth.3m', 'Util.3m', 'Mines.12m', 'Coal.12m', 'Servs.12m', '3month.12m']
In-sample R-squared: 15.45% for Smoke.lead against ['Txtls', 'Carry', 'Coal', 'Oil', 'Util', 'Telcm', 'Servs', '3month', 'month_dummy_05', 'month_dummy_07', 'month_dummy_08', 'month_dummy_09', 'month_dummy_10', 'month_dummy_11', 'Food.3m', 'Beer.3m', 'Chems.3m', 'ElcEq.3m', 'Mines.3m', 'Util.3m', 'Servs.3m', 'Paper.3m', 'Other.3m', 'Food.12m', 'Smoke.12m', 'Games.12m', 'Hshld.12m', 'Hlth.12m', 'FabPr.12m', 'Paper.12m', '10year.12m']
In-sample R-squared: 8.70% for Games.lead against ['Books', 'Clths', 'Coal', 'Fin', '3month', 'curve', 'month_dummy_05', 'month_dummy_06', 'month_dummy_08', 'month_dummy_12', 'Oil.12m', '3month.12m']
In-sample R-squared: 14.67% for Books.

0.0942178600176278

In [12]:
# use all predictors - higher in-sample R-squared
coef_dict_all = []
for _ in responses:
    coef_dict_all.append(range(len(predictors)))
predict_with_subsets(X, Y, LinearRegression, coef_dict_all, verbose=False)


0.21415481234430905

In [13]:
# first iteration will train up to including 196911
# will use 196912 to predict 197001
# 1970101 will be first month of performance to use
# train on first 121 months up to 196912 (0:120), put first prediction in P[121] (122nd row)
# first month of performance will be 197002
FIRST_TRAIN_MONTHS = 121
FIRST_PREDICT_MONTH = FIRST_TRAIN_MONTHS # This is stupid but keeps my head straight

print(X[FIRST_TRAIN_MONTHS])
print(data.iloc[FIRST_TRAIN_MONTHS][:30])

[-3.34000000e+00 -1.95000000e+00 -7.59000000e+00 -7.76000000e+00
 -1.20500000e+01 -7.50000000e+00 -5.69000000e+00 -7.71000000e+00
 -7.37000000e+00 -5.26000000e+00 -9.84000000e+00 -6.31000000e+00
 -7.15000000e+00 -6.89000000e+00 -9.35000000e+00 -1.24900000e+01
 -2.34000000e+00 -7.70000000e-01 -1.21600000e+01 -4.83000000e+00
 -3.16000000e+00 -1.11700000e+01 -9.73000000e+00 -8.89000000e+00
 -8.17000000e+00 -8.28000000e+00 -6.31000000e+00 -1.31200000e+01
 -9.78000000e+00 -6.20000000e+00  5.00000000e-02  1.40000000e-01
 -8.00000000e-02  0.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -8.10000000e+00 -2.28666667e+00
 -2.18000000e+00 -3.36000000e+00 -7.00333333e+00 -6.82000000e+00
 -3.37666667e+00 -5.32666667e+00 -1.41000000e+00 -5.58666667e+00
 -5.43333333e+00 -6.02666667e+00 -4.45000000e+00 -4.68333333e+00
 -4.67666667e+00 -5.93666

In [14]:
class PredictWrapper():
    """Wrap an sklearn model e.g. LinearRegression to fit, predict all vars as a vector, 
    match the way our Keras model will do it"""

    def __init__(self, create_model, coef_dict):
        self.create_model = create_model
        self.coef_dict = coef_dict
        self.models = []
        
    def fit(self, X_fit, Y_fit, verbose=False):
        
        self.nrows, self.ycols = Y_fit.shape
        
        self.models = []
        # fit model for each column
        for responsecol in range(self.ycols):
            if not self.coef_dict[responsecol]:
                # don't fit
                self.models.append(None)
                continue
                
            # column indexes to fit against each other
            predcols = self.coef_dict[responsecol]
            if verbose:
                print("fitting on " + str(X_fit[:, predcols].shape) + str(predcols))
            model = self.create_model()
            model.fit(X_fit[:, predcols], Y_fit[:,responsecol])
            self.models.append(model)
                
            #debug
            #print(responsecol)
            #print(X_fit[:, predcols])
            #print("=====")
            #print(Y_fit[:,responsecol])
            #print("=====")
            #print(self.model.coef_)
            #print(self.model.intercept_)
            #print("=====")                

    def predict(self, X_predict, verbose=False):

        predictions = []

        for responsecol in range(self.ycols):
            if not self.coef_dict[responsecol]:
                # don't predict
                #print('skip')
                predictions.append([np.nan])
                continue
        
            if verbose:
                print("predicting on" + str(X_predict[:, predcols].shape) + str(predcols))

            predcols = self.coef_dict[responsecol]
            y_pred = self.models[responsecol].predict(X_predict[:,predcols])
            predictions.append(y_pred)
        
        return np.array(predictions).transpose()
        
        

In [34]:
class BacktestModel():
    
    def __init__(self, 
                 X, # predictors
                 Y, # responses
                 create_model, # sklearn function to initialize model e.g. LinearRegression
                 coef_dict_param="all", # mapping of predictors to responses ("all", "timestep", or a list of lists)
                 startindex=FIRST_TRAIN_MONTHS,
                 scaler=None):
        
        Xrows, Xcols = X.shape
        Yrows, Ycols = Y.shape
        
        if Xrows != Yrows:
            raise(ValueError, "Shapes differ: X %s, Y %s" % (str(X.shape), str(Y.shape)))            
            
        self.X = X
        self.Y = Y
        self.Xscale = X.copy()
        self.Yscale = Y.copy()

        if scaler:
            # MinMaxScaler: each row (min->0, max->1) 
            # StandardScaler: each row (mean->0, SD->1)            
            # transpose, scale, transpose back because scales by columns
            print("scaler: %s " %str(scaler))
            self.Xscale = scaler().fit_transform(Xscale.transpose()).transpose()
            self.Yscale = scaler().fit_transform(Yscale.transpose()).transpose()
        
        self.create_model=create_model
        self.nrows, self.xcols = X.shape
        self.nrows, self.ycols = Y.shape
        
        self.coef_dict_param = coef_dict_param
        self.startindex = startindex

    def fit_predict(self, ntrain, npredict=1, verbose=False):
        """for backtest, train model using Y v. X 
        train on first ntrain rows. if ntrain=121, fit 0:120
        predict following npredict rows 
        if npredict=1, predict row 121
        if npredict=12, predict rows 121-132
        """
        
        # fit first ntrain rows
        X_fit = self.Xscale[:ntrain]  # e.g. 0:120
        Y_fit = self.Yscale[:ntrain]
        # predict npredict rows
        X_predict = self.Xscale[ntrain:ntrain+npredict] # 121-122
        X_predict = X_predict.reshape(npredict,self.xcols)
       
        # if no coef_dict select predictors into coef_dict
        if self.coef_dict_param == "timestep":
            msg = "Performing subset selection"
            coef_dict = subset_selection(X_fit, Y_fit, LassoLarsIC(criterion='aic'))
        # if coef_dict == "all" use all predictors for each response        
        elif self.coef_dict_param == 'all':
            msg = "Using all predictors"
            coef_dict = [range(self.xcols) for _ in range(self.ycols)]
        else: # should check valid dict
            msg = "Using coef_dict predictors"
            coef_dict = self.coef_dict_param
        if verbose: 
            print(msg)
#            print(coef_dict)

#        modelwrapper = PredictWrapper(self.create_model, coef_dict)
        modelwrapper = self.create_model()
        modelwrapper.fit(X_fit, Y_fit)
        return modelwrapper.predict(X_predict)

    # predict all months
    # initial train_months = 120 -> train first model on 120 rows
    # first prediction will be in P[120] (121st row)
    # step = 6 -> predict following 6 rows, then step forward 6 months at a time
    # initialize predictions matrix self.P
    
    # use either step or folds
    # step, do range(self.startindex, nrows, step)
    # folds, at each fold train 0:startfold, predict startfold+1:endfold
    # store only out-of-sample predictions in P, calc out-of-sample MSE
    
    # using a step > 1 or folds is quicker, for quicker xval, or to speed up by not estimating model at each timestep

    def gen_predictions(self,
                        step=1, 
                        splits=None,
                        verbose=False):

        self.P = np.zeros_like(self.Y)

        progress_i = 0
        self.nrows, self.ycols = Y.shape
        
        if splits:
            month_indexes = splits[:-1] # last index is nrows
        else:
            # create list of steps
            month_indexes = list(range(self.startindex, nrows, step))
        steps = [month_indexes[i+1]-month_indexes[i] for i in range(len(month_indexes)-1)]
        # last step -> end
        steps.append(self.nrows - month_indexes[-1])
        
        if verbose:
            print ("Steps: " + str(month_indexes))

        for month_index, forecast_rows in zip(month_indexes, steps):
            if verbose:
                print("Training on first %d rows (%d:%d), putting predictions in rows %s" % (month_index, 
                                                                                            0, month_index-1, 
                                                                                            str(range(month_index,month_index+forecast_rows))))
            predictions = self.fit_predict(month_index, forecast_rows, verbose=verbose)
            
            first_pred_row = month_index
            for row_index in range(forecast_rows):
                self.P[first_pred_row+row_index] = predictions[row_index]
            sys.stdout.write('.')
            progress_i += 1
            if progress_i % 80 == 0:
                print("")
                print("%s Still training step %d of %d" % (time.strftime("%H:%M:%S"), progress_i, len(month_indexes)))
            sys.stdout.flush()
        print("")
        
        msetemp = (self.P[self.startindex:]-self.Yscale[self.startindex:])**2
        #remove nans
        msetemp = msetemp[~np.isnan(msetemp)]
        mse = np.mean(msetemp)
        print("MSE across all predictions: %.4f" % mse)
        # force unpredicted ys to be nans, then remove nans
        vartemp = self.Yscale[self.startindex:] - self.P[self.startindex:] + self.P[self.startindex:]
        vartemp = vartemp[~np.isnan(vartemp)]
        y_variance = np.var(vartemp[self.startindex:])
        print("Variance: %.4f" % (y_variance))
        print("R-squared: %.4f" % (1- mse/y_variance))
        return mse

    def walkforward_xval (self, n_splits=5, verbose=False):
        """quick and dirty genreturns, with a step"""
        # generate k-folds
        kf = KFold(n_splits=n_splits)
        kf.get_n_splits(X)
        last_indexes = []
        for train_index, test_index in kf.split(X):
            # use test_index as last index to train
            last_index = test_index[-1] + 1
            last_indexes.append(last_index)
        print("%s Generate splits %s" % (time.strftime("%H:%M:%S"), str([i for i in last_indexes])))
        return self.gen_predictions(splits=last_indexes, verbose=verbose)
    
    def gen_returns(self, port_returns_func, verbose=False):

        self.R = np.zeros(self.P.shape[0])
        first_pred_month=self.startindex
        
        indcount = [0] * self.ycols
        longcount = [0] * self.ycols
        shortcount = [0] * self.ycols
        
        for month_index in range(first_pred_month, nrows-1):
            return_month = month_index + 1
            port_return, long_indexes, short_indexes = port_returns_func(self.P[month_index], 
                                                                         self.X[return_month])
            self.R[return_month] = port_return
            
            for i in long_indexes:
                indcount[i] += 1
                longcount[i] += 1
            for i in short_indexes:
                indcount[i] += 1
                shortcount[i] += 1
                
        for i in range(len(responses)):
            print("%s: long %d times, short %d times, total %d times" % (predictors[i], 
                                                                         longcount[i], 
                                                                         shortcount[i], 
                                                                         indcount[i]))
        return self.R

    def report_returns(self, start_date='01/01/1970', freq='M'):

        first_pred_month=self.startindex        
        results = self.R[first_pred_month:]
        index = pd.date_range(start_date,periods=results.shape[0], freq=freq)
        perfdata = pd.DataFrame(results,index=index,columns=['Returns'])
        perfdata['Equity'] = 100 * np.cumprod(1 + results / 100)
        self.cumulative_return = perfdata['Equity']

        stats = perfdata['Equity'].calc_stats()
        
        retframe = pd.DataFrame([stats.stats.loc['start'],
                                 stats.stats.loc['end'],
                                 stats.stats.loc['cagr'],
                                 stats.stats.loc['yearly_vol'],
                                 stats.stats.loc['yearly_sharpe'],
                                 stats.stats.loc['max_drawdown'],
                                 ffn.core.calc_sortino_ratio(perfdata.Returns, rf=0, nperiods=564, annualize=False),
                                ],
                                index = ['start',
                                         'end',
                                         'cagr',
                                         'yearly_vol',
                                         'yearly_sharpe',
                                         'max_drawdown',
                                         'sortino',
                                        ],
                                columns=['Value'])   
        return retframe

In [16]:
NUMSTOCKS = 6 # top quintile (and bottom)

def calc_returns(prediction_row, return_row, numstocks=NUMSTOCKS, verbose=False):

    # ensure nan sorts to top for shorts
    short_sort_array = [999999 if np.isnan(x) else x for x in prediction_row]
    # pick bottom numstocks
    select_array = np.argsort(short_sort_array)
    short_indexes = select_array[:numstocks]

    # ensure nan sorts to bottom for longs
    long_sort_array = [-999999 if np.isnan(x) else x for x in prediction_row]
    # pick top numstocks
    select_array = np.argsort(long_sort_array)
    long_indexes = select_array[-numstocks:]
    
    if verbose:
        print("Longs: %s" %(str([(i,prediction_row[i]) for i in long_indexes])))
        print("Shorts: %s" %(str([(i,prediction_row[i]) for i in short_indexes])))

    # compute equal weighted long/short return
    return np.mean(return_row[long_indexes])/2 - np.mean(return_row[short_indexes])/2, long_indexes, short_indexes


In [17]:
start_date_int = data.index[FIRST_TRAIN_MONTHS]
start_year, start_month = start_date_int // 100, start_date_int % 100
start_date_str = "%02d/%02d/%d" % (start_month, 1, start_year)
start_date_str

'01/01/1970'

In [18]:
def mychart(args, names=None):
    x_coords = np.linspace(1970, 2016, args[0].shape[0])
    
    plotdata = []
    for i in range(len(args)):
        tracelabel = "Trace %d" % i
        if names:
                tracelabel=names[i]
        plotdata.append(Scatter(x=x_coords,
                                y=args[i].reshape(-1),
                                mode = 'line',
                                name=tracelabel))    

    layout = Layout(
        autosize=False,
        width=600,
        height=480,
        yaxis=dict(
            type='log',
            autorange=True
        )
    )
    
    fig = Figure(data=plotdata, layout=layout)
    
    return iplot(fig)
    


In [19]:
backtestmodel = BacktestModel(X, Y, LinearRegression, coef_dict_param=coef_dict, startindex=FIRST_TRAIN_MONTHS)
backtestmodel.gen_predictions(verbose=False)
backtestmodel.gen_returns(calc_returns, verbose=False)
backtestmodel.report_returns(start_date=start_date_str, freq='M')


................................................................................
21:58:03 Still training step 80 of 576
................................................................................
21:58:04 Still training step 160 of 576
................................................................................
21:58:06 Still training step 240 of 576
................................................................................
21:58:07 Still training step 320 of 576
................................................................................
21:58:09 Still training step 400 of 576
................................................................................
21:58:10 Still training step 480 of 576
................................................................................
21:58:12 Still training step 560 of 576
................
MSE across all predictions: 39.7141
Variance: 38.8245
R-squared: -0.0229
Food: long 80 times, short 85 times, total 165 times
Beer: long 

Unnamed: 0,Value
start,1970-01-31 00:00:00
end,2017-12-31 00:00:00
cagr,0.0912365
yearly_vol,0.100378
yearly_sharpe,0.978319
max_drawdown,-0.115531
sortino,0.879155


In [20]:
perf_post_LASSO = backtestmodel.cumulative_return
mychart([perf_post_LASSO],["Post-LASSO (115 vars)"])


In [21]:
backtestmodel = BacktestModel(X, Y, LinearRegression, coef_dict_param="timestep", startindex=FIRST_TRAIN_MONTHS)
backtestmodel.walkforward_xval(n_splits=5, verbose=True)


21:58:13 Generate splits [140, 280, 419, 558, 697]
Steps: [140, 280, 419, 558]
Training on first 140 rows (0:139), putting predictions in rows [140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279]
Performing subset selection
.Training on first 280 rows (0:279), putting predictions in rows [280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 29

77.93955407019347

In [22]:
backtestmodel = BacktestModel(X, Y, LinearRegression, coef_dict_param='timestep', startindex=FIRST_TRAIN_MONTHS)
backtestmodel.gen_predictions(verbose=False)
backtestmodel.gen_returns(calc_returns, verbose=False)
backtestmodel.report_returns(start_date=start_date_str, freq='M')


................................................................................
21:59:35 Still training step 80 of 576
................................................................................
22:00:37 Still training step 160 of 576
................................................................................
22:01:44 Still training step 240 of 576
................................................................................
22:02:39 Still training step 320 of 576
................................................................................
22:03:31 Still training step 400 of 576
................................................................................
22:04:22 Still training step 480 of 576
................................................................................
22:05:14 Still training step 560 of 576
................
MSE across all predictions: 46.9536
Variance: 38.8245
R-squared: -0.2094
Food: long 81 times, short 74 times, total 155 times
Beer: long 

Unnamed: 0,Value
start,1970-01-31 00:00:00
end,2017-12-31 00:00:00
cagr,0.0379483
yearly_vol,0.0549741
yearly_sharpe,0.698329
max_drawdown,-0.152728
sortino,0.370122


In [23]:
perf_LASSO_each_timestep = backtestmodel.cumulative_return
mychart([perf_LASSO_each_timestep],["LASSO each timestep"])


In [24]:
backtestmodel = BacktestModel(X, Y, LinearRegression, coef_dict_param='all', startindex=FIRST_TRAIN_MONTHS)
backtestmodel.gen_predictions(verbose=False)
backtestmodel.gen_returns(calc_returns, verbose=False)
backtestmodel.report_returns(start_date=start_date_str, freq='M')


................................................................................
22:08:45 Still training step 80 of 576
................................................................................
22:08:50 Still training step 160 of 576
................................................................................
22:08:55 Still training step 240 of 576
................................................................................
22:09:01 Still training step 320 of 576
................................................................................
22:09:07 Still training step 400 of 576
................................................................................
22:09:13 Still training step 480 of 576
................................................................................
22:09:20 Still training step 560 of 576
................
MSE across all predictions: 85.8968
Variance: 38.8245
R-squared: -1.2124
Food: long 96 times, short 73 times, total 169 times
Beer: long 

Unnamed: 0,Value
start,1970-01-31 00:00:00
end,2017-12-31 00:00:00
cagr,0.0324746
yearly_vol,0.0714097
yearly_sharpe,0.504359
max_drawdown,-0.20242
sortino,0.285197


In [25]:
perf_all_preds = backtestmodel.cumulative_return
mychart([perf_all_preds],["All preds"])


In [26]:
mychart([perf_post_LASSO, perf_LASSO_each_timestep, perf_all_preds],["Post-LASSO", "LASSO Each Timestep", "OLS All Predictors"])

In [35]:
# use keras instead of sklearn MLPRegressor
# wrap keras model in a class 
# multioutput 30 predictions simultaneously to speed up 
# fit takes a list of response ys, predict returns a list of y_predict arrays
# no coef_dict
INPUT_DIM = X.shape[1]
print(INPUT_DIM)
OUTPUT_DIM = len(responses) # 30
BATCH_SIZE = 32
EPOCHS=500

class KerasBacktestModel(object):

    def __init__(self, 
                 n_hidden_layers = 2,
                 hidden_layer_size = 32,
                 reg_penalty = 0.0001,
                 verbose=True):
        main_input = Input(shape=(INPUT_DIM,),
                           dtype='float32', 
                           name='main_input')
        lastlayer=main_input
        
        for i in range(n_hidden_layers):
            if verbose:
                print("layer %d size %d, reg_penalty %.8f" % (i + 1, 
                                                              hidden_layer_size, 
                                                              reg_penalty, 
                                                             ))
            lastlayer = Dense(units = hidden_layer_size, 
                              activation = 'relu',
                              kernel_initializer = keras.initializers.glorot_uniform(),
                              kernel_regularizer=keras.regularizers.l1(reg_penalty),
                              name = "Dense%02d" % i)(lastlayer)
            
        outputs = []
        for i in range(OUTPUT_DIM):
            # OUTPUT_DIM outputs
            outputs.append(Dense(1,
                                 activation='linear',
                                 name='output%02d' % i)(lastlayer)
                          )
            
        self.model = Model(inputs=[main_input], outputs=outputs)
        if verbose:
            print(self.model.summary())
            
        self.model.compile(loss="mse", optimizer="rmsprop", loss_weights=[1.]*OUTPUT_DIM)
        
    def fit(self, X, Y, epochs=EPOCHS):
        # convert Y to list of ys
        Y_list = [Y[:,i] for i in range(OUTPUT_DIM)]
        return self.model.fit(X,
                              Y_list,
                              batch_size=BATCH_SIZE,
                              epochs=epochs,
                              verbose=False)
    
    def predict(self, X):
        y_list = self.model.predict(X)
        # convert list of ys to Y array
        npreds=len(y_list[0])
        Y_pred = [y.reshape(npreds) for y in y_list]
        Y_pred = np.array(Y_pred).transpose()    
        return Y_pred
    
    def save(self, modelname):
        self.model.save("%s.h5" % modelname)
        self.model.save_weights("%s_weights.h5" % modelname)


115


In [36]:
def create_keras_model(n_hidden_layers, layer_size, reg_penalty, verbose=False):
    def create_func():
        return KerasBacktestModel(n_hidden_layers = n_hidden_layers,
                                  hidden_layer_size = layer_size,
                                  reg_penalty = reg_penalty,
                                  verbose=verbose)
    return create_func


In [37]:
MODELPREFIX = "NN"

n_hiddens = [1, 2, 3]
layer_sizes = [1, 2, 4, 8]
reg_penalties = [0.0, 0.001, 0.01, 0.1, 1]
hyperparameter_combos = list(product(n_hiddens, layer_sizes, reg_penalties))

print("%s Running %d experiments" % (time.strftime("%H:%M:%S"), len(hyperparameter_combos)))
#experiments = {}
#sharpes = {}

for counter, param_list in enumerate(hyperparameter_combos):
    n_hidden_layers, layer_size, reg_penalty = param_list
    print("%s Running experiment %d of %d" % (time.strftime("%H:%M:%S"), counter+1, len(hyperparameter_combos)))
    key = (n_hidden_layers, layer_size, reg_penalty)
    print("%s n_hidden_layers = %d, hidden_layer_size = %d, reg_penalty = %.6f" % 
          (time.strftime("%H:%M:%S"), n_hidden_layers, layer_size, reg_penalty))
    
    experiment_model = BacktestModel(X, Y, create_keras_model(n_hidden_layers,layer_size,reg_penalty), coef_dict_param="all", startindex=FIRST_TRAIN_MONTHS)
    score = experiment_model.walkforward_xval(n_splits=5)
    experiments[key] = score

    experiment_model.gen_returns(calc_returns, verbose=False)
    retframe = experiment_model.report_returns(start_date=start_date_str, freq='M')
    sharpe = retframe.loc['yearly_sharpe']
    sharpes[key] = sharpe
    print("%s MSE: %f" % (str(key), score))
    print("%s Sharpe: %f" % (str(key), sharpe))



22:17:13 Running 60 experiments
22:17:13 Running experiment 1 of 60
22:17:13 n_hidden_layers = 1, hidden_layer_size = 1, reg_penalty = 0.000000
22:17:13 Generate splits [140, 280, 419, 558, 697]
....
MSE across all predictions: 51.4170
Variance: 38.8245
R-squared: -0.3243
Food: long 209 times, short 76 times, total 285 times
Beer: long 123 times, short 0 times, total 123 times
Smoke: long 400 times, short 50 times, total 450 times
Games: long 160 times, short 171 times, total 331 times
Books: long 91 times, short 96 times, total 187 times
Hshld: long 119 times, short 49 times, total 168 times
Clths: long 130 times, short 136 times, total 266 times
Hlth: long 140 times, short 130 times, total 270 times
Chems: long 0 times, short 223 times, total 223 times
Txtls: long 138 times, short 0 times, total 138 times
Cnstr: long 59 times, short 338 times, total 397 times
Steel: long 0 times, short 470 times, total 470 times
FabPr: long 0 times, short 233 times, total 233 times
ElcEq: long 40 tim

....
MSE across all predictions: 61.7618
Variance: 38.8245
R-squared: -0.5908
Food: long 199 times, short 108 times, total 307 times
Beer: long 42 times, short 10 times, total 52 times
Smoke: long 292 times, short 87 times, total 379 times
Games: long 363 times, short 170 times, total 533 times
Books: long 36 times, short 55 times, total 91 times
Hshld: long 115 times, short 34 times, total 149 times
Clths: long 122 times, short 33 times, total 155 times
Hlth: long 29 times, short 14 times, total 43 times
Chems: long 24 times, short 257 times, total 281 times
Txtls: long 132 times, short 21 times, total 153 times
Cnstr: long 31 times, short 205 times, total 236 times
Steel: long 34 times, short 361 times, total 395 times
FabPr: long 0 times, short 7 times, total 7 times
ElcEq: long 46 times, short 1 times, total 47 times
Autos: long 21 times, short 280 times, total 301 times
Carry: long 232 times, short 53 times, total 285 times
Mines: long 141 times, short 177 times, total 318 times
C

....
MSE across all predictions: 88.2916
Variance: 38.8245
R-squared: -1.2741
Food: long 177 times, short 119 times, total 296 times
Beer: long 216 times, short 126 times, total 342 times
Smoke: long 253 times, short 124 times, total 377 times
Games: long 222 times, short 154 times, total 376 times
Books: long 124 times, short 29 times, total 153 times
Hshld: long 50 times, short 47 times, total 97 times
Clths: long 75 times, short 28 times, total 103 times
Hlth: long 68 times, short 36 times, total 104 times
Chems: long 3 times, short 227 times, total 230 times
Txtls: long 156 times, short 21 times, total 177 times
Cnstr: long 71 times, short 177 times, total 248 times
Steel: long 11 times, short 280 times, total 291 times
FabPr: long 65 times, short 160 times, total 225 times
ElcEq: long 98 times, short 109 times, total 207 times
Autos: long 8 times, short 65 times, total 73 times
Carry: long 65 times, short 27 times, total 92 times
Mines: long 114 times, short 80 times, total 194 ti

....
MSE across all predictions: 88.1404
Variance: 38.8245
R-squared: -1.2702
Food: long 137 times, short 129 times, total 266 times
Beer: long 136 times, short 83 times, total 219 times
Smoke: long 210 times, short 165 times, total 375 times
Games: long 237 times, short 173 times, total 410 times
Books: long 101 times, short 69 times, total 170 times
Hshld: long 76 times, short 53 times, total 129 times
Clths: long 142 times, short 80 times, total 222 times
Hlth: long 74 times, short 81 times, total 155 times
Chems: long 31 times, short 199 times, total 230 times
Txtls: long 139 times, short 49 times, total 188 times
Cnstr: long 41 times, short 86 times, total 127 times
Steel: long 49 times, short 200 times, total 249 times
FabPr: long 44 times, short 100 times, total 144 times
ElcEq: long 50 times, short 39 times, total 89 times
Autos: long 102 times, short 130 times, total 232 times
Carry: long 132 times, short 76 times, total 208 times
Mines: long 147 times, short 124 times, total 

....
MSE across all predictions: 45.9937
Variance: 38.8245
R-squared: -0.1847
Food: long 280 times, short 19 times, total 299 times
Beer: long 158 times, short 3 times, total 161 times
Smoke: long 492 times, short 64 times, total 556 times
Games: long 241 times, short 0 times, total 241 times
Books: long 194 times, short 72 times, total 266 times
Hshld: long 59 times, short 0 times, total 59 times
Clths: long 46 times, short 0 times, total 46 times
Hlth: long 0 times, short 0 times, total 0 times
Chems: long 0 times, short 484 times, total 484 times
Txtls: long 151 times, short 0 times, total 151 times
Cnstr: long 0 times, short 82 times, total 82 times
Steel: long 0 times, short 490 times, total 490 times
FabPr: long 0 times, short 335 times, total 335 times
ElcEq: long 138 times, short 117 times, total 255 times
Autos: long 114 times, short 300 times, total 414 times
Carry: long 138 times, short 57 times, total 195 times
Mines: long 200 times, short 28 times, total 228 times
Coal: lo

....
MSE across all predictions: 61.9852
Variance: 38.8245
R-squared: -0.5965
Food: long 154 times, short 95 times, total 249 times
Beer: long 238 times, short 35 times, total 273 times
Smoke: long 299 times, short 37 times, total 336 times
Games: long 288 times, short 72 times, total 360 times
Books: long 47 times, short 29 times, total 76 times
Hshld: long 98 times, short 33 times, total 131 times
Clths: long 58 times, short 0 times, total 58 times
Hlth: long 120 times, short 133 times, total 253 times
Chems: long 18 times, short 416 times, total 434 times
Txtls: long 140 times, short 0 times, total 140 times
Cnstr: long 19 times, short 33 times, total 52 times
Steel: long 11 times, short 418 times, total 429 times
FabPr: long 1 times, short 102 times, total 103 times
ElcEq: long 52 times, short 97 times, total 149 times
Autos: long 21 times, short 234 times, total 255 times
Carry: long 108 times, short 8 times, total 116 times
Mines: long 106 times, short 93 times, total 199 times
C

....
MSE across all predictions: 71.0292
Variance: 38.8245
R-squared: -0.8295
Food: long 241 times, short 156 times, total 397 times
Beer: long 114 times, short 66 times, total 180 times
Smoke: long 281 times, short 122 times, total 403 times
Games: long 230 times, short 153 times, total 383 times
Books: long 75 times, short 88 times, total 163 times
Hshld: long 47 times, short 24 times, total 71 times
Clths: long 126 times, short 96 times, total 222 times
Hlth: long 146 times, short 112 times, total 258 times
Chems: long 0 times, short 107 times, total 107 times
Txtls: long 60 times, short 9 times, total 69 times
Cnstr: long 73 times, short 231 times, total 304 times
Steel: long 26 times, short 287 times, total 313 times
FabPr: long 2 times, short 196 times, total 198 times
ElcEq: long 106 times, short 69 times, total 175 times
Autos: long 14 times, short 156 times, total 170 times
Carry: long 97 times, short 99 times, total 196 times
Mines: long 80 times, short 49 times, total 129 ti

....
MSE across all predictions: 90.5094
Variance: 38.8245
R-squared: -1.3312
Food: long 190 times, short 104 times, total 294 times
Beer: long 159 times, short 75 times, total 234 times
Smoke: long 287 times, short 144 times, total 431 times
Games: long 212 times, short 210 times, total 422 times
Books: long 74 times, short 59 times, total 133 times
Hshld: long 41 times, short 21 times, total 62 times
Clths: long 122 times, short 53 times, total 175 times
Hlth: long 63 times, short 89 times, total 152 times
Chems: long 7 times, short 107 times, total 114 times
Txtls: long 70 times, short 34 times, total 104 times
Cnstr: long 56 times, short 159 times, total 215 times
Steel: long 56 times, short 266 times, total 322 times
FabPr: long 20 times, short 125 times, total 145 times
ElcEq: long 64 times, short 111 times, total 175 times
Autos: long 59 times, short 85 times, total 144 times
Carry: long 63 times, short 69 times, total 132 times
Mines: long 123 times, short 94 times, total 217 t

....
MSE across all predictions: 40.2240
Variance: 38.8245
R-squared: -0.0360
Food: long 277 times, short 35 times, total 312 times
Beer: long 158 times, short 14 times, total 172 times
Smoke: long 437 times, short 0 times, total 437 times
Games: long 290 times, short 0 times, total 290 times
Books: long 21 times, short 0 times, total 21 times
Hshld: long 140 times, short 14 times, total 154 times
Clths: long 117 times, short 0 times, total 117 times
Hlth: long 0 times, short 0 times, total 0 times
Chems: long 0 times, short 542 times, total 542 times
Txtls: long 139 times, short 0 times, total 139 times
Cnstr: long 7 times, short 119 times, total 126 times
Steel: long 0 times, short 542 times, total 542 times
FabPr: long 0 times, short 139 times, total 139 times
ElcEq: long 150 times, short 119 times, total 269 times
Autos: long 0 times, short 400 times, total 400 times
Carry: long 138 times, short 140 times, total 278 times
Mines: long 161 times, short 0 times, total 161 times
Coal: 

....
MSE across all predictions: 43.8314
Variance: 38.8245
R-squared: -0.1290
Food: long 244 times, short 40 times, total 284 times
Beer: long 127 times, short 0 times, total 127 times
Smoke: long 429 times, short 5 times, total 434 times
Games: long 54 times, short 18 times, total 72 times
Books: long 140 times, short 12 times, total 152 times
Hshld: long 140 times, short 0 times, total 140 times
Clths: long 7 times, short 0 times, total 7 times
Hlth: long 16 times, short 21 times, total 37 times
Chems: long 1 times, short 515 times, total 516 times
Txtls: long 150 times, short 0 times, total 150 times
Cnstr: long 20 times, short 121 times, total 141 times
Steel: long 0 times, short 523 times, total 523 times
FabPr: long 0 times, short 114 times, total 114 times
ElcEq: long 138 times, short 0 times, total 138 times
Autos: long 11 times, short 292 times, total 303 times
Carry: long 148 times, short 140 times, total 288 times
Mines: long 264 times, short 0 times, total 264 times
Coal: l

....
MSE across all predictions: 47.0219
Variance: 38.8245
R-squared: -0.2111
Food: long 160 times, short 90 times, total 250 times
Beer: long 148 times, short 10 times, total 158 times
Smoke: long 334 times, short 60 times, total 394 times
Games: long 426 times, short 93 times, total 519 times
Books: long 23 times, short 4 times, total 27 times
Hshld: long 84 times, short 31 times, total 115 times
Clths: long 115 times, short 3 times, total 118 times
Hlth: long 61 times, short 135 times, total 196 times
Chems: long 15 times, short 405 times, total 420 times
Txtls: long 192 times, short 70 times, total 262 times
Cnstr: long 3 times, short 215 times, total 218 times
Steel: long 0 times, short 392 times, total 392 times
FabPr: long 2 times, short 256 times, total 258 times
ElcEq: long 11 times, short 89 times, total 100 times
Autos: long 0 times, short 30 times, total 30 times
Carry: long 187 times, short 91 times, total 278 times
Mines: long 110 times, short 0 times, total 110 times
Coa

KeyboardInterrupt: 

In [None]:
3 layers, 4 units and up

In [38]:
# list and chart experiments
flatlist = [list(l[0]) + [l[1]] for l in experiments.items()]
 
lossframe = pd.DataFrame(flatlist, columns=["n_hidden_layers", "layer_size", "reg_penalty", "loss"])
# one row didn't converge properly - messes up plotly scales
#for i in list(lossframe.loc[lossframe['loss']> 1000].index):
#    lossframe.at[i, 'loss'] = 100
lossframe.sort_values(['loss'])

Unnamed: 0,n_hidden_layers,layer_size,reg_penalty,loss
29,3,1,1.0,39.20741
31,3,1,0.0,40.224018
26,3,2,0.0,43.831446
0,3,1,0.1,45.082445
36,1,1,1.0,45.777254
25,3,1,0.001,45.822488
41,2,1,0.0,45.993655
35,3,1,0.01,46.095104
33,3,4,0.0,47.02191
1,2,1,1.0,48.388703


In [None]:
# we can pick lowest loss , but first we look at patterns by hyperparameter
pd.DataFrame(lossframe.groupby(['n_hidden_layers'])['loss'].mean())


In [None]:
pd.DataFrame(lossframe.groupby(['layer_size'])['loss'].mean())


In [None]:
pd.DataFrame(lossframe.groupby(['reg_penalty'])['loss'].mean())


In [None]:
def plot_matrix(lossframe, x_labels, y_labels, x_suffix="", y_suffix=""):

    pivot = lossframe.pivot_table(index=[y_labels], columns=[x_labels], values=['loss'])
#    print(pivot)
    # specify labels as strings, to force plotly to use a discrete axis
#    print(pivot.columns.levels[1]).values
#    print(lossframe[x_labels].dtype)
    
    if lossframe[x_labels].dtype == np.float64 or lossframe[x_labels].dtype == np.float32:
        xaxis = ["%f %s" % (i, x_suffix) for i in pivot.columns.levels[1].values]
    else:
        xaxis = ["%d %s" % (i, x_suffix) for i in pivot.columns.levels[1].values]
    if lossframe[y_labels].dtype == np.float64 or lossframe[y_labels].dtype == np.float32:
        yaxis = ["%f %s" % (i, y_suffix) for i in pivot.index.values]
    else:
        yaxis = ["%d %s" % (i, y_suffix) for i in pivot.index.values]
        
#    print(xaxis, yaxis)
    """plot a heat map of a matrix"""
    chart_width=640
    chart_height=480
    
    layout = Layout(
        title="%s v. %s" % (x_labels, y_labels),
        height=chart_height,
        width=chart_width,     
        margin=dict(
            l=150,
            r=30,
            b=120,
            t=100,
        ),
        xaxis=dict(
            title=x_labels,
            tickfont=dict(
                family='Arial, sans-serif',
                size=10,
                color='black'
            ),
        ),
        yaxis=dict(
            title=y_labels,
            tickfont=dict(
                family='Arial, sans-serif',
                size=10,
                color='black'
            ),
        ),
    )
    
    data = [Heatmap(z=pivot.values,
                    x=xaxis,
                    y=yaxis,
                    colorscale=[[0, 'rgb(0,0,255)', [1, 'rgb(255,0,0)']]],
                   )
           ]

    fig = Figure(data=data, layout=layout)
    return iplot(fig, link_text="")

plot_matrix(lossframe, "n_hidden_layers", "layer_size", x_suffix=" layers", y_suffix=" units")




In [None]:
plot_matrix(lossframe, "n_hidden_layers", "reg_penalty", x_suffix=" layers", y_suffix=" p")


In [None]:
plot_matrix(lossframe, "reg_penalty", "layer_size", x_suffix=" p", y_suffix="units")


In [None]:
# 1-unit layers is not really a NN but anyway let's see how it does
experiment_model = BacktestModel(X, Y, create_keras_model(3,1,1.0), coef_dict_param="all", startindex=FIRST_TRAIN_MONTHS)
experiment_model.gen_predictions(verbose=False)
experiment_model.gen_returns(calc_returns, verbose=False)
experiment_model.report_returns(start_date=start_date_str, freq='M')


..........................................