Replicate [Dynamic Return Dependencies Across Industries: A Machine Learning Approach](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3120110&download=yes) by David Rapach, Jack Strauss, Jun Tu and Guofu Zhou.

1) Use Keras NN classification instead of linear regression

2) Add additional variables, 3 and 12-month MA, interest rate change, yield curve, Mkt-RF. The hope is with  cross-validation and regularization we can do that without overfitting.


In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import pandas_datareader.data as datareader
import time 
import datetime
import copy
import random
from itertools import product

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #Hide messy TensorFlow warnings
warnings.filterwarnings("ignore") #Hide messy numpy warnings

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
from sklearn.linear_model import LinearRegression, Lasso, lasso_path, lars_path, LassoLarsIC
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler


import tensorflow as tf
tf.set_random_seed(1764)

import keras
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Input
from keras.models import Model

from keras.layers.recurrent import LSTM, GRU
from keras.regularizers import l1
from keras.models import Sequential
from keras.models import load_model

import ffn
%matplotlib inline

import plotly as py
# print (py.__version__) # requires version >= 1.9.0
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff

init_notebook_mode(connected=True)

random.seed(1764)
np.random.seed(1764)


Using TensorFlow backend.


In [2]:
print("Loading data...")
data = pd.read_csv("30_Industry_Portfolios.csv")
data = data.set_index('yyyymm')
industries = list(data.columns)
# map industry names to col nums
ind_reverse_dict = dict([(industries[i], i) for i in range(len(industries))])

rfdata = pd.read_csv("F-F_Research_Data_Factors.csv")
rfdata = rfdata.set_index('yyyymm')
data['rf'] = rfdata['RF']

# subtract risk-free rate
# create a response variable led by 1 period to predict
for ind in industries:
    data[ind] = data[ind] - data['rf']

    
# add rates data from FRED
start_date = datetime.datetime(1926, 9, 1)
end_date = datetime.datetime(2017, 12, 1)
TB3MS = datareader.DataReader("TB3MS", "fred", start_date, end_date)
TB3MS['yyyymm'] = TB3MS.index.strftime('%Y%m')
TB3MS['yyyymm'] = [int(datestr) for datestr in TB3MS['yyyymm']]
TB3MS=TB3MS.set_index(['yyyymm'])
data['3month']=TB3MS['TB3MS']

GS10 =  datareader.DataReader("GS10", "fred", start_date, end_date)
GS10['yyyymm'] = GS10.index.strftime('%Y%m')
GS10['yyyymm'] = [int(datestr) for datestr in GS10['yyyymm']]
GS10=GS10.set_index(['yyyymm'])
data['10year']=GS10['GS10']

data['curve'] = data['10year'] - data['3month']
data['10year'] = data['10year'].diff() # first difference 10-year yield
data['3month'] = data['3month'].diff() # first difference 3-month
data['month'] = (data.index  % 100)/12.0 # for possible seasonality
data['Mkt-RF'] = rfdata['Mkt-RF']

for ind in industries + ['3month', '10year', 'curve', 'Mkt-RF',]:
    data[ind+".3m"] = pd.rolling_mean(data[ind],3)
    
#for ind in industries + ['3month', '10year', 'curve', 'Mkt-RF',]:
#    data[ind+".6m"] = pd.rolling_mean(data[ind],6)

for ind in industries + ['3month', '10year', 'curve', 'Mkt-RF',]:
    data[ind+".12m"] = pd.rolling_mean(data[ind],12)

for ind in industries:
    data[ind+".lead"] = data[ind].shift(-1)

data = data.loc[data.index[data.index > 195911]]
data = data.drop(columns=['rf'])    
data = data.dropna(axis=0, how='any')

nresponses = len(industries)
npredictors = data.shape[1]-nresponses

predictors = list(data.columns[:npredictors])
predictor_reverse_dict = dict([(predictors[i], i) for i in range(len(predictors))])

responses = list(data.columns[-nresponses:])
response_reverse_dict = dict([(responses[i], i) for i in range(len(responses))])

print(data.shape)
print(list(data.columns))
data[['3month', '10year', 'curve', 'month', 'Mkt-RF',]]

Loading data...
(697, 133)
['Food', 'Beer', 'Smoke', 'Games', 'Books', 'Hshld', 'Clths', 'Hlth', 'Chems', 'Txtls', 'Cnstr', 'Steel', 'FabPr', 'ElcEq', 'Autos', 'Carry', 'Mines', 'Coal', 'Oil', 'Util', 'Telcm', 'Servs', 'BusEq', 'Paper', 'Trans', 'Whlsl', 'Rtail', 'Meals', 'Fin', 'Other', '3month', '10year', 'curve', 'month', 'Mkt-RF', 'Food.3m', 'Beer.3m', 'Smoke.3m', 'Games.3m', 'Books.3m', 'Hshld.3m', 'Clths.3m', 'Hlth.3m', 'Chems.3m', 'Txtls.3m', 'Cnstr.3m', 'Steel.3m', 'FabPr.3m', 'ElcEq.3m', 'Autos.3m', 'Carry.3m', 'Mines.3m', 'Coal.3m', 'Oil.3m', 'Util.3m', 'Telcm.3m', 'Servs.3m', 'BusEq.3m', 'Paper.3m', 'Trans.3m', 'Whlsl.3m', 'Rtail.3m', 'Meals.3m', 'Fin.3m', 'Other.3m', '3month.3m', '10year.3m', 'curve.3m', 'Mkt-RF.3m', 'Food.12m', 'Beer.12m', 'Smoke.12m', 'Games.12m', 'Books.12m', 'Hshld.12m', 'Clths.12m', 'Hlth.12m', 'Chems.12m', 'Txtls.12m', 'Cnstr.12m', 'Steel.12m', 'FabPr.12m', 'ElcEq.12m', 'Autos.12m', 'Carry.12m', 'Mines.12m', 'Coal.12m', 'Oil.12m', 'Util.12m', 'Telcm.1

Unnamed: 0_level_0,3month,10year,curve,month,Mkt-RF
yyyymm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
195912,0.34,0.16,0.20,1.000000,2.45
196001,-0.14,0.03,0.37,0.083333,-6.98
196002,-0.39,-0.23,0.53,0.166667,1.17
196003,-0.65,-0.24,0.94,0.250000,-1.63
196004,-0.08,0.03,1.05,0.333333,-1.71
196005,0.06,0.07,1.06,0.416667,3.12
196006,-0.83,-0.20,1.69,0.500000,2.08
196007,-0.16,-0.25,1.60,0.583333,-2.37
196008,0.00,-0.10,1.50,0.666667,3.01
196009,0.18,0.00,1.32,0.750000,-5.99


In [3]:
#data = data.loc[data.index[data.index < 201701]]
data = data.loc[data.index[data.index > 195911]]
data


Unnamed: 0_level_0,Food,Beer,Smoke,Games,Books,Hshld,Clths,Hlth,Chems,Txtls,...,Telcm.lead,Servs.lead,BusEq.lead,Paper.lead,Trans.lead,Whlsl.lead,Rtail.lead,Meals.lead,Fin.lead,Other.lead
yyyymm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195912,2.01,0.35,-3.02,1.64,7.29,0.67,1.87,-1.97,3.08,0.74,...,0.62,-6.18,-7.93,-9.41,-4.31,-5.33,-6.09,-10.08,-4.68,-3.98
196001,-4.49,-5.71,-2.05,1.21,-5.47,-7.84,-8.53,-6.68,-10.03,-4.77,...,8.07,9.13,5.09,3.00,-0.94,1.42,4.00,1.81,-0.98,6.32
196002,3.35,-2.14,2.27,4.23,2.39,9.31,1.44,-0.02,-0.74,0.32,...,-0.21,-0.31,3.34,-2.43,-4.99,-1.37,-0.13,-3.88,0.05,-2.43
196003,-1.67,-2.94,-0.18,-0.65,2.18,-0.56,-2.59,1.26,-2.75,-6.79,...,-1.24,7.14,1.77,0.41,-2.13,0.45,-0.53,8.86,-0.64,0.55
196004,1.17,-2.16,1.35,6.46,-1.17,-1.27,0.21,1.49,-5.53,-1.10,...,3.05,-1.75,11.90,2.85,0.90,1.65,3.11,0.80,-0.45,1.02
196005,8.20,-0.52,2.44,7.28,11.67,7.74,1.74,13.50,3.40,2.10,...,-0.58,-8.07,2.39,3.50,2.17,5.96,3.41,1.03,3.72,6.41
196006,5.39,0.47,4.73,2.24,0.02,6.38,-1.59,-0.40,0.45,4.04,...,-0.03,2.84,-2.02,-4.10,-3.11,-6.16,-2.99,-1.25,0.09,-5.95
196007,-2.11,-0.79,4.60,-4.72,0.23,-0.60,-1.10,-3.99,-6.80,-3.14,...,6.94,5.69,2.71,1.18,1.98,4.51,2.85,2.05,3.47,3.48
196008,4.57,3.24,5.20,7.16,3.63,5.09,3.34,2.29,1.17,-0.84,...,-6.07,-3.53,-7.61,-7.37,-7.07,-8.44,-8.57,-1.90,-5.78,-4.21
196009,-3.88,-5.00,-2.09,-2.33,-6.20,-9.18,-4.23,-8.87,-6.70,-5.25,...,-0.08,4.62,-3.40,-1.85,-1.02,-4.22,0.31,-4.54,-0.40,0.38


In [4]:
desc = data.describe()
desc
# min, max line up with Table 1

Unnamed: 0,Food,Beer,Smoke,Games,Books,Hshld,Clths,Hlth,Chems,Txtls,...,Telcm.lead,Servs.lead,BusEq.lead,Paper.lead,Trans.lead,Whlsl.lead,Rtail.lead,Meals.lead,Fin.lead,Other.lead
count,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,...,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0
mean,0.688666,0.72703,0.985079,0.732095,0.532253,0.564333,0.690387,0.665825,0.552367,0.687145,...,0.515968,0.729928,0.62297,0.534806,0.60109,0.631076,0.698235,0.728766,0.637547,0.396628
std,4.30866,5.058992,6.032324,7.12817,5.780362,4.728,6.355251,4.897557,5.482363,6.970961,...,4.607931,6.486956,6.698787,5.021876,5.707154,5.57104,5.334178,6.065564,5.381389,5.771655
min,-18.15,-20.19,-25.32,-33.4,-26.56,-22.24,-31.5,-21.06,-28.6,-33.11,...,-16.44,-28.67,-32.07,-27.74,-28.5,-29.25,-29.74,-31.89,-22.53,-28.09
25%,-1.63,-2.08,-2.74,-3.39,-2.6,-2.03,-2.8,-2.23,-2.75,-3.17,...,-2.11,-3.05,-3.22,-2.4,-2.78,-2.56,-2.38,-2.84,-2.4,-2.93
50%,0.74,0.75,1.27,0.94,0.51,0.75,0.7,0.76,0.72,0.64,...,0.59,1.01,0.67,0.71,0.9,0.94,0.54,1.08,0.87,0.54
75%,3.07,3.69,4.66,5.26,3.64,3.54,4.31,3.55,3.76,4.48,...,3.36,4.26,4.63,3.46,4.04,3.88,3.98,4.3,4.0,4.2
max,19.89,25.51,32.38,34.52,33.13,18.22,31.79,29.01,21.68,59.03,...,21.22,23.38,24.66,21.0,18.5,17.53,26.49,27.38,20.59,19.96


In [5]:
# Extract X and Y as raw arrays
X = data.values[:-1,:npredictors]
Y = data.values[:-1,-nresponses:]
nrows = X.shape[0]
X.shape

(696, 103)

In [6]:
# convert Ys to 3 classes
# long = 1
# short = 2
# neither = 0
ISLONG=1
ISSHORT=2
ISFLAT=0

Y_sortindex = np.argsort(Y)
print(Y[0])
# sorted position
print(Y_sortindex[0]) 
# sorted array
print(Y[0,Y_sortindex[0]])
# initialize class to 0
Y_class=np.zeros_like(Y)
for row in range(Y_class.shape[0]):
    # if index in last 6, long
    longlist = Y_sortindex[row,-6:]
    Y_class[row, longlist]=ISLONG
    # if index is in first 6, short
    shortlist = Y_sortindex[row,:6]
    Y_class[row, shortlist]=ISSHORT
    
print(Y_class[0])
print([Y[0,i] for i in range(30) if Y_class[0,i]==1])
print([Y[0,i] for i in range(30) if Y_class[0,i]==-1])
print(Y_class.shape)
print(X.shape)


[ -4.49  -5.71  -2.05   1.21  -5.47  -7.84  -8.53  -6.68 -10.03  -4.77
  -6.67  -9.38  -4.42 -12.3  -11.71  -5.03  -3.81  -7.91  -7.82  -2.4
   0.62  -6.18  -7.93  -9.41  -4.31  -5.33  -6.09 -10.08  -4.68  -3.98]
[13 14 27  8 23 11  6 22 17  5 18  7 10 21 26  1  4 25 15  9 28  0 12 24
 29 16 19  2 20  3]
[-12.3  -11.71 -10.08 -10.03  -9.41  -9.38  -8.53  -7.93  -7.91  -7.84
  -7.82  -6.68  -6.67  -6.18  -6.09  -5.71  -5.47  -5.33  -5.03  -4.77
  -4.68  -4.49  -4.42  -4.31  -3.98  -3.81  -2.4   -2.05   0.62   1.21]
[0. 0. 1. 1. 0. 0. 0. 0. 2. 0. 0. 2. 0. 2. 2. 0. 1. 0. 0. 1. 1. 0. 0. 2.
 0. 0. 0. 2. 0. 1.]
[-2.05, 1.21, -3.81, -2.4, 0.6199999999999999, -3.98]
[]
(696, 30)
(696, 103)


In [7]:
min(Y[:,0])

-18.150000000000002

In [54]:
# try keras classifier
# wrap their model in a class 
# use multioutput to speed  up 
# fit takes a list of response ys, predict returns a list of y_predict arrays
INPUT_DIM = X.shape[1]
print(INPUT_DIM)
NCLASSES=3
OUTPUT_DIM = len(responses) # 30
BATCH_SIZE = 32
EPOCHS=200

class KerasBacktestModel(object):

    def __init__(self, 
                 n_hidden_layers = 2,
                 hidden_layer_size = 32,
                 reg_penalty = 0.0001,
                 dropout = 0.333,
                 verbose=True):
        """initialize keras model"""
        
        main_input = Input(shape=(INPUT_DIM,),
                           dtype='float32', 
                           name='main_input')
        lastlayer=main_input
        
        for i in range(n_hidden_layers):
            if verbose:
                print("layer %d size %d, reg_penalty %.8f, dropout %.3f" % (i + 1, 
                                                                            hidden_layer_size, 
                                                                            reg_penalty, 
                                                                            dropout))
            lastlayer = Dense(units = hidden_layer_size, 
                              activation = 'relu',
                              kernel_initializer = keras.initializers.glorot_uniform(),
                              kernel_regularizer=keras.regularizers.l1(reg_penalty),
                              name = "Dense%02d" % i)(lastlayer)
            
            if dropout:
                lastlayer = Dropout(dropout, name = "Dropout%02d" % i)(lastlayer)
                
        outputs = []
        for i in range(OUTPUT_DIM):
            # OUTPUT_DIM outputs
            outputs.append(Dense(NCLASSES, 
                                 activation='softmax',
                                 name = "Output%02d" % (i+1))(lastlayer))
            
        self.model = Model(inputs=[main_input], outputs=outputs)
        if verbose:
            print(self.model.summary())
            
        self.model.compile(loss="categorical_crossentropy", 
                           optimizer="rmsprop", 
                           metrics=['accuracy'])
    
    def fit(self, X, Y, epochs=EPOCHS):
        """fit keras model for epochs"""
        # convert Y to list of ys
        nrows, npreds = Y.shape
        Y_list = [keras.utils.to_categorical(Y[:,i], num_classes=NCLASSES) for i in range(OUTPUT_DIM)]

        fit = self.model.fit(X,
                             Y_list,
                             batch_size=BATCH_SIZE,
                             epochs=epochs,
                             verbose=False)
        #evaluate returns a list of overall loss, loss by column and then accuracy by column
        evaluate = self.model.evaluate(X, Y_list, batch_size=BATCH_SIZE, verbose=1)
        self.accuracy = np.mean(np.array(evaluate[-npreds:]))

        return fit

    def predict(self, X):
        """predict classes using X"""
        # convert list of ys to Y array
        nrows, npreds = X.shape
        y_list = self.model.predict(X)
        longprobs = np.zeros([nrows, OUTPUT_DIM])
        shortprobs = np.zeros([nrows, OUTPUT_DIM])
        flatprobs = np.zeros([nrows, OUTPUT_DIM])

        evaluate_array = self.model.evaluate(X, y_list, batch_size=BATCH_SIZE, verbose=1)
        self.accuracy = np.mean(np.array(evaluate_array[-npreds:]))
        
        for response in range(OUTPUT_DIM):
            for row in range(nrows):
                longprobs[row, response] = y_list[response][row, ISLONG]
                shortprobs[row, response] = y_list[response][row, ISSHORT]
                flatprobs[row, response] = y_list[response][row, ISFLAT]
                
        return longprobs, shortprobs, flatprobs
    
    def evaluate(self, X, Y):
        """predict classes using X"""
        # convert list of ys to Y array
        nrows, npreds = Y.shape
        Y_list = [keras.utils.to_categorical(Y[:,i], num_classes=NCLASSES) for i in range(OUTPUT_DIM)]
        evaluate_array = self.model.evaluate(X, Y_list, batch_size=BATCH_SIZE, verbose=False)
        self.accuracy = np.mean(np.array(evaluate_array[-npreds:]))
        
        return self.accuracy
    
    
    def save(self, modelname):
        self.model.save("%s.h5" % modelname)
        self.model.save_weights("%s_weights.h5" % modelname)


103


In [45]:
def create_keras_model(n_hidden_layers, layer_size, reg_penalty, dropout, verbose=False):
    def create_func():
        return KerasBacktestModel(n_hidden_layers = n_hidden_layers,
                                  hidden_layer_size = layer_size,
                                  reg_penalty = reg_penalty,
                                  dropout = dropout,
                                  verbose=verbose)
    return create_func


In [46]:
def fit_predict_keras(X, Y, model, epochs=EPOCHS, npredict=1, verbose=False):
    """simpler fit_predict, no coef_dict, fits all at once, specifies epochs 
    for backtest, train model using Y_list v. X using n-npredict rows
    generate npredict prediction Y_list using last npredict rows of X
    if npredict=1, fit using n-1 rows, return prediction using X for final month
    if npredict=26, fit using n-26 rows, return prediction using X for final 26 months"""
    
    nrows = X.shape[0]
    if verbose:
        print("Fit on %d rows 0 to %d" % (nrows-npredict, nrows-npredict-1))
        print("Predict on %d rows %d to %d" % (npredict, nrows-npredict, nrows-1))
        
    # keep last rows to predict against
    X_predict = X[-npredict:]
    X_predict = X_predict.reshape(npredict,X.shape[1])
    # fit on remaining rows
    X_fit = X[:-npredict]
    Y_fit = Y[:-npredict]
    print(X_fit.shape)
    print(Y_fit.shape)
    fit = model.fit(
        X_fit,
        Y_fit,
        epochs=epochs
    )
    
    return model.predict(X_predict)

print("%s Start fit" % (time.strftime("%H:%M:%S")))
keras_create_model=create_keras_model(1, 4, 0.01, 0.25, verbose=True)
keras_model=keras_create_model()
longprobs, shortprobs, flatprobs = fit_predict_keras(X, Y_class, keras_model, npredict=3, epochs=200)
print("%s End fit" % (time.strftime("%H:%M:%S")))

print(longprobs)
print("")
print(shortprobs)
print("")
print(flatprobs)
print("")
print(keras_model.accuracy)

14:55:43 Start fit
layer 1 size 4, reg_penalty 0.01000000, dropout 0.250
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 103)          0                                            
__________________________________________________________________________________________________
Dense00 (Dense)                 (None, 4)            416         main_input[0][0]                 
__________________________________________________________________________________________________
Dropout00 (Dropout)             (None, 4)            0           Dense00[0][0]                    
__________________________________________________________________________________________________
Output01 (Dense)                (None, 3)            15          Dropout00[0][0]                  
____________________________________

In [47]:
keras_model.evaluate(X,Y_class)



0.6063218391261338

In [57]:
# do a bunch of experiments

EPOCHS=500
def walkforward_xval_keras (X, Y, create_model, n_splits=5, epochs=EPOCHS):
    ### no coef_dict, fit all at once

    # generate k-folds
    kf = KFold(n_splits=n_splits)
    kf.get_n_splits(X)
    last_indexes = []
    for train_index, test_index in kf.split(X):
        # use test_index as last index to train
        last_index = test_index[-1] + 1
        last_indexes.append(last_index)
    print("%s Generate splits %s" % (time.strftime("%H:%M:%S"), str([i for i in last_indexes])))

    print("%s Starting training" % (time.strftime("%H:%M:%S")))
    model = create_model()
    
    avg_bests = []
    for i in range(1, n_splits-1):

        models = []
        losses = []
        scores = []
        count = 0        
        # skip kfold 0 so you start with train 2x size of eval set
        last_train_index = last_indexes[i]
        last_xval_index = last_indexes[i+1]

        # set up train, xval
        # train from beginning to last_train_index        
        print("%s Training indexes 0 to %d" % (time.strftime("%H:%M:%S"), last_train_index-1))
        X_fit = X[:last_train_index]
        Y_fit = Y[:last_train_index]
        
        # xval from last_train_index to last_xval_index
        print("%s Cross-validating indexes %d to %d" % (time.strftime("%H:%M:%S"), last_train_index, last_xval_index -1 ))
        X_xval = X[last_train_index:last_xval_index]
        Y_xval = Y[last_train_index:last_xval_index]

        fit = model.fit(X_fit, Y_fit, epochs=epochs)
        xval_score = model.evaluate(X_xval,Y_xval)        

        print ("%s Xval MSE %f" % (time.strftime("%H:%M:%S"), xval_score))
        avg_bests.append(xval_score)
    
    # mean over folds
    avg_loss = np.mean(np.array(avg_bests))
    print ("%s Avg Xval loss %f" % (time.strftime("%H:%M:%S"), avg_loss))
    print("--------------------------------------------------------------------------------")
    return (avg_loss, model)


In [58]:

walkforward_xval_keras(X, Y_class, 
                       create_keras_model(n_hidden_layers=3,
                                          layer_size=2,
                                          reg_penalty=0.001,
                                          dropout=0.25),
                      epochs=5)

18:28:30 Generate splits [140, 279, 418, 557, 696]
18:28:30 Starting training
18:28:36 Training indexes 0 to 278
18:28:36 Cross-validating indexes 279 to 417
18:30:51 Xval MSE 0.576499
18:30:51 Training indexes 0 to 417
18:30:51 Cross-validating indexes 418 to 556
18:30:57 Xval MSE 0.588489
18:30:57 Training indexes 0 to 556
18:30:57 Cross-validating indexes 557 to 695
18:31:05 Xval MSE 0.599041
18:31:05 Avg Xval loss 0.588010
--------------------------------------------------------------------------------


(0.5880095929884129, <__main__.KerasBacktestModel at 0x7fd3a369bb38>)

In [59]:
# run an experiment with walk-forward cross-validation

EPOCHS = 50

def run_experiment(X, Y, 
                   n_hidden_layers, 
                   layer_size, 
                   reg_penalty,
                   dropout,
                   minmaxscale=False, 
                   standardscale=False,
                   epochs=EPOCHS):
    
    Xscale = X.copy()
    Yscale = Y.copy()
    
    if minmaxscale:
        # minmaxscale each row (min->0, max->1) - transpose, scale, transpose back because scales by columns
        Xscale = MinMaxScaler().fit_transform(Xscale.transpose()).transpose()
        Yscale = MinMaxScaler().fit_transform(Yscale.transpose()).transpose()
        print("using MinMaxScaler")
    elif standardscale:
        # standardize each row (mean->0, SD->1)- transpose, scale, transpose back because scales by columns
        Xscale = StandardScaler().fit_transform(Xscale.transpose()).transpose()
        Yscale = StandardScaler().fit_transform(Yscale.transpose()).transpose()
        print("using StandardScaler")

    return walkforward_xval_keras(Xscale, Yscale,
                                  create_keras_model(n_hidden_layers=n_hidden_layers,
                                                     layer_size=layer_size,
                                                     reg_penalty=reg_penalty,
                                                     dropout=dropout),
                                  epochs=epochs)


In [60]:
run_experiment(X, Y_class, 3, 4, .001, .25, epochs=40)

18:31:05 Generate splits [140, 279, 418, 557, 696]
18:31:05 Starting training
18:31:10 Training indexes 0 to 278
18:31:10 Cross-validating indexes 279 to 417
18:33:57 Xval MSE 0.594724
18:33:57 Training indexes 0 to 417
18:33:57 Cross-validating indexes 418 to 556
18:34:38 Xval MSE 0.598321
18:34:38 Training indexes 0 to 556
18:34:38 Cross-validating indexes 557 to 695
18:35:27 Xval MSE 0.601199
18:35:27 Avg Xval loss 0.598082
--------------------------------------------------------------------------------


(0.5980815354153979, <__main__.KerasBacktestModel at 0x7fd36de630b8>)

In [61]:
# run a lot of experiments in big xval loop to pick best hyperparameters

MODELPREFIX = "FFNN"
EPOCHS=200

n_hiddens = [1, 2, 3]
layer_sizes = [2, 4, 8]
reg_penalties = [0.0, 0.001, 0.1]
dropouts = [0.25]

hyperparameter_combos = list(product(n_hiddens, layer_sizes, reg_penalties, dropouts))

print("%s Running %d experiments" % (time.strftime("%H:%M:%S"), len(hyperparameter_combos)))

experiments = {}

for counter, param_list in enumerate(hyperparameter_combos):
    n_hidden_layers, layer_size, reg_penalty, dropout = param_list
    print("%s Running experiment %d of %d" % (time.strftime("%H:%M:%S"), counter+1, len(hyperparameter_combos)))
    key = (n_hidden_layers, layer_size, reg_penalty, dropout)
    print("%s n_hidden_layers %d, layer_size %d, reg_penalty %.6f, dropout %.3f" % (time.strftime("%H:%M:%S"),  n_hidden_layers, layer_size, reg_penalty, dropout))
    score, model = run_experiment(X, Y_class,
                                  n_hidden_layers = n_hidden_layers,
                                  layer_size = layer_size,
                                  reg_penalty = reg_penalty,
                                  dropout = dropout,
                                  epochs=EPOCHS
                                 )
    experiments[key] = score 
    modelname = "%s_%.6f_%d_%d_%.6f_%.3f" % (MODELPREFIX, score, n_hidden_layers, layer_size, reg_penalty, dropout)
    print("%s Saving %s.h5" % (time.strftime("%H:%M:%S"), modelname))
    model.save(modelname)


18:35:27 Running 27 experiments
18:35:27 Running experiment 1 of 27
18:35:27 Generate splits [140, 279, 418, 557, 696]
18:35:27 Starting training
18:35:32 Training indexes 0 to 278
18:35:32 Cross-validating indexes 279 to 417
18:40:08 Xval MSE 0.599520
18:40:08 Training indexes 0 to 417
18:40:08 Cross-validating indexes 418 to 556
18:43:29 Xval MSE 0.580576
18:43:29 Training indexes 0 to 556
18:43:29 Cross-validating indexes 557 to 695
18:47:44 Xval MSE 0.600000
18:47:44 Avg Xval loss 0.593365
--------------------------------------------------------------------------------
18:47:44 Saving FFNN_0.593365_1_2_0.000000_0.250.h5
18:50:56 Running experiment 2 of 27
18:50:56 Generate splits [140, 279, 418, 557, 696]
18:50:56 Starting training
18:51:01 Training indexes 0 to 278
18:51:01 Cross-validating indexes 279 to 417
18:55:37 Xval MSE 0.599760
18:55:37 Training indexes 0 to 417
18:55:37 Cross-validating indexes 418 to 556
18:58:49 Xval MSE 0.582494
18:58:49 Training indexes 0 to 556
18:58

21:34:29 Xval MSE 0.601679
21:34:29 Avg Xval loss 0.596723
--------------------------------------------------------------------------------
21:34:29 Saving FFNN_0.596723_2_2_0.001000_0.250.h5
21:39:02 Running experiment 12 of 27
21:39:02 Generate splits [140, 279, 418, 557, 696]
21:39:02 Starting training
21:39:07 Training indexes 0 to 278
21:39:07 Cross-validating indexes 279 to 417
21:44:27 Xval MSE 0.600000
21:44:27 Training indexes 0 to 417
21:44:27 Cross-validating indexes 418 to 556
21:47:54 Xval MSE 0.599760
21:47:54 Training indexes 0 to 556
21:47:54 Cross-validating indexes 557 to 695
21:52:25 Xval MSE 0.599520
21:52:25 Avg Xval loss 0.599760
--------------------------------------------------------------------------------
21:52:25 Saving FFNN_0.599760_2_2_0.100000_0.250.h5
21:57:10 Running experiment 13 of 27
21:57:10 Generate splits [140, 279, 418, 557, 696]
21:57:10 Starting training
21:57:15 Training indexes 0 to 278
21:57:15 Cross-validating indexes 279 to 417
22:02:46 Xva

01:04:46 Xval MSE 0.596163
01:04:46 Training indexes 0 to 556
01:04:46 Cross-validating indexes 557 to 695
01:10:01 Xval MSE 0.603837
01:10:01 Avg Xval loss 0.600080
--------------------------------------------------------------------------------
01:10:01 Saving FFNN_0.600080_3_4_0.000000_0.250.h5
01:16:17 Running experiment 23 of 27
01:16:17 Generate splits [140, 279, 418, 557, 696]
01:16:17 Starting training
01:16:22 Training indexes 0 to 278
01:16:22 Cross-validating indexes 279 to 417
01:22:50 Xval MSE 0.598801
01:22:50 Training indexes 0 to 417
01:22:50 Cross-validating indexes 418 to 556
01:27:00 Xval MSE 0.596163
01:27:00 Training indexes 0 to 556
01:27:00 Cross-validating indexes 557 to 695
01:32:24 Xval MSE 0.602398
01:32:24 Avg Xval loss 0.599121
--------------------------------------------------------------------------------
01:32:24 Saving FFNN_0.599121_3_4_0.001000_0.250.h5
01:38:46 Running experiment 24 of 27
01:38:46 Generate splits [140, 279, 418, 557, 696]
01:38:46 Sta

In [62]:
# list and chart experiments
flatlist = [list(l[0]) + [l[1]] for l in experiments.items()]

lossframe = pd.DataFrame(flatlist, columns=["n_hidden_layers", "layer_size", "reg_penalty", "dropout",
                                            "accuracy"])
lossframe.sort_values(['accuracy'], ascending=False)


Unnamed: 0,n_hidden_layers,layer_size,reg_penalty,dropout,accuracy
18,3,2,0.0,0.25,0.60032
21,3,4,0.0,0.25,0.60008
25,3,8,0.001,0.25,0.60008
14,2,4,0.1,0.25,0.6
20,3,2,0.1,0.25,0.6
19,3,2,0.001,0.25,0.6
26,3,8,0.1,0.25,0.6
23,3,4,0.1,0.25,0.6
17,2,8,0.1,0.25,0.6
11,2,2,0.1,0.25,0.59976


In [64]:
# we can pick lowest loss , but first we look at patterns by hyperparameter
pd.DataFrame(lossframe.groupby(['n_hidden_layers'])['accuracy'].mean())


Unnamed: 0_level_0,accuracy
n_hidden_layers,Unnamed: 1_level_1
1,0.584892
2,0.596438
3,0.599316


In [65]:
pd.DataFrame(lossframe.groupby(['layer_size'])['accuracy'].mean())

Unnamed: 0_level_0,accuracy
layer_size,Unnamed: 1_level_1
2,0.597131
4,0.593108
8,0.590408


In [66]:
pd.DataFrame(lossframe.groupby(['reg_penalty'])['accuracy'].mean())

Unnamed: 0_level_0,accuracy
reg_penalty,Unnamed: 1_level_1
0.0,0.591633
0.001,0.591287
0.1,0.597726


In [68]:
def plot_matrix(lossframe, x_labels, y_labels, x_suffix="", y_suffix=""):

    pivot = lossframe.pivot_table(index=[y_labels], columns=[x_labels], values=['accuracy'])
#    print(pivot)
    # specify labels as strings, to force plotly to use a discrete axis
#    print(pivot.columns.levels[1]).values
#    print(lossframe[x_labels].dtype)
    
    if lossframe[x_labels].dtype == np.float64 or lossframe[x_labels].dtype == np.float32:
        xaxis = ["%f %s" % (i, x_suffix) for i in pivot.columns.levels[1].values]
    else:
        xaxis = ["%d %s" % (i, x_suffix) for i in pivot.columns.levels[1].values]
    if lossframe[y_labels].dtype == np.float64 or lossframe[y_labels].dtype == np.float32:
        yaxis = ["%f %s" % (i, y_suffix) for i in pivot.index.values]
    else:
        yaxis = ["%d %s" % (i, y_suffix) for i in pivot.index.values]
        
#    print(xaxis, yaxis)
    """plot a heat map of a matrix"""
    chart_width=640
    chart_height=480
    
    layout = Layout(
        title="%s v. %s" % (x_labels, y_labels),
        height=chart_height,
        width=chart_width,     
        margin=dict(
            l=150,
            r=30,
            b=120,
            t=100,
        ),
        xaxis=dict(
            title=x_labels,
            tickfont=dict(
                family='Arial, sans-serif',
                size=10,
                color='black'
            ),
        ),
        yaxis=dict(
            title=y_labels,
            tickfont=dict(
                family='Arial, sans-serif',
                size=10,
                color='black'
            ),
        ),
    )
    
    data = [Heatmap(z=pivot.values,
                    x=xaxis,
                    y=yaxis,
                    colorscale=[[0, 'rgb(0,0,255)', [1, 'rgb(255,0,0)']]],
                   )
           ]

    fig = Figure(data=data, layout=layout)
    return iplot(fig, link_text="")




In [73]:
plot_matrix(lossframe, "n_hidden_layers", "layer_size", x_suffix=" layers", y_suffix=" units")


In [70]:
plot_matrix(lossframe, "n_hidden_layers", "reg_penalty", x_suffix="p", y_suffix=" layers")

In [72]:
plot_matrix(lossframe, "reg_penalty", "layer_size", x_suffix=" p", y_suffix=" units")


In [77]:
EPOCHS=500

nrows = X.shape[0]
START=121

def run_backtest_keras(X, Y_class, arg_dict, startindex=0, epochs=EPOCHS, step=1, minmaxscale=False, standardscale=False):
    """create keras model; add step, to iteratively train, predict 12 months, train up to next 12 months """
    global P_L, P_S
    P_L = np.zeros((Y_class.shape[0],OUTPUT_DIM))
    P_S = np.zeros((Y_class.shape[0],OUTPUT_DIM))
    
    print("%s Starting backtest" % (time.strftime("%H:%M:%S")))
    
    count = 0
    nrows = X.shape[0]

    Xscale = X.copy()
    
    if minmaxscale:
        # minmaxscale each row (min->0, max->1) - transpose, scale, transpose back because scales by columns
        Xscale = MinMaxScaler().fit_transform(Xscale.transpose()).transpose()
        print("using MinMaxScaler")
    elif standardscale:
        # standardize each row (mean->0, SD->1)- transpose, scale, transpose back because scales by columns
        Xscale = StandardScaler().fit_transform(Xscale.transpose()).transpose()
        print("using StandardScaler")
     
    model = create_keras_model(n_hidden_layers=arg_dict["n_hidden_layers"],
                               layer_size=arg_dict["hidden_layer_size"],
                               reg_penalty=arg_dict["reg_penalty"],
                               dropout=dropout,
                               verbose=False)()
        
    for train_index in range(startindex, nrows, step):
        if train_index + step >= nrows:
            train_index = nrows-step
            
        fp_index = train_index + step # eg 1000 + 26 = 1026

        # fit on e.g. 0:999, predict 1000-1025

        longprobs, shortprobs, flatprobs = fit_predict_keras(Xscale[:fp_index], 
                                                             Y_class[:fp_index], 
                                                             model,
                                                             epochs=epochs,
                                                             npredict=step)
        # store in 1000:1025 - lining up with future Xs
        for i in range(step):
            P_L[train_index + i] = longprobs[i]
            P_S[train_index + i] = shortprobs[i]
            sys.stdout.write('.')
            count += 1
            if count % 80 == 0:
                print("")
                print("%s Still training %d of %d" % (time.strftime("%H:%M:%S"), count, nrows-startindex))
            sys.stdout.flush()


In [84]:
def gen_returns(Y, P_L, P_S, first_pred_month, start_date='01/01/1970', freq='M', verbose=False):
    #TODO: more general version
    #take an indicator (P), a function to generate portfolio based on indicator, universe returns (Y), return portfolio returns

    global R
    R = np.zeros(P_L.shape[0])
    nrows, ncols = P_L.shape
    numstocks = 6 # top quintile (and bottom)

    indcount = [0 for response in responses]
    longcount = [0 for response in responses]
    shortcount = [0 for response in responses]
        
    for month_index in range(first_pred_month, nrows):
        # get indexes of sorted smallest to largest
        # rightmost 6 most probable P_Ls for longs, P_Ss for shorts
        # ignore nan
        short_sort_array = [-999999 if np.isnan(x) else x for x in P_S[month_index]]
        select_array = np.argsort(short_sort_array)
        short_indexes = select_array[-numstocks:]
        # rightmost 6 longs
        long_sort_array = [-999999 if np.isnan(x) else x for x in P_L[month_index]]
        select_array = np.argsort(long_sort_array)
        long_indexes = select_array[-numstocks:]
        # compute equal weighted long/short return
        return_month = month_index + 1
        if verbose:
            print("Longs for month %d: %s" %(return_month, str([(l,P_L[month_index, l]) for l in long_indexes])))
            print("Shorts for month %d: %s" %(return_month, str([(l,P_S[month_index, l]) for l in short_indexes])))
            
        if return_month < nrows: # last row has a prediction for following month but no following month
            R[return_month] = np.mean(X[return_month, long_indexes])/2 - np.mean(X[return_month, short_indexes])/2
            # count occurrences of each industry
            for i in short_indexes:
                indcount[i]+=1
                shortcount[i]+=1
            for i in long_indexes:
                indcount[i]+=1
                longcount[i]+=1

    for response in responses:
        i = response_reverse_dict[response]
        print("%s: long %d times, short %d times, total %d times" % (response, longcount[i], shortcount[i], indcount[i]))
        
    results = R[first_pred_month:]

    index = pd.date_range(start_date,periods=results.shape[0], freq=freq)
    perfdata = pd.DataFrame(results,index=index,columns=['Returns'])
    perfdata['Equity'] = 100 * np.cumprod(1 + results / 100)

    stats = perfdata['Equity'].calc_stats()

    retframe = pd.DataFrame([stats.stats.loc['start'],
                             stats.stats.loc['end'],
                             stats.stats.loc['cagr'],
                             stats.stats.loc['yearly_vol'],
                             stats.stats.loc['yearly_sharpe'],
                             stats.stats.loc['max_drawdown'],
                             ffn.core.calc_sortino_ratio(perfdata.Returns, rf=0, nperiods=564, annualize=False),
                            ],
                            index = ['start',
                                     'end',
                                     'cagr',
                                     'yearly_vol',
                                     'yearly_sharpe',
                                     'max_drawdown',
                                     'sortino',
                                    ],
                            columns=['Value'])   
    return retframe


In [80]:
START=121
EPOCHS=500
STEP=5
arg_dict = {"n_hidden_layers" : 3,
            "hidden_layer_size" : 2,
            "reg_penalty" : 0.1,
            "dropout": 0.25,
            'verbose' : False
           }
     
#model = build_model(**arg_dict)
run_backtest_keras(X, Y_class, arg_dict, startindex=START, step=STEP, epochs=EPOCHS)


03:18:10 Starting backtest
(121, 103)
(121, 30)
.....(126, 103)
(126, 30)
.....(131, 103)
(131, 30)
.....(136, 103)
(136, 30)
.....(141, 103)
(141, 30)
.....(146, 103)
(146, 30)
.....(151, 103)
(151, 30)
.....(156, 103)
(156, 30)
.....(161, 103)
(161, 30)
.....(166, 103)
(166, 30)
.....(171, 103)
(171, 30)
.....(176, 103)
(176, 30)
.....(181, 103)
(181, 30)
.....(186, 103)
(186, 30)
.....(191, 103)
(191, 30)
.....(196, 103)
(196, 30)
.....
04:36:19 Still training 80 of 575
(201, 103)
(201, 30)
.....(206, 103)
(206, 30)
.....(211, 103)
(211, 30)
.....(216, 103)
(216, 30)
.....(221, 103)
(221, 30)
.....(226, 103)
(226, 30)
.....(231, 103)
(231, 30)
.....(236, 103)
(236, 30)
.....(241, 103)
(241, 30)
.....(246, 103)
(246, 30)
.....(251, 103)
(251, 30)
.....(256, 103)
(256, 30)
.....(261, 103)
(261, 30)
.....(266, 103)
(266, 30)
.....(271, 103)
(271, 30)
.....(276, 103)
(276, 30)
.....
06:15:20 Still training 160 of 575
(281, 103)
(281, 30)
.....(286, 103)
(286, 30)
.....(291, 103)
(291, 3

.....(426, 103)
(426, 30)
.....(431, 103)
(431, 30)
.....(436, 103)
(436, 30)
.....
11:28:11 Still training 320 of 575
(441, 103)
(441, 30)
.....(446, 103)
(446, 30)
.....(451, 103)
(451, 30)
.....(456, 103)
(456, 30)
.....(461, 103)
(461, 30)
.....(466, 103)
(466, 30)
.....(471, 103)
(471, 30)
.....(476, 103)
(476, 30)
.....(481, 103)
(481, 30)
.....(486, 103)
(486, 30)
.....(491, 103)
(491, 30)
.....(496, 103)
(496, 30)
.....(501, 103)
(501, 30)
.....(506, 103)
(506, 30)
.....(511, 103)
(511, 30)
.....(516, 103)
(516, 30)
.....
13:42:23 Still training 400 of 575
(521, 103)
(521, 30)
.....(526, 103)
(526, 30)
.....(531, 103)
(531, 30)
.....(536, 103)
(536, 30)
.....(541, 103)
(541, 30)
.....(546, 103)
(546, 30)
.....(551, 103)
(551, 30)
.....(556, 103)
(556, 30)
.....(561, 103)
(561, 30)
.....(566, 103)
(566, 30)
.....(571, 103)
(571, 30)
.....(576, 103)
(576, 30)
.....(581, 103)
(581, 30)
.....(586, 103)
(586, 30)
.....(591, 103)
(591, 30)
.....(596, 103)
(596, 30)
.....
15:52:11 Sti

In [85]:
gen_returns(X, P_L, P_S, START, verbose=False)


Food.lead: long 0 times, short 0 times, total 0 times
Beer.lead: long 74 times, short 0 times, total 74 times
Smoke.lead: long 574 times, short 574 times, total 1148 times
Games.lead: long 574 times, short 325 times, total 899 times
Books.lead: long 0 times, short 0 times, total 0 times
Hshld.lead: long 10 times, short 0 times, total 10 times
Clths.lead: long 0 times, short 0 times, total 0 times
Hlth.lead: long 0 times, short 0 times, total 0 times
Chems.lead: long 0 times, short 0 times, total 0 times
Txtls.lead: long 0 times, short 0 times, total 0 times
Cnstr.lead: long 0 times, short 0 times, total 0 times
Steel.lead: long 0 times, short 449 times, total 449 times
FabPr.lead: long 0 times, short 0 times, total 0 times
ElcEq.lead: long 0 times, short 0 times, total 0 times
Autos.lead: long 0 times, short 100 times, total 100 times
Carry.lead: long 0 times, short 45 times, total 45 times
Mines.lead: long 514 times, short 384 times, total 898 times
Coal.lead: long 574 times, short 54

Unnamed: 0,Value
start,1970-01-31 00:00:00
end,2017-11-30 00:00:00
cagr,-0.00239437
yearly_vol,0.0288752
yearly_sharpe,-0.101873
max_drawdown,-0.258829
sortino,-0.0325975
