Replicate [Dynamic Return Dependencies Across Industries: A Machine Learning Approach](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3120110&download=yes) by David Rapach, Jack Strauss, Jun Tu and Guofu Zhou.

1) Use industry returns from [Ken French](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html)

2) Forecast (for example) this month's Chemical industry return using last month's returns from all 30 industries 

3) Use LASSO for predictor subset selection over the entire 1960-2016 period to determine that e.g. Beer is predicted by Food, Clothing, Coal

4) Use those predictors and simple linear regression to predict returns

5) Generate portfolios and run backtests.

- Predictor selection - finds same predictors except 2 industries. Possibly use of AICc instead of AIC (don't see an sklearn implementation that uses AICc)

- Prediction by industry - R-squareds line up pretty closely

- Portfolio performance, similar ballpark results. Since prediction is similar but return profile is different, must be some difference in portfolio construction. (am taking equal weight top 6 predicted as long and bottom 6 as short, every month)

- For some reason their mean returns don't line up to geometric mean annualized, they seem to be calculating something different.

- But it does replicate closely and perform pretty well

In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import time 
import copy
import random

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #Hide messy TensorFlow warnings
warnings.filterwarnings("ignore") #Hide messy numpy warnings

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import explained_variance_score, r2_score
from sklearn.linear_model import LinearRegression, Lasso, lasso_path, lars_path, LassoLarsIC
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

import ffn
%matplotlib inline

import plotly as py
# print (py.__version__) # requires version >= 1.9.0
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff

init_notebook_mode(connected=True)

random.seed(1764)
np.random.seed(1764)


In [2]:
print("Loading data...")
data = pd.read_csv("30_Industry_Portfolios.csv")
data = data.set_index('yyyymm')
industries = list(data.columns)
# map industry names to col nums
ind_reverse_dict = dict([(industries[i], i) for i in range(len(industries))])

rfdata = pd.read_csv("F-F_Research_Data_Factors.csv")
rfdata = rfdata.set_index('yyyymm')
data['rf'] = rfdata['RF']

# subtract risk-free rate
# create a response variable led by 1 period to predict
for ind in industries:
    data[ind] = data[ind] - data['rf']

#for ind in industries:
#    data[ind+".3m"] = pd.rolling_mean(data[ind],3)
    
#for ind in industries:
#    data[ind+".6m"] = pd.rolling_mean(data[ind],6)

#for ind in industries:
#    data[ind+".12m"] = pd.rolling_mean(data[ind],12)
    
for ind in industries:
    data[ind+".lead"] = data[ind].shift(-1)

data = data.loc[data.index[data.index > 195911]]
data = data.drop(columns=['rf'])    
data = data.dropna(axis=0, how='any')

nresponses = len(industries)
npredictors = data.shape[1]-nresponses

predictors = list(data.columns[:npredictors])
predictor_reverse_dict = dict([(predictors[i], i) for i in range(len(predictors))])

responses = list(data.columns[-nresponses:])
response_reverse_dict = dict([(responses[i], i) for i in range(len(responses))])

print(data.shape)

data[['Food', 'Food.lead']]


Loading data...
(697, 60)


Unnamed: 0_level_0,Food,Food.lead
yyyymm,Unnamed: 1_level_1,Unnamed: 2_level_1
195912,2.01,-4.49
196001,-4.49,3.35
196002,3.35,-1.67
196003,-1.67,1.17
196004,1.17,8.20
196005,8.20,5.39
196006,5.39,-2.11
196007,-2.11,4.57
196008,4.57,-3.88
196009,-3.88,1.02


In [3]:
data = data.loc[data.index[data.index < 201701]]
data = data.loc[data.index[data.index > 195911]]
data


Unnamed: 0_level_0,Food,Beer,Smoke,Games,Books,Hshld,Clths,Hlth,Chems,Txtls,...,Telcm.lead,Servs.lead,BusEq.lead,Paper.lead,Trans.lead,Whlsl.lead,Rtail.lead,Meals.lead,Fin.lead,Other.lead
yyyymm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195912,2.01,0.35,-3.02,1.64,7.29,0.67,1.87,-1.97,3.08,0.74,...,0.62,-6.18,-7.93,-9.41,-4.31,-5.33,-6.09,-10.08,-4.68,-3.98
196001,-4.49,-5.71,-2.05,1.21,-5.47,-7.84,-8.53,-6.68,-10.03,-4.77,...,8.07,9.13,5.09,3.00,-0.94,1.42,4.00,1.81,-0.98,6.32
196002,3.35,-2.14,2.27,4.23,2.39,9.31,1.44,-0.02,-0.74,0.32,...,-0.21,-0.31,3.34,-2.43,-4.99,-1.37,-0.13,-3.88,0.05,-2.43
196003,-1.67,-2.94,-0.18,-0.65,2.18,-0.56,-2.59,1.26,-2.75,-6.79,...,-1.24,7.14,1.77,0.41,-2.13,0.45,-0.53,8.86,-0.64,0.55
196004,1.17,-2.16,1.35,6.46,-1.17,-1.27,0.21,1.49,-5.53,-1.10,...,3.05,-1.75,11.90,2.85,0.90,1.65,3.11,0.80,-0.45,1.02
196005,8.20,-0.52,2.44,7.28,11.67,7.74,1.74,13.50,3.40,2.10,...,-0.58,-8.07,2.39,3.50,2.17,5.96,3.41,1.03,3.72,6.41
196006,5.39,0.47,4.73,2.24,0.02,6.38,-1.59,-0.40,0.45,4.04,...,-0.03,2.84,-2.02,-4.10,-3.11,-6.16,-2.99,-1.25,0.09,-5.95
196007,-2.11,-0.79,4.60,-4.72,0.23,-0.60,-1.10,-3.99,-6.80,-3.14,...,6.94,5.69,2.71,1.18,1.98,4.51,2.85,2.05,3.47,3.48
196008,4.57,3.24,5.20,7.16,3.63,5.09,3.34,2.29,1.17,-0.84,...,-6.07,-3.53,-7.61,-7.37,-7.07,-8.44,-8.57,-1.90,-5.78,-4.21
196009,-3.88,-5.00,-2.09,-2.33,-6.20,-9.18,-4.23,-8.87,-6.70,-5.25,...,-0.08,4.62,-3.40,-1.85,-1.02,-4.22,0.31,-4.54,-0.40,0.38


In [4]:
desc = data.describe()
desc
# min, max line up with Table 1

Unnamed: 0,Food,Beer,Smoke,Games,Books,Hshld,Clths,Hlth,Chems,Txtls,...,Telcm.lead,Servs.lead,BusEq.lead,Paper.lead,Trans.lead,Whlsl.lead,Rtail.lead,Meals.lead,Fin.lead,Other.lead
count,685.0,685.0,685.0,685.0,685.0,685.0,685.0,685.0,685.0,685.0,...,685.0,685.0,685.0,685.0,685.0,685.0,685.0,685.0,685.0,685.0
mean,0.690715,0.710613,0.982321,0.701708,0.528277,0.55419,0.66946,0.650905,0.519781,0.667416,...,0.520847,0.694234,0.584175,0.511241,0.582088,0.625562,0.662219,0.70273,0.60981,0.38562
std,4.339811,5.090215,6.061582,7.180918,5.809314,4.759874,6.386027,4.928072,5.518477,7.022552,...,4.62852,6.527984,6.738979,5.055314,5.739306,5.605317,5.349341,6.104515,5.411766,5.815446
min,-18.15,-20.19,-25.32,-33.4,-26.56,-22.24,-31.5,-21.06,-28.6,-33.11,...,-16.44,-28.67,-32.07,-27.74,-28.5,-29.25,-29.74,-31.89,-22.53,-28.09
25%,-1.64,-2.1,-2.78,-3.49,-2.69,-2.11,-2.81,-2.24,-2.8,-3.2,...,-2.11,-3.09,-3.29,-2.43,-2.78,-2.57,-2.43,-2.94,-2.42,-2.99
50%,0.74,0.71,1.28,0.89,0.51,0.75,0.69,0.75,0.67,0.63,...,0.61,0.97,0.56,0.69,0.86,0.94,0.47,1.03,0.82,0.47
75%,3.12,3.66,4.64,5.31,3.72,3.55,4.31,3.56,3.76,4.49,...,3.36,4.29,4.59,3.46,4.06,3.88,4.0,4.33,4.0,4.2
max,19.89,25.51,32.38,34.52,33.13,18.22,31.79,29.01,21.68,59.03,...,21.22,23.38,24.66,21.0,18.5,17.53,26.49,27.38,20.59,19.96


In [5]:
# annualized returns don't match Table 1, oddly
# geometric mean, annualized
pd.DataFrame((np.prod(data/100 + 1)**(12.0/len(data))-1)[:30], columns=['Mean Ann. Return'])

Unnamed: 0,Mean Ann. Return
Food,0.07402
Beer,0.072005
Smoke,0.100147
Games,0.054031
Books,0.043953
Hshld,0.054098
Clths,0.05717
Hlth,0.065463
Chems,0.044917
Txtls,0.051888


In [6]:
# try this way, arithmetic mean then annualize (not very correct)
#print(pd.DataFrame(((desc.loc['mean']/100+1)**12-1)[:30]))
#nope

# same
pd.DataFrame(((1 + np.mean(data, axis=0)/100)**12 -1)[:30], columns=['Mean Ann. Return'])

Unnamed: 0,Mean Ann. Return
Food,0.086108
Beer,0.088687
Smoke,0.12446
Games,0.087532
Books,0.065268
Hshld,0.068568
Clths,0.08336
Hlth,0.080966
Chems,0.064188
Txtls,0.083096


In [7]:
#annualized volatility 
pd.DataFrame((desc.loc['std']*np.sqrt(12))[:30].round(2))
# lines up with table 1

Unnamed: 0,std
Food,15.03
Beer,17.63
Smoke,21.0
Games,24.88
Books,20.12
Hshld,16.49
Clths,22.12
Hlth,17.07
Chems,19.12
Txtls,24.33


In [8]:
# Run LASSO, then OLS on selected variables

# skip last row to better match published r-squared
# looks like they forecast actuals 1960-2016 using 1959m12 to 2016m11
# not exact matches to Table 2 R-squared but almost within rounding error 
X = data.values[:-1,:npredictors]
Y = data.values[:-1,-nresponses:]
nrows = X.shape[0]
X.shape

(684, 30)

In [9]:
def subset_selection(X, Y, model_aic, verbose=False):
    
    global responses
    global response_reverse_dict
    global predictors
    global predictor_reverse_dict
    
    coef_dict = {}
    for response_index, response in enumerate(responses):
        y = Y[:,response_reverse_dict[response]]
        
        model_aic.fit(X, y)

        coef_dict[response] = [predstr for i, predstr in enumerate(predictors) if model_aic.coef_[i] !=0]
        #y_response = model_aic.responseict(X)
        # print ("In-sample LASSO R-squared: %.6f" % r2_score(y, y_response))
        if verbose:
            print("LASSO variables selected for %s: " % response)
            print(coef_dict[response])
        
        if not coef_dict[response]:
            if verbose:
                print("No coefs selected for " + response + ", using all")
                print("---")
            coef_dict[response] = predictors            
        # fit OLS vs. selected vars, better fit w/o LASSO penalties
        # in-sample R-squared using LASSO coeffs
        if verbose:
            print("Running OLS for " + response + " against " + str(coef_dict[response]))
            # col nums of selected responses
            predcols = [predictor_reverse_dict[predstr] for predstr in coef_dict[response]]
            model_ols = LinearRegression()
            model_ols.fit(X[:, predcols], y)
            y_pred = model_ols.predict(X[:, predcols])
            print ("In-sample OLS R-squared: %.2f" % (100 * r2_score(y, y_pred)))
            print("---")
            
    return coef_dict

coef_dict = subset_selection(X, Y, LassoLarsIC(criterion='aic'), verbose=True)

# These subsets line up closely with Table 2
# except Clths, Whlsl, we get different responses

LASSO variables selected for Food.lead: 
['Clths', 'Coal', 'Util', 'Rtail']
Running OLS for Food.lead against ['Clths', 'Coal', 'Util', 'Rtail']
In-sample OLS R-squared: 2.24
---
LASSO variables selected for Beer.lead: 
['Food', 'Clths', 'Coal']
Running OLS for Beer.lead against ['Food', 'Clths', 'Coal']
In-sample OLS R-squared: 2.52
---
LASSO variables selected for Smoke.lead: 
['Txtls', 'Carry', 'Mines', 'Coal', 'Oil', 'Util', 'Telcm', 'Servs', 'Paper', 'Trans', 'Fin']
Running OLS for Smoke.lead against ['Txtls', 'Carry', 'Mines', 'Coal', 'Oil', 'Util', 'Telcm', 'Servs', 'Paper', 'Trans', 'Fin']
In-sample OLS R-squared: 6.55
---
LASSO variables selected for Games.lead: 
['Books', 'Clths', 'Coal', 'Fin']
Running OLS for Games.lead against ['Books', 'Clths', 'Coal', 'Fin']
In-sample OLS R-squared: 5.05
---
LASSO variables selected for Books.lead: 
['Games', 'Books', 'Coal', 'Oil', 'Util', 'Servs', 'BusEq', 'Rtail', 'Fin']
Running OLS for Books.lead against ['Games', 'Books', 'Coal', 'O

In [10]:
# same predictors selected for all but 2 response vars
# use predictors from paper to match results
if False: # turn off/on
    coef_dict = {}
    coef_dict['Food.lead'] = ['Clths', 'Coal', 'Util', 'Rtail']
    coef_dict['Beer.lead'] = ['Food', 'Clths', 'Coal']
    coef_dict['Smoke.lead'] = ['Txtls', 'Carry', 'Mines', 'Coal', 'Oil', 'Util', 'Telcm', 'Servs', 'Paper', 'Trans', 'Fin']
    coef_dict['Games.lead'] = ['Books', 'Clths', 'Coal', 'Fin']
    coef_dict['Books.lead'] = ['Games', 'Books', 'Coal', 'Oil', 'Util', 'Servs', 'BusEq', 'Rtail', 'Fin']
    coef_dict['Hshld.lead'] = ['Clths', 'Coal', 'Rtail']
    coef_dict['Clths.lead'] = ['Books', 'Clths', 'Chems', 'Steel', 'ElcEq', 'Carry',  'Coal', 'Oil', 'Util','Telcm', 'Servs', 'BusEq', 'Rtail']
    # Running OLS for Clths against ['Clths', 'Coal', 'Oil', 'Servs', 'Rtail']
    coef_dict['Hlth.lead'] = ['Books', 'Mines', 'Coal', 'Util']
    coef_dict['Chems.lead'] = ['Clths']
    coef_dict['Txtls.lead'] = ['Clths', 'Autos', 'Coal', 'Oil', 'Rtail', 'Fin']
    coef_dict['Cnstr.lead'] = ['Clths', 'Coal', 'Oil', 'Util', 'Trans', 'Rtail', 'Fin']
    coef_dict['Steel.lead'] = ['Fin']
    coef_dict['FabPr.lead'] = ['Trans', 'Fin']
    coef_dict['ElcEq.lead'] = ['Fin']
    coef_dict['Autos.lead'] = ['Hshld', 'Clths', 'Coal', 'Oil', 'Util', 'BusEq', 'Rtail', 'Fin']
    coef_dict['Carry.lead'] = ['Trans']
    coef_dict['Mines.lead'] = []
    coef_dict['Coal.lead'] = ['Beer', 'Smoke', 'Books', 'Autos', 'Coal', 'Oil', 'Paper', 'Rtail']
    coef_dict['Oil.lead'] = ['Beer', 'Hlth', 'Carry']
    coef_dict['Util.lead'] = ['Food', 'Beer', 'Smoke', 'Hshld', 'Hlth', 'Cnstr', 'FabPr', 'Carry', 'Mines', 'Oil', 'Util', 'Telcm', 'BusEq', 'Whlsl', 'Fin', 'Other']
    coef_dict['Telcm.lead'] = ['Beer', 'Smoke', 'Books', 'Hshld', 'Cnstr', 'Autos', 'Carry', 'Mines', 'Coal', 'Oil', 'Util', 'Servs', 'BusEq', 'Rtail', 'Meals', 'Fin']
    coef_dict['Servs.lead'] = ['Smoke', 'Books', 'Steel', 'Oil', 'Util', 'Fin']
    coef_dict['BusEq.lead'] = ['Smoke', 'Books', 'Util']
    coef_dict['Paper.lead'] = ['Clths', 'Coal', 'Oil', 'Rtail', 'Fin']
    coef_dict['Trans.lead'] = ['Fin']
    coef_dict['Whlsl.lead'] = ['Food', 'Beer', 'Smoke', 'Books', 'Hlth', 'Carry', 'Coal', 'Oil', 'Util', 'Telcm', 'Servs', 'BusEq', 'Fin', 'Other']
    # Running OLS for Whlsl against ['Food', 'Smoke', 'Books', 'Carry', 'Coal', 'Oil', 'Util', 'Servs', 'Fin', 'Other']
    coef_dict['Rtail.lead'] = ['Rtail']
    coef_dict['Meals.lead'] = ['Smoke', 'Books', 'Clths', 'Steel', 'Carry', 'Coal', 'Oil', 'Util', 'Servs', 'BusEq', 'Meals', 'Fin']
    coef_dict['Fin.lead'] = ['Fin']
    coef_dict['Other.lead'] = ['Clths', 'Fin']


In [11]:
def predict_with_subsets(X, Y, model, coef_dict, verbose=False):

    global responses
    global response_reverse_dict
    
    scores = []
    for response in responses:
        y = Y[:,response_reverse_dict[response]]

#        print("LASSO variables selected for %s: " % pred)
#        print(coef_dict[pred])
        
        if not coef_dict[response]:
            if verbose:
                print("No coefs selected for " + response)
 #           print("---")
            continue
        # fit model vs. selected vars, better fit w/o LASSO penalties
        # in-sample R-squared using LASSO coeffs
        #print("Running model for " + pred + " against " + str(coef_dict[pred]))
        # col nums of selected predictors
        predcols = [predictor_reverse_dict[predstr] for predstr in coef_dict[response]]
        model.fit(X[:, predcols], y)
        y_pred = model.predict(X[:, predcols])
        score = r2_score(y, y_pred)
        scores.append(score)
        if verbose:
            print ("In-sample R-squared: %.4f for %s against %s" % (score, response, str(coef_dict[response])))
#        print("---")
    
    if verbose:
        print("Mean R-squared: %.4f" % np.mean(np.array(scores)))
    return np.mean(np.array(scores))
    

predict_with_subsets(X, Y, LinearRegression(), coef_dict, verbose=True)


In-sample R-squared: 0.0224 for Food.lead against ['Clths', 'Coal', 'Util', 'Rtail']
In-sample R-squared: 0.0252 for Beer.lead against ['Food', 'Clths', 'Coal']
In-sample R-squared: 0.0655 for Smoke.lead against ['Txtls', 'Carry', 'Mines', 'Coal', 'Oil', 'Util', 'Telcm', 'Servs', 'Paper', 'Trans', 'Fin']
In-sample R-squared: 0.0505 for Games.lead against ['Books', 'Clths', 'Coal', 'Fin']
In-sample R-squared: 0.0630 for Books.lead against ['Games', 'Books', 'Coal', 'Oil', 'Util', 'Servs', 'BusEq', 'Rtail', 'Fin']
In-sample R-squared: 0.0297 for Hshld.lead against ['Clths', 'Coal', 'Rtail']
In-sample R-squared: 0.0473 for Clths.lead against ['Clths', 'Coal', 'Oil', 'Servs', 'Rtail']
In-sample R-squared: 0.0268 for Hlth.lead against ['Books', 'Mines', 'Coal', 'Util']
In-sample R-squared: 0.0078 for Chems.lead against ['Clths']
In-sample R-squared: 0.0791 for Txtls.lead against ['Clths', 'Autos', 'Coal', 'Oil', 'Rtail', 'Fin']
In-sample R-squared: 0.0515 for Cnstr.lead against ['Clths', 'C

0.037966088190359314

In [12]:
def fit_predict(X, Y, model, coef_dict=None):
    """for backtest, fit Ys v. X using n-1 rows
    predict Ys on X using nth row
    return a prediction for month n+1 using X for final month"""
    
    global responses
    global response_reverse_dict
    
    # keep last row to predict against
    X_predict = X[-1]
    X_predict = X_predict.reshape(1,X.shape[1])
    # fit on remaining rows
    X_fit = X[:-1]
    Y_fit = Y[:-1]

    # if no coef_dict select predictors into coef_dict
    if coef_dict is None:
        coef_dict = subset_selection(X_fit, Y_fit, LassoLarsIC(criterion='aic'))

    predictions = []
    for response in responses:
        if not coef_dict[response]:
            predictions.append(0.0)
            continue
        # column predexes to fit against each other
        predcols = [predictor_reverse_dict[predstr] for predstr in coef_dict[response]]
        responsecol = response_reverse_dict[response]
        model.fit(X_fit[:, predcols], Y_fit[:,responsecol])
        y_pred = model.predict(X_predict[:,predcols])        
        predictions.append(y_pred[0])
        
    return predictions

#    return np.argsort(predictions)

X = data.values[:,:npredictors]
Y = data.values[:, -nresponses:]
model = LinearRegression()
predictions = fit_predict(X, Y, model, coef_dict)
predictions

[1.4836355019598275,
 1.7577138192988357,
 1.9110155185883189,
 1.969622328159436,
 1.3110672416243387,
 0.9205241255335848,
 0.7727895847947452,
 1.7530844982497151,
 0.4314495208090557,
 1.9458301150010573,
 1.8779222878527213,
 0.7782293098974638,
 0.8564700989823534,
 1.025335272257523,
 1.257304057586475,
 0.7699652389898424,
 1.801681177209615,
 -1.1485137785507393,
 0.5530742114183307,
 1.5968854600126257,
 1.174520090774591,
 1.3172987746289975,
 0.6933690994714192,
 0.9944042062269319,
 0.9702545325356124,
 1.1250829210465556,
 0.4528132245884195,
 1.6177108549713068,
 1.026827100687208,
 0.626422804780024]

In [13]:
# 197001 = 121
STARTMONTH = 121
print(X[STARTMONTH])
print(data.iloc[STARTMONTH][:30])

[ -3.34  -1.95  -7.59  -7.76 -12.05  -7.5   -5.69  -7.71  -7.37  -5.26
  -9.84  -6.31  -7.15  -6.89  -9.35 -12.49  -2.34  -0.77 -12.16  -4.83
  -3.16 -11.17  -9.73  -8.89  -8.17  -8.28  -6.31 -13.12  -9.78  -6.2 ]
Food     -3.34
Beer     -1.95
Smoke    -7.59
Games    -7.76
Books   -12.05
Hshld    -7.50
Clths    -5.69
Hlth     -7.71
Chems    -7.37
Txtls    -5.26
Cnstr    -9.84
Steel    -6.31
FabPr    -7.15
ElcEq    -6.89
Autos    -9.35
Carry   -12.49
Mines    -2.34
Coal     -0.77
Oil     -12.16
Util     -4.83
Telcm    -3.16
Servs   -11.17
BusEq    -9.73
Paper    -8.89
Trans    -8.17
Whlsl    -8.28
Rtail    -6.31
Meals   -13.12
Fin      -9.78
Other    -6.20
Name: 197001, dtype: float64


In [14]:
# predict all months starting STARTMONTH
# initialize predictions matrix P

def run_backtest(X, Y, model, coef_dict=None, startmonth=0):
    global P
    global R 

    P = np.zeros_like(Y)
    count = 0
    for month_index in range(startmonth, X.shape[0]+1):
        predictions = fit_predict(X[:month_index, :], 
                                  Y[:month_index], 
                                  model,
                                  coef_dict)
        try:
            P[month_index]= predictions
            sys.stdout.write('.')
            count += 1
            if count % 80 == 0:
                print("")
            sys.stdout.flush()
        except IndexError:
            # I want to run the fit and see the R-squared on full dataset
            # but we are storing the predictions in row of the month predicted
            # so we have no row to store the last prediction (2017-01)
            print("\nlast prediction not stored")
                
    R = np.zeros(P.shape[0])
    numstocks = 6 # top quintile (and bottom)

    for month_index in range(startmonth, X.shape[0]):
        # get indexes of sorted smallest to largest
        select_array = np.argsort(P[month_index])
        # leftmost 6
        short_indexes = select_array[:numstocks]
        # rightmost 6
        long_indexes = select_array[-numstocks:]
        # compute equal weighted long/short return
        R[month_index] = np.mean(X[month_index, long_indexes])/2 - np.mean(X[month_index, short_indexes])/2

    results = R[startmonth:]

    index = pd.date_range('01/01/1970',periods=results.shape[0], freq='M')
    perfdata = pd.DataFrame(results,index=index,columns=['Returns'])
    perfdata['Equity'] = 100 * np.cumprod(1 + results / 100)

    stats = perfdata['Equity'].calc_stats()

    retframe = pd.DataFrame([stats.stats.loc['start'],
                             stats.stats.loc['end'],
                             stats.stats.loc['cagr'],
                             stats.stats.loc['yearly_vol'],
                             stats.stats.loc['yearly_sharpe'],
                             stats.stats.loc['max_drawdown'],
                             ffn.core.calc_sortino_ratio(perfdata.Returns, rf=0, nperiods=564, annualize=False),
                            ],
                            index = ['start',
                                     'end',
                                     'cagr',
                                     'yearly_vol',
                                     'yearly_sharpe',
                                     'max_drawdown',
                                     'sortino',
                                    ],
                            columns=['Value'])   
    return retframe


In [15]:
model = LinearRegression()
run_backtest(X, Y, model, coef_dict, startmonth=STARTMONTH)

................................................................................
................................................................................
................................................................................
................................................................................
................................................................................
................................................................................
................................................................................
....
last prediction not stored


Unnamed: 0,Value
start,1970-01-31 00:00:00
end,2016-12-31 00:00:00
cagr,0.0649128
yearly_vol,0.0843657
yearly_sharpe,0.794651
max_drawdown,-0.09031
sortino,0.597059


In [16]:
# double check results
#model = LinearRegression()
#R = run_backtest(X, Y, model, coef_dict_paper, startmonth=STARTMONTH, summary=False)
results = R[STARTMONTH:]
print(len(results))
#print(results)
print(np.mean(results))
print(np.std(results) * np.sqrt(12))
print(np.prod(1 + results / 100))
print(np.prod(1 + results / 100) ** (12.0/results.shape[0]))-1

564
0.541044621749409
5.886561448310519
19.362424493035657
0.06507973626047181


In [17]:
# calc MSE across all preds
np.mean((P[121:]-Y[121:])**2)

42.3833947060866

In [18]:
# run performance chart
perf = 100 * np.cumprod(1 + results / 100)

def mychart(perf):
    x_coords = np.linspace(1970, 2016, perf.shape[0])
    
    trace1 = Scatter(
        x = x_coords,
        y = perf,
        name = 'Growth of $1',    
    )

    layout = Layout(
        yaxis=dict(
            type='log',
            autorange=True
        )
    )
    plotdata = [trace1]
    
    fig = Figure(data=plotdata, layout=layout)
    
    iplot(fig)
    
mychart(perf)

In [19]:
# pass coef_dict as None
# fit_predict will do subset selection at each timestep using data it trains on
model = LinearRegression()
run_backtest(X, Y, model, coef_dict=None, startmonth=STARTMONTH)

................................................................................
................................................................................
................................................................................
................................................................................
................................................................................
................................................................................
................................................................................
....
last prediction not stored


Unnamed: 0,Value
start,1970-01-31 00:00:00
end,2016-12-31 00:00:00
cagr,0.0352209
yearly_vol,0.0479525
yearly_sharpe,0.751411
max_drawdown,-0.128334
sortino,0.326473


In [20]:
results = R[STARTMONTH:]
perf = 100 * np.cumprod(1 + results / 100)
mychart(perf)