# Data

Code used to arrange data into a format more convenient for the thesis. We calculate the required predictors from the data provided by Amit Goyal (https://sites.google.com/view/agoyal145/?redirpath=/), from the dataset used in his paper "A Comprehensive Look at the Empirical Performance of Equity Premium Prediction" (with Ivo Welch), July 2008. We use data updated to include monthly observations up to 2021.

### Imports and Initialization

In [1]:
# external imports
import pandas as pd
import numpy as np

In [2]:
# Read the Goyal dataset; ',' inside numbers is parsed as a thousands separator.
GoyalData_updated = pd.read_csv("GoyalData_2021.csv", thousands=',')
data = GoyalData_updated  # second name for the same DataFrame object (no copy is made)

# Sample boundaries, written in the same yyyymm integer form as data['yyyymm'].
TRAIN_START, TRAIN_END = 192612, 195612
TEST_START, TEST_END = 195701, 201612

# Index label of the first row matching each boundary period. The *_END values
# are one past the matching row so they can be used as exclusive slice stops.
# An IndexError is raised if a period is missing from the file.
# NOTE: these are index labels; they equal row positions only while `data`
# keeps the default 0..n-1 index produced by read_csv.
_yyyymm = data['yyyymm']
IND_TRAIN_START = data.index[(_yyyymm == TRAIN_START).to_numpy()][0]
IND_TRAIN_END = data.index[(_yyyymm == TRAIN_END).to_numpy()][0] + 1
IND_TEST_START = data.index[(_yyyymm == TEST_START).to_numpy()][0]
IND_TEST_END = data.index[(_yyyymm == TEST_END).to_numpy()][0] + 1

# Last expression of the cell: displays the loaded frame in the notebook.
data

Unnamed: 0,yyyymm,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx
0,187101,4.44,0.2600,0.4000,,,,,,,,,,,,,,
1,187102,4.50,0.2600,0.4000,,,,,,,0.004967,,,,,,,
2,187103,4.61,0.2600,0.4000,,,,,,,0.004525,,,,,,,
3,187104,4.74,0.2600,0.4000,,,,,,,0.004252,,,,,,,
4,187105,4.86,0.2600,0.4000,,,,,,,0.004643,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,202108,4522.68,58.7913,169.8333,0.184756,0.0005,0.0255,0.0324,0.0128,0.014846,0.000000,0.002066,-0.0035,-0.0045,0.000602,,0.030600,0.029205
1808,202109,4307.54,59.2545,175.3700,0.193036,0.0004,0.0253,0.0323,0.0137,0.015598,0.000000,0.002716,-0.0250,-0.0194,0.001393,,-0.046076,-0.047152
1809,202110,4605.38,59.6354,182.8600,0.182389,0.0005,0.0268,0.0335,0.0158,0.013368,0.000000,0.008308,0.0051,0.0159,0.001151,,0.070510,0.069627
1810,202111,4567.00,60.0162,190.3500,0.189455,0.0005,0.0262,0.0328,0.0156,0.015640,0.000100,0.004913,0.0210,0.0094,0.001327,,-0.007256,-0.008665


### Calculating Predictors

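Note: EQPREM is shifted by one period with `shift(-1)`, so each row pairs the predictors observed in month t with the return of month t+1. Strictly speaking this is a lead rather than a lag; it is done to simplify the computation of the forecast models.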

In [3]:
# Build the predictor table in a single step from the raw Goyal columns.
#
# Changes relative to the previous version of this cell (output is unchanged):
#   * only the columns that are needed are read from `data`, instead of
#     deep-copying the whole frame just to keep 'yyyymm';
#   * DE = log(D12 / E12) and LTY = lty are no longer computed, because the old
#     cell dropped them again immediately. Both are exact linear combinations
#     of retained columns (DE = DP - EP, LTY = TMS + TBL) -- presumably they
#     were excluded to avoid collinearity; confirm before re-adding them.
#
# Every operation below creates a new Series, so `data` itself is not modified.
data_stocks = pd.DataFrame({
    'yyyymm': data['yyyymm'],

    # Target: next period's value of CRSP_SPvw (shift(-1) moves each value up
    # one row, so row t holds the observation of row t+1; the last row is NaN).
    # NOTE(review): this is the raw CRSP_SPvw series -- Rfree is not subtracted.
    # Confirm that this is intended for a column named EQPREM.
    'EQPREM': data['CRSP_SPvw'].shift(-1),

    # Log valuation ratios. DY divides by the previous row's Index, DP by the
    # current row's Index.
    'DP': np.log(data['D12'] / data['Index']),
    'DY': np.log(data['D12'] / data['Index'].shift(1)),
    'EP': np.log(data['E12'] / data['Index']),

    # Columns carried over unchanged, only renamed.
    'SVAR': data['svar'],
    'BM': data['b/m'],
    'NTIS': data['ntis'],
    'TBL': data['tbl'],
    'LTR': data['ltr'],

    # Spreads built from pairs of raw columns.
    'TMS': data['lty'] - data['tbl'],
    'DFY': data['BAA'] - data['AAA'],
    'DFR': data['corpr'] - data['ltr'],

    'INFL': data['infl'],
})

# Export rows IND_TRAIN_START .. IND_TEST_END - 1 (the slice stop is exclusive)
# without the index column.
data_stocks.iloc[IND_TRAIN_START:IND_TEST_END, :].to_csv('data_stocks.csv', index=False)

# Last expression of the cell: displays the full predictor table in the notebook.
data_stocks

Unnamed: 0,yyyymm,EQPREM,DP,DY,EP,SVAR,BM,NTIS,TBL,LTR,TMS,DFY,DFR,INFL
0,187101,,-2.837728,,-2.406945,,,,,,,,,
1,187102,,-2.851151,-2.837728,-2.420368,,,,,,,,,
2,187103,,-2.875302,-2.851151,-2.444519,,,,,,,,,
3,187104,,-2.903111,-2.875302,-2.472328,,,,,,,,,
4,187105,,-2.928112,-2.903111,-2.497329,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,202108,-0.046076,-4.342866,-4.314288,-3.282043,0.000602,0.184756,0.014846,0.0005,-0.0035,0.0123,0.0069,-0.0010,0.002066
1808,202109,0.070510,-4.286281,-4.335018,-3.201224,0.001393,0.193036,0.015598,0.0004,-0.0250,0.0133,0.0070,0.0056,0.002716
1809,202110,-0.007256,-4.346731,-4.279873,-3.226260,0.001151,0.182389,0.013368,0.0005,0.0051,0.0153,0.0067,0.0108,0.008308
1810,202111,0.043485,-4.331997,-4.340366,-3.177747,0.001327,0.189455,0.015640,0.0005,0.0210,0.0151,0.0066,-0.0116,0.004913
