In [1]:
import os
import itertools
import xlsxwriter

import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt

import DataPreparation as dpr
import Models
import benchmarks as bench
from MyEstimators import CLS_Estimator

<Figure size 720x360 with 0 Axes>

# Load data

In [13]:
df = pd.read_excel('data/EQP_Quarterly.xlsx', index_col = 0)

In [14]:
df = df.dropna()
df['yyyyq'] = df['yyyyq'].astype('int32')

In [15]:
# Turn each yyyyq code (four-digit year followed by the quarter number) into
# the last calendar day of that quarter and use it as the row index.
quarter_end_dates = []
for code in df['yyyyq']:
    code_text = str(code)
    year_start = pd.to_datetime(code_text[:4])
    quarter_end_dates.append(year_start + pd.offsets.QuarterEnd(int(code_text[4:])))
df['time'] = quarter_end_dates
df = df.set_index('time').drop(['yyyyq'], axis = 1)

In [18]:
df = df.loc['1956-03-31':'2018-12-31']

## Add new cay variable and construct X and y

In [19]:
df['new_cay'] = df['c'] - 0.218*df['w'] - 0.801*df['y']
df.head()

Unnamed: 0_level_0,EQP,y_lag,y_2lag,DP,DY,EP,DE,svar,b/m,ntis,...,infl,c,w,y,cay,AAA,BAA,rr,rfree,new_cay
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1956-03-31,0.066512,0.047768,0.063101,-3.322576,-3.281965,-2.530799,-0.791778,0.005084,0.509828,0.025245,...,-0.003717,9.274968,11.074455,9.093439,0.019292,0.0315,0.0362,0.075561,0.00635,-0.423108
1956-06-30,-0.028264,0.066512,0.047768,-3.33303,-3.269151,-2.575525,-0.757505,0.003289,0.531077,0.026695,...,0.0,9.272498,11.092725,9.100386,0.007275,0.031,0.036,-0.0224,0.005625,-0.435125
1956-09-30,-0.034415,-0.028264,0.066512,-3.261722,-3.293365,-2.568575,-0.693147,0.003688,0.551565,0.025672,...,0.014925,9.271728,11.091665,9.107828,0.000775,0.0326,0.0376,-0.027815,0.006225,-0.441626
1956-12-31,0.033241,-0.034415,-0.028264,-3.204645,-3.239744,-2.573142,-0.631503,0.002519,0.57191,0.029362,...,0.007353,9.269304,11.086198,9.106428,0.000663,0.0356,0.0407,0.041139,0.0071,-0.441736
1957-03-31,-0.05075,0.033241,-0.034415,-3.289216,-3.260525,-2.616389,-0.672827,0.004394,0.544177,0.026149,...,0.007299,9.277993,11.096678,9.118405,-0.002524,0.0375,0.0437,-0.041856,0.008025,-0.444925


## Plots of Variables

In [None]:
# DP and DY on one set of axes; the figure is written to co1.png.
fig, ax = plt.subplots(figsize = (10,8))
for column in ('DP', 'DY'):
    ax.plot(df[column], label=column)

# ax.set_title('co1', fontsize=20)
ax.legend(fontsize=16)
fig.savefig('co1.png')

In [None]:
# tbl and lty on one set of axes; the figure is written to co2.png.
fig, ax = plt.subplots(figsize = (10,8))
for column in ('tbl', 'lty'):
    ax.plot(df[column], label = column)

# ax.set_title('co2', fontsize=20)
ax.legend(fontsize=16)
fig.savefig('co2.png')

In [None]:
# DP and EP on one set of axes (legend labels are lower-case); written to co3.png.
fig, ax = plt.subplots(figsize = (10,8))
for column, legend_label in (('DP', 'dp'), ('EP', 'ep')):
    ax.plot(df[column], label = legend_label)

# ax.set_title('co3', fontsize=20)
ax.legend(fontsize=16)
fig.savefig('co3.png')

In [None]:
# BAA and AAA on one set of axes; the figure is written to co4.png.
fig, ax = plt.subplots(figsize = (10,8))
for column in ('BAA', 'AAA'):
    ax.plot(df[column], label = column)

# ax.set_title('co4', fontsize=20)
ax.legend(fontsize=16)
fig.savefig('co4.png')

In [None]:
# The three inputs of new_cay (columns c, w, y) on one set of axes; written to cay.png.
fig, ax = plt.subplots(figsize = (10,8))
component_labels = (('c', 'log of consumption'),
                    ('w', 'log of wealth'),
                    ('y', 'log of income'))
for column, legend_label in component_labels:
    ax.plot(df[column], label = legend_label)

# ax.set_title('cay', fontsize=20)
ax.legend(fontsize=16)
fig.savefig('cay.png')

In [None]:
# The constructed new_cay series on its own axes; written to new_cay.png.
fig, ax = plt.subplots(figsize = (10,8))
ax.plot(df['new_cay'], label = '"cay" variable')

# ax.set_title('cay', fontsize=20)
ax.legend(fontsize=16)
fig.savefig('new_cay.png')

## Full sample linear regression

In [None]:
lr = LinearRegression()
lr.fit(df[['y_lag', 'new_cay']], df['EQP'])

In [None]:
lr.coef_

In [None]:
lr.intercept_

## Regression with dy-dp

In [None]:
dpdy = df['DY']-df['DP']

In [None]:
lr = LinearRegression()
lr.fit(dpdy.values.reshape(-1,1), df['EQP'].values.reshape(-1,1))

In [None]:
lr.coef_

In [None]:
lr.intercept_

# Construct single-index and nonlinear models

### dimension function

In [None]:
# Lookup table keyed by link-function name.
# NOTE(review): presumably the number of extra (gamma) parameters each link
# function takes -- nothing in the visible cells reads this dict; confirm
# whether `dimensions` uses it.
extra_params = {'sin_func':1,
               'cos_func':1,
               'scaled_sin_func':2,
               'scaled_cos_func':2,
               'exp_func':2,
               'exp_shift_func':2,
                'poly_func':3,
                'linear_func':2
               }

In [8]:
# Target series and the two regressors that enter the models linearly.
y = df[['EQP']].squeeze()
station_ar1 = df[['y_lag', 'new_cay']]

# The four predictor pairs that feed the single index.
co1 = df[['DP', 'DY']]
co2 = df[['tbl', 'lty']]
co3 = df[['DP', 'EP']]
co4 = df[['BAA', 'AAA']]

names_ar1 = ['co1', 'co2', 'co3', 'co4']
cointe_ar1 = [co1, co2, co3, co4]

# Tag every pair with its short label; later cells read it back as `.name`.
for pair_frame, pair_label in zip(cointe_ar1, names_ar1):
    pair_frame.name = pair_label

In [9]:
X_ = co1.join(station_ar1)

## Nonlinear Models

In [None]:
# def sin_func(x): 
#     station = x 
#     def objective_func(params):
#         d1 = param_num['theta']
#         d2 = param_num['beta']
        
#         theta = params[0:d1]
#         beta = params[d1:d1+d2]
#         gammas = range(0,param_num['gamma'])
        
#         u = single_index(x.iloc[:,:d1])(theta)
        
#         func = np.sin(u + params[d1 + d2 + gammas[0]]) + np.dot(x.iloc[:,d1:d1+d2], beta)
#         return func
#     return objective_func

In [10]:
# Parameter counts assigned onto the Models module as `param_num`.
# The objective functions in this notebook read 'theta' as the number of
# single-index columns, 'beta' as the number of linear regressors and
# 'gamma' as the number of extra link-function parameters.
Models.param_num = {'theta':2,
               'beta':2,
               'gamma':1
               }

In [12]:
cls = CLS_Estimator(obj_func = Models.sin_func, x0 = [0.001]*5, options={'maxiter':50000})
cls.fit(X_,y)
cls.params_

array([ 0.90222832, -0.87716602,  0.94101479,  0.48557383,  0.32175819])

In [None]:
def cos_func(x):
    """Return the objective ``params -> cos(u + gamma) + X_beta @ beta`` for data ``x``.

    ``x`` is indexed with ``.iloc``: its first ``d1`` columns feed the single
    index ``u = single_index(x[:, :d1])(theta)`` and the next ``d2`` columns
    enter linearly. ``params`` is laid out as ``(theta[d1], beta[d2], gamma)``.

    NOTE(review): ``param_num`` and ``single_index`` are not defined in the
    visible cells (only ``Models.param_num`` is assigned) -- presumably they
    come from ``Models``; confirm. The defaults are evaluated once, when this
    cell runs, so later changes to ``param_num`` are not picked up.
    """
    def objective_func(params, 
                       d1 = param_num['theta'], 
                       d2 = param_num['beta'], 
                       extra = range(0,param_num['gamma'])):
        # Nonlinear part: cosine of the shifted single index.
        index_value = single_index(x.iloc[:,:d1])(params[0:d1])
        shift = params[d1+d2+extra[0]]
        # Linear part: remaining regressors times beta.
        linear_part = np.dot(x.iloc[:,d1:d1+d2], params[d1:d1+d2])
        return np.cos(index_value+shift)+linear_part
    return objective_func

In [None]:
# def scaled_sin_func(x):
#     def objective_func(params, 
#                        d1 = param_num['theta'], 
#                        d2 = param_num['beta'], 
#                        extra = range(0,param_num['gamma'])):
#         func = np.sin(params[d1+d2+extra[1]]*single_index(x.iloc[:,:d1])(
#             params[0:d1])+params[d1+d2+extra[0]])+np.dot(x.iloc[:,d1:d1+d2], params[d1:d1+d2])
#         return func
#     return objective_func

In [None]:
# def scaled_cos_func(x):
#     def objective_func(params, 
#                        d1 = param_num['theta'], 
#                        d2 = param_num['beta'], 
#                        extra = range(0,param_num['gamma'])):
#         func = np.cos(params[d1+d2+extra[1]]*single_index(x.iloc[:,:d1])(
#             params[0:d1])+params[d1+d2+extra[0]])+np.dot(x.iloc[:,d1:d1+d2], params[d1:d1+d2])
#         return func
#     return objective_func

In [None]:
# def exp_shift_func(x):
#     def objective_func(params, 
#                        d1 = param_num['theta'], 
#                        d2 = param_num['beta'], 
#                        extra = range(0,param_num['gamma'])):
#         func = 1 - np.exp(params[d1+d2+extra[1]]*((single_index(x.iloc[:,:d1])(
#             params[0:d1]))-params[d1+d2+extra[0]])**2)+np.dot(x.iloc[:,d1:d1+d2], params[d1:d1+d2])
#         return func
#     return objective_func

In [None]:
# def exp_func(x):
#     def objective_func(params, 
#                        d1 = param_num['theta'], 
#                        d2 = param_num['beta'], 
#                        extra = range(0,param_num['gamma'])):
#         func = params[d1+d2+extra[0]]*np.exp(-params[d1+d2+extra[1]]*(single_index(x.iloc[:,:d1])(params[0:d1]))**2
#                                 )+np.dot(x.iloc[:,d1:d1+d2], params[d1:d1+d2])
#         return func
#     return objective_func

In [None]:
# def poly_func(x):
#     def objective_func(params, 
#                        d1 = param_num['theta'], 
#                        d2 = param_num['beta'], 
#                        extra = range(0,param_num['gamma'])):
#         func = params[d1+d2+extra[0]]+params[d1+d2+extra[1]]*(single_index(x.iloc[:,:d1])(
#             params[0:d1]))+params[d1+d2+extra[2]]*((single_index(x.iloc[:,:d1])(
#             params[0:d1]))**2)+np.dot(x.iloc[:,d1:d1+d2], params[d1:d1+d2])
# #                (single_index(x.iloc[:,:d1])(params[0:d1])
#         return func
#     return objective_func

In [None]:
# def linear_func(x):
#     def objective_func(params, 
#                        d1 = param_num['theta'], 
#                        d2 = param_num['beta'], 
#                        extra = range(0,param_num['gamma'])):
#         func = params[d1+d2+extra[0]]+params[d1+d2+extra[1]]*(single_index(x.iloc[:,:d1])(
#             params[0:d1]))+np.dot(x.iloc[:,d1:d1+d2], params[d1:d1+d2])
# #         func = params[d1+d2+extra[0]]+params[d1+d2+extra[1]]*(single_index(x.iloc[:,:d1])(
# #             params[0:d1]))
# #                (single_index(x.iloc[:,:d1])(params[0:d1])
#         return func
#     return objective_func

### Constraint

In [None]:
def constraint_func(x, n_theta=None):
    """Build the equality constraint ``sum(theta**2) - 1 == 0`` for the optimizer.

    Parameters
    ----------
    x : pandas.DataFrame
        Design matrix; only its column count is used, to cap the number of
        leading parameters treated as single-index coefficients.
    n_theta : int, optional
        Number of leading single-index coefficients. When omitted, the
        notebook-level ``d1`` is looked up each time the constraint is
        evaluated (the original behaviour).

    Returns
    -------
    dict
        ``{'type': 'eq', 'fun': constraint}``, the constraint format this
        notebook passes to ``CLS_Estimator(constraints=...)``.
    """
    def constraint(params):
        width = d1 if n_theta is None else n_theta
        n_index = x.iloc[:, :width].shape[1]
        theta = np.asarray(params[:n_index], dtype=float)
        # With no index coefficients the sum is 0, so this returns -1 instead
        # of failing on an unassigned variable as the loop version did.
        return float(np.dot(theta, theta)) - 1
    return {'type':'eq', 'fun': constraint}

## Empirical Study

### Cointegrated predictors
- dividend-price ratio and dividend yield
- T-bill rate and long-term yield
- dividend-price ratio and earnings-price ratio
- baa- and aaa-rated corporate bond yields

In [None]:
# Predictor pairs, target and linear regressors for the empirical study.
# This repeats, statement for statement, the setup cell that precedes the
# "Nonlinear Models" section.
co1 = df[['DP', 'DY']]
co2 = df[['tbl', 'lty']]
co3 = df[['DP', 'EP']]
co4 = df[['BAA', 'AAA']]
# Target as a Series rather than a one-column frame.
y = df[['EQP']].squeeze()

# Regressors that enter the models linearly.
station_ar1 = df[['y_lag', 'new_cay']]

cointe_ar1 = [co1, co2, co3, co4]
names_ar1 = ['co1', 'co2', 'co3', 'co4']

# Attach the short label to each frame; the fitting loops read it as `j.name`.
for i in range(len(cointe_ar1)):
    cointe_ar1[i].name = names_ar1[i]

In [None]:
# co1_ar2 = df_AR2[['DP', 'DY']]
# co2_ar2 = df_AR2[['tbl', 'lty']]
# co3_ar2 = df_AR2[['DP', 'EP']]
# co4_ar2 = df_AR2[['BAA', 'AAA']]
# y_lag2 = df_AR2[['EQP']].squeeze()

# station_ar2 = df_AR2[['y_lag', 'new_cay']]

# X_train_AR2 = df_AR2.loc[:"1988-01-01"]
# y_train_AR2 = y_lag2.loc[:"1988-01-01"]

# X_test_AR2 = df_AR2.loc["1988-01-01":"2018-12-01"]
# y_test_AR2 = y_lag2.loc["1988-01-01":"2018-12-01"]

# station_ar2_train = df_AR2.loc[:"1988-01-01"][['y_lag', 'new_cay']]

# cointe_ar2 = [co1_ar2, co2_ar2, co3_ar2, co4_ar2]
# names_ar2 = ['co1', 'co2', 'co3', 'co4']

# for i in range(len(cointe_ar1)):
#     cointe_ar2[i].name = names_ar2[i]

### Stationary variables

### Fit model and Save Results

In [None]:
# Link functions fitted in the loops below.
# NOTE(review): of these names only cos_func is defined in a live cell of this
# notebook; the other definitions are commented out and sin_func is used
# elsewhere as Models.sin_func -- confirm where the bare names come from.
fun_list = [sin_func,
            cos_func,
            scaled_sin_func,
            scaled_cos_func,
            exp_func,
            exp_shift_func,
            poly_func,
            linear_func
           ]

In [None]:
# Set up hierarchical index
fun_names = [i.__name__ for i in fun_list]
cointe_names = [i.name for i in cointe_ar1]
iterables = [fun_names, cointe_names]

In [None]:
#Set up directory
parent = os.getcwd()
folder = 'results'
path = os.path.join(parent, folder)
if not os.path.exists(path):
    os.makedirs(path)

In [None]:
# Fit every link function on every predictor pair twice -- without (NLS) and
# with (CLS) the unit-norm constraint -- from the flat start value 0.001, and
# collect one row of fitted parameters per (function, pair).
result_frames = []
for i, j in itertools.product(fun_list, cointe_ar1):
    ################################# Set up dimensions ######################################
    # d1 must stay a notebook-level name: constraint_func reads it when evaluated.
    d1, d2, extra= dimensions(j,station_ar1, i.__name__)
    initial_len = d1+d2+extra[-1]+1

    # Row / column labels for this combination
    sec_columns = ['param_'+str(n) for n in range(1,initial_len+1)]
    multi_index = pd.MultiIndex.from_product([[i.__name__], [j.name]], names=["function", "variables"])
    multi_columns = pd.MultiIndex.from_product([['NLS', 'CLS'], sec_columns],
                                               names=['Estimator', 'Parameters'])
    ###################################### Set up X ##########################################
    X_ = j.join(station_ar1)
    # Fit models
    nls = CLS_Estimator(obj_func = i, x0 = [0.001]*initial_len, options={'maxiter':50000})
    cls = CLS_Estimator(obj_func = i, x0 = [0.001]*initial_len, constraints = constraint_func(X_), options={'maxiter':50000})
    nls.params_ = nls.fit(X_,y).params_
    cls.params_ = cls.fit(X_,y).params_
    # Build the row directly (NLS block first, then CLS, matching multi_columns).
    # The previous chained `.loc[...].loc[...] = ...` assignment could write
    # into a temporary copy and leave the table empty.
    row_values = np.concatenate([np.ravel(nls.params_), np.ravel(cls.params_)])
    result_frames.append(pd.DataFrame([row_values], index = multi_index, columns = multi_columns))

# Concatenate once: DataFrame.append was removed in pandas 2.0 and re-copied
# the whole table on every iteration.
results = pd.concat(result_frames, sort = False) if result_frames else pd.DataFrame()

# Export to Excel
results.to_excel('results/full_sample_new_cay.xlsx')

## Use initial values from Linear regression (using Taylor expansion)

In [None]:
# Per link function: how many of the single-index power columns
# (u, u2, u3, u4, u6, in that order) Taylor_init adds to the stationary
# regressors in its start-value regression.
orders = {'sin_func':1,
          'cos_func':2,
          'scaled_sin_func':1,
          'scaled_cos_func':2,
          'exp_func':5,
          'exp_shift_func':2,
          'poly_func':2,
          'linear_func':1
               }

In [None]:
def Taylor_init(variables, station, y, function):
    """Build optimizer start values for ``function`` from two linear regressions.

    Steps, as implemented below:

    1. theta -- regress the first column of ``variables`` on its remaining
       columns, form ``alpha = (1, -coef)`` and set ``theta = -alpha/||alpha||``.
    2. Evaluate the single index ``u = single_index(variables)(theta)``.
    3. beta and gammas -- regress ``y`` on the ``station`` columns plus the
       first ``orders[function.__name__]`` of the columns u, u^2, u^3, u^4, u^6
       (for ``exp_func`` the columns u and u^3 are dropped), then map the
       fitted intercept and coefficients onto the extra parameters.

    Parameters
    ----------
    variables : pandas.DataFrame
        Columns that enter the single index.
    station : pandas.DataFrame
        Regressors that enter linearly; the column slicing below assumes it
        has exactly ``d2`` columns.
    y : array-like
        Regression target for step 3.
    function : callable
        Link function; dispatched on ``function.__name__`` so the lookup is
        consistent with ``orders`` and ``dimensions``.

    Returns
    -------
    numpy.ndarray
        Start vector laid out as ``(theta, beta, gammas)``.

    Raises
    ------
    ValueError
        If ``function.__name__`` is not one of the supported link functions
        (previously an empty list was returned).

    Relies on the notebook-level names ``dimensions``, ``single_index`` and
    ``orders``.
    """
    func_name = function.__name__
    d1, d2, extra = dimensions(variables, station, func_name)

    # find the initials for theta
    LR = LinearRegression()
    LR_theta = LR.fit(variables.iloc[:,1:], variables.iloc[:,:1])
    alpha = np.append(1, -LR_theta.coef_)
    theta = np.array(-alpha/np.linalg.norm(alpha))

    # calculate single-index
    u = single_index(variables)(theta)

    # find the initials for beta
    Xs = station.copy()
    Xs['u'], Xs['u2'], Xs['u3'], Xs['u4'], Xs['u6'] = u, u**2, u**3, u**4, u**6
    t_order = orders.get(func_name)

    X_reg = Xs.iloc[:, 0:d2+t_order]
    if func_name == 'exp_func':
        # keep only the even powers of u
        X_reg = X_reg.drop(['u', 'u3'], axis = 1)
    # LR is re-fitted here; theta was already extracted from the first fit.
    LR_taylor = LR.fit(X_reg, y)
    coef = LR_taylor.coef_
    intercept = LR_taylor.intercept_
    theta_beta = np.append(theta, coef[:d2])

    # initials for gammas
    if func_name == 'sin_func':
        return np.append(theta_beta, intercept)
    if func_name == 'scaled_sin_func':
        # NOTE(review): coef[0] is the first stationary coefficient whenever
        # d2 > 0, while scaled_cos_func below uses coef[d2] -- confirm intent.
        return np.append(theta_beta, [coef[0], intercept])
    if func_name == 'linear_func':
        # NOTE(review): same coef[0] question as for scaled_sin_func.
        return np.append(theta_beta, [intercept, coef[0]])
    if func_name == 'poly_func':
        ini_poly_ = np.append(theta_beta, coef[d2:])
        # The intercept goes right after (theta, beta). The position was
        # hard-coded as 4, which is only correct when d1 + d2 == 4.
        return np.insert(ini_poly_, len(theta_beta), intercept)
    if func_name == 'cos_func':
        return np.append(theta_beta, [-coef[-1]/2])
    if func_name == 'scaled_cos_func':
        scale = np.sqrt(np.abs(1-intercept))
        return np.append(theta_beta, [-coef[d2]/2*scale, scale])
    if func_name == 'exp_shift_func':
        root = np.sqrt(np.abs(coef[-1]))
        return np.append(theta_beta, [root, coef[-2]/root])
    if func_name == 'exp_func':
        return np.append(theta_beta, [intercept, -coef[-3]])

    raise ValueError('Taylor_init: unsupported link function ' + repr(func_name))

In [None]:
dimensions(co3, station_ar1, cos_func.__name__)

In [None]:
# Taylor_init(co3, station_ar1, y, scaled_cos_func)
Taylor_init(co2.loc[:"1988-01-01"], station_ar1.loc[:"1988-01-01"], y.loc[:"1988-01-01"], exp_func)

# Fit model and Save Results

In [None]:
# Same full-sample NLS / CLS fits as above, but started from the
# regression-based values returned by Taylor_init.
taylor_frames = []
for i, j in itertools.product(fun_list, cointe_ar1):
    # Set up dimensions (d1 must stay a notebook-level name for constraint_func)
    d1, d2, extra= dimensions(j,station_ar1, i.__name__)
    initial_len = d1+d2+extra[-1]+1
    # Row / column labels for this combination
    sec_columns = ['param_'+str(n) for n in range(1,initial_len+1)]
    multi_index = pd.MultiIndex.from_product([[i.__name__], [j.name]], names=["function", "variables"])
    multi_columns = pd.MultiIndex.from_product([['NLS', 'CLS'], sec_columns],
                                               names=['Estimator', 'Parameters'])
    # Prepare X
    X_ = j.join(station_ar1)
    # Start values are the same for both estimators: compute once, hand each its own copy.
    x0 = np.asarray(Taylor_init(j, station_ar1, y, i))
    # Fit models
    nls = CLS_Estimator(obj_func = i, x0 = x0.copy(), options={'maxiter':1000000})
    cls = CLS_Estimator(obj_func = i, x0 = x0.copy(), constraints = constraint_func(X_), 
                        options={'maxiter':1000000})
    nls.params_ = nls.fit(X_,y).params_
    cls.params_ = cls.fit(X_,y).params_
    print(i.__name__, j.name)
    # Build the row directly (NLS block first, then CLS, matching multi_columns);
    # the chained `.loc[...].loc[...] = ...` form could write into a temporary copy.
    row_values = np.concatenate([np.ravel(nls.params_), np.ravel(cls.params_)])
    taylor_frames.append(pd.DataFrame([row_values], index = multi_index, columns = multi_columns))

# Concatenate once: DataFrame.append was removed in pandas 2.0.
results_Taylor = pd.concat(taylor_frames, sort = False) if taylor_frames else pd.DataFrame()

# Export to Excel
results_Taylor.to_excel('results/full_taylor_new_cay.xlsx')
results_Taylor.tail()

In [None]:
results_Taylor.head()

### GridSearch and CrossValidation

### Train_test split

In [None]:
# val_length = 1
# Length of the out-of-sample window in years (multiplied by freq below).
test_length = 31
# Number of observations in each test fold (one-step-ahead evaluation).
step = 1
# Observations per year: quarterly data -> 4.
freq = 4
# cv_outer = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=int((12/step) * test_length), test_size=step)
# cv_inner = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=int((12/step) * val_length), test_size=step)
# Expanding-window splitter (no cap on the training size): one fold per
# out-of-sample quarter, test_length*freq folds in total.
cv_outer = TimeSeriesSplit(max_train_size=None, n_splits=test_length*freq, test_size=step, gap=0)
# cv_inner = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=4, test_size=step)
# cv_inner = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=4, test_size=step)

### set up dataframes for results

In [None]:
# Empty table for the out-of-sample squared errors: one row per
# (predictor pair, model) and one column per out-of-sample date.
#
# The dates are the observations cv_outer uses as test folds, i.e. the last
# n_splits*test_size rows of df. The earlier slice
# df.loc["1988-01-01":"2018-12-01"] stops before a final 2018-12-31
# quarter-end and so could not match the number of folds.
n_oos = cv_outer.n_splits * cv_outer.test_size
rows = df.index[-n_oos:]
sec_columns = ['CLS_MSE', 'SM_MSE', 'NLS_MSE', 'AR1_MSE', 'AR2_MSE', 'AR_cay_MSE']
multi_columns = pd.MultiIndex.from_product([['co1', 'co2', 'co3', 'co4'], sec_columns],names=['Variable', 'Model'])

oos_MSE = pd.DataFrame(index = multi_columns, columns = rows)

#### The block below is used to test the OOS results only

In [None]:
# func = linear_func

# Scratch cell: re-fits the constrained model on every training window of
# cv_outer and stores the fitted parameters, one row per split.
# NOTE(review): `func`, `co1_ar2`, `y_train_AR2` and `y_lag2` are only assigned
# in commented-out code in this notebook, and `station_n` is first assigned in
# a later cell -- as written this cell does not run on a fresh kernel.
x0 = Taylor_init(co1_ar2.loc[:"1988-01-01"], station_n.loc[:"1988-01-01"], y_train_AR2.loc[:"1988-01-01"], func)

X_ = co1_ar2.join(station_n) 
d1, d2, extra= dimensions(co1_ar2,station_n, func.__name__)
# # cls = CLS_Estimator(obj_func = sin_func, x0 = x0, constraints = constraint_func(X_))
# cls = CLS_Estimator(obj_func = sin_func, x0 = [0.001]*(d1+d2+extra[-1]+1), constraints = constraint_func(X_))
# cv_result = cross_validate(cls, X_, y_lag2, cv=cv_outer, scoring = 'neg_mean_squared_error')

# mse_s0 = cv_result['test_score']
# mse_s0[:5]
# mse_st = cv_result['test_score']
# mse_st[:5]

# Collect the parameter vector estimated on each training window.
a = []
for train_index, test_index in cv_outer.split(X_):
    X_train, X_test = X_.iloc[train_index, :], X_.iloc[test_index, :]
    y_train, y_test = y_lag2.iloc[train_index], y_lag2.iloc[test_index]
    # Every split starts from the same x0 computed above.
    cls = CLS_Estimator(obj_func = func, x0 = x0, constraints = constraint_func(X_))
#     cls = CLS_Estimator(obj_func = sin_func, x0 = [0.001]*(d1+d2+extra[-1]+1), constraints = constraint_func(X_))
    cls.fit(X_train, y_train)
    a.append(cls.params_)
a = pd.DataFrame(a)
a.to_excel('coefs_g8_newcay.xlsx', sheet_name = 'both')

In [None]:
# Rolling OLS benchmark: fit a linear regression on every training window of
# cv_outer and store (intercept, coefficients), one row per split.
# NOTE(review): `y_lag2` is only assigned in commented-out code in this
# notebook, and `X_` is whatever an earlier cell left behind -- confirm.
ols = LinearRegression()  # own estimator instead of the `lr` left over from an earlier cell
coef_rows = []
for train_index, _ in cv_outer.split(X_):
    X_train = X_.iloc[train_index, :]
    y_train = y_lag2.iloc[train_index]
    ols.fit(X_train, y_train)
    coef_rows.append(np.append(ols.intercept_, ols.coef_))
a = pd.DataFrame(coef_rows)
a.to_excel('linear_func.xlsx', sheet_name = 'both')

## Generate OOS MSE

### project 2

In [None]:
fun_list = [
            sin_func,
            cos_func,
            exp_func,
            exp_shift_func,
            poly_func
           ]

In [None]:
rr = df['rr']
rfree = df['rfree']
station_n = pd.DataFrame()

### CER

In [None]:
# One-step-ahead forecasts of rr for every (link function, predictor pair),
# plus the sample-mean benchmark forecasts under the key 'SM'.
P2_pred = {}

# benchmark model: sm -- its inputs (rr, start date, cv_outer) do not change
# inside the loops, so it is computed once instead of once per fold.
sm_pred, sm_mse = bench.sample_mean(rr, "1988-01-01", cv_outer = cv_outer)
P2_pred['SM'] = sm_pred

# Purely nonlinear specification: no linear regressors.
station_n = pd.DataFrame()

for i in fun_list:
    print(i.__name__)
    for j in cointe_ar1:
        # Prepare X
        X_ = j.join(station_n)
        # Depends only on (i, j); d1 must stay a notebook-level name because
        # constraint_func reads it when the constraint is evaluated.
        d1, d2, extra= dimensions(j,station_n, i.__name__)
        pred_list = []
        for train_index, test_index in cv_outer.split(X_):
            X_train, X_test = X_.iloc[train_index, :], X_.iloc[test_index, :]
            rr_train = rr.iloc[train_index]

            nlr = CLS_Estimator(obj_func = i, x0 = [0.001]*(d1+d2+extra[-1]+1), constraints = constraint_func(j))
            nlr.fit(X_train, rr_train)
            # Each test fold has one observation; keep its single forecast.
            pred_list.append(nlr.predict(X_test)[0])
        P2_pred[(i.__name__, j.name)] = pred_list

In [None]:
P2_pred = pd.DataFrame.from_dict(P2_pred)
P2_pred

In [None]:
P2_pred = pd.read_excel('results/P2_signif/p2_pred.xlsx', header=[0,1], index_col=[0])

In [None]:
sm_pred

In [None]:
# Variance of the last 20 training observations of rr, one value per
# training window of cv_outer.
sigma2 = []
for train_index, _ in cv_outer.split(X_):
    rr_train = rr.iloc[train_index]
    sigma2.append((np.std(rr_train[-20:]))**2)

# The constant 5 appeared both in the weight (1/5) and in the CER penalty
# (0.5*5) -- presumably the risk-aversion coefficient; confirm.
RISK_AVERSION = 5
# Portfolio weights are restricted to [0, MAX_WEIGHT].
MAX_WEIGHT = 1.5

Rp_dict = {}
CER = {}
for j in P2_pred.columns:
    w_raw = (1/RISK_AVERSION)*(np.asarray(P2_pred[j])/np.asarray(sigma2))
    # np.clip replaces the element-by-element if/elif loop (NaN passes through, as before).
    w = np.clip(w_raw, 0, MAX_WEIGHT)
    Rp = rr['1988-03-01':]*w+rfree['1988-03-01':]
    Rp_dict[j] = Rp
    CER_cal = np.mean(Rp) - 0.5*RISK_AVERSION*(np.std(Rp)**2)
    CER[j] = CER_cal

In [None]:
Rp_dict = pd.DataFrame.from_dict(Rp_dict)
Rp_dict

In [None]:
Rp_dict.to_excel('Rp.xlsx')

In [None]:
CER = pd.DataFrame.from_dict(CER, orient = 'index', columns = ['CER'])

In [None]:
CER['delta_CER'] = CER['CER'] - CER.loc['SM'].values

In [None]:
CER.to_excel('results/CER.xlsx')

In [None]:
    # Weights implied by the sample-mean forecast, restricted to [0, 1.5].
    # np.clip replaces the element-by-element if/elif loop; the result is
    # converted back to a list so later cells see the same type.
    w_raw = (1/5)*(np.asarray(sm_pred)/np.asarray(sigma2))
    w_sm = np.clip(w_raw, 0, 1.5).tolist()

In [None]:
len(w_sm[:56])

In [None]:
Rp_sm = w_sm[:56]*rr['1988-03-01':'2001-12-01']+rfree['1988-03-01':'2001-12-01']
CER_sm = np.mean(Rp_sm) - 0.5*5*(np.std(Rp_sm)**2)
CER_sm

In [None]:
a = w_sm*rr['1988-03-01':]

In [None]:
# Sample-mean strategy over the 1996-03-01 .. 2011-12-01 slice: `a` holds
# w_sm*rr, rfree is added back, then CER = mean - 0.5*5*variance.
Rp_sm = a['1996-03-01':'2011-12-01']+rfree['1996-03-01':'2011-12-01']
# The std argument was a garbled paste (`Rp:'2001-12-01'_sm`) that did not parse.
CERp = np.mean(Rp_sm) - 0.5*5*(np.std(Rp_sm)**2)
CERp

In [None]:
# Out-of-sample squared errors of the purely nonlinear models, plus the
# sample-mean benchmark under the key 'SM'.
P2_MSE = {}

# benchmark model: sm -- its inputs do not change inside the loops, so it is
# computed once instead of once per (function, pair).
sm_pred, sm_mse = bench.sample_mean(y, "1988-01-01", cv_outer = cv_outer)
P2_MSE['SM'] = sm_mse

# Purely nonlinear specification: no linear regressors.
station_n = pd.DataFrame()

for i in fun_list:
    print(i.__name__)
    for j in cointe_ar1:
        # Kept from the original cell; the fit below uses j only.
        X_ = j.join(station_ar1)

        # d1 must stay a notebook-level name: constraint_func reads it when evaluated.
        d1, d2, extra= dimensions(j,station_n, i.__name__)

        nlr = CLS_Estimator(obj_func = i, x0 = [0.001]*(d1+d2+extra[-1]+1), constraints = constraint_func(j))
        cv_nonlinear = cross_validate(nlr, j, y, cv=cv_outer, scoring = 'neg_mean_squared_error')
        # The scorer returns negative MSE; flip the sign back.
        P2_MSE[(i.__name__, j.name)] = -cv_nonlinear['test_score']

In [None]:
P2_MSE = pd.DataFrame.from_dict(P2_MSE)
P2_MSE.head()

In [None]:
# For every model column: 1 - (sum of the last t model errors) /
# (sum of the last t 'SM' errors), for t = 1 .. number of rows of P2_MSE.
R2_dict = {}
for i, j in itertools.product(fun_list, cointe_ar1):
    model_key = (i.__name__, j.name)
    tail_lengths = range(1, P2_MSE.shape[0] + 1)
    target_sum = [P2_MSE[model_key][-t:].sum() for t in tail_lengths]
    cumu_sum = [P2_MSE['SM'][-t:].sum() for t in tail_lengths]
    cumu_R2 = [1 - model_total/sm_total for model_total, sm_total in zip(target_sum, cumu_sum)]
    R2_dict[model_key] = cumu_R2

In [None]:
P2_R2 = pd.DataFrame.from_dict(R2_dict)

In [None]:
P2_R2.to_excel('P2_R2.xlsx')

### Starting values: from the Taylor-expansion regressions

In [None]:
X_ = co1.join(station_ar1)
for train_index, test_index in cv_outer.split(X_):
    X_train, X_test = X_.iloc[train_index, :], X_.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(X_test, y_test)

In [None]:
cls = CLS_Estimator(obj_func = i, x0 = x0, constraints = constraint_func(X_), options={'maxiter':1000000})

In [None]:
# Out-of-sample squared errors of the target model (CLS with linear
# regressors) and of all benchmarks, written to one Excel sheet per link
# function.

##################################### benchmarks: computed once ######################################
# None of these depends on the link function or the predictor pair, so they
# are evaluated a single time and copied into every pair's rows below.
lr = LinearRegression()

# benchmark model: sm
sm_pred, sm_mse = bench.sample_mean(y, "1988-01-01", cv_outer = cv_outer)

# benchmark model: AR1
ar1 = df['y_lag']
cv_ar1 = cross_validate(lr, ar1.values.reshape(-1, 1), y, cv=cv_outer, scoring = 'neg_mean_squared_error')

# AR2
ar2 = df[['y_lag','y_2lag']]
cv_ar2 = cross_validate(lr, ar2, y, cv=cv_outer, scoring = 'neg_mean_squared_error')

# benchmark model: AR+cay
ar_cay = df[['y_lag','new_cay']]
cv_cay = cross_validate(lr, ar_cay, y, cv=cv_outer, scoring = 'neg_mean_squared_error')

benchmark_mse = {'SM_MSE': sm_mse,
                 'AR1_MSE': -cv_ar1['test_score'],
                 'AR2_MSE': -cv_ar2['test_score'],
                 'AR_cay_MSE': -cv_cay['test_score']}

# Purely nonlinear benchmark: no linear regressors.
station_n = pd.DataFrame()

# The context manager saves and closes the workbook; ExcelWriter.save() was
# removed in pandas 2.0.
with pd.ExcelWriter('MSE_taylor_newcay.xlsx', engine='xlsxwriter') as writer_MSE:
    for i in fun_list:
        print(i.__name__)
        for j in cointe_ar1:
            # Prepare X
            X_ = j.join(station_ar1)

            # Start values from the pre-1988 sample only
            x0 = Taylor_init(j.loc[:"1988-01-01"], station_ar1.loc[:"1988-01-01"], y.loc[:"1988-01-01"], i)
            # Target model (d1 must stay a notebook-level name for constraint_func)
            d1, d2, extra= dimensions(j,station_ar1, i.__name__)

            cls = CLS_Estimator(obj_func = i, x0 = x0, constraints = constraint_func(X_), options={'maxiter':1000000})
            cv_result = cross_validate(cls, X_, y, cv=cv_outer, scoring = 'neg_mean_squared_error')
            # Single .loc call with a (pair, model) row key; the chained
            # `.loc[...].loc[...] = ...` form could write into a temporary copy.
            oos_MSE.loc[(j.name, 'CLS_MSE'), :] = -cv_result['test_score']
            print(j.name, 'finish cls')

            # benchmark model: Nonlinear
            d1, d2, extra= dimensions(j,station_n, i.__name__)
            x0_n = Taylor_init(j.loc[:"1988-01-01"], station_n, y.loc[:"1988-01-01"], i)

            nlr = CLS_Estimator(obj_func = i, x0 = x0_n, constraints = constraint_func(j), options={'maxiter':1000000})
            cv_nonlinear = cross_validate(nlr, j, y, cv=cv_outer, scoring = 'neg_mean_squared_error')
            oos_MSE.loc[(j.name, 'NLS_MSE'), :] = -cv_nonlinear['test_score']
            print(j.name, 'finish nls')

            # Copy the loop-invariant benchmarks into this pair's rows.
            for model_label, mse_values in benchmark_mse.items():
                oos_MSE.loc[(j.name, model_label), :] = mse_values

        # One sheet per link function, written once all pairs are filled in
        # (previously rewritten after every pair).
        oos_MSE.T.to_excel(writer_MSE, sheet_name=i.__name__)

## $R^2$ Statistics

### oos $R^2$

In [None]:
base = ['SM',
       'NLS',
       'AR1',
       'AR2',
       'AR_cay']

In [None]:
co_list = ['co1', 'co2', 'co3', 'co4']
co_dict = {'co1': 'dy and dp',
          'co2': 'tbl and lty',
          'co3': 'dp and ep',
          'co4': 'BAA and AAA'}

In [None]:
# oos_MSE = pd.read_excel('MSE_taylor_0831.xlsx', header=[0,1], index_col=[0])

In [None]:
# oos_MSE

In [None]:
# Empty table for the cumulative out-of-sample R^2: one row per
# (predictor pair, benchmark) and one column per date in `rows`.
# This first assignment is overwritten by the constructor call at the end of the cell.
cumulate_R2 = pd.DataFrame()

# NOTE(review): this slices oos_MSE by date labels, which requires oos_MSE to
# be indexed by date (e.g. as read back from the MSE workbook in the
# commented-out read_excel cell). The frame built earlier in this notebook has
# dates on its columns and (Variable, Model) on its rows -- confirm which
# object is intended here.
rows = oos_MSE.loc["1988-01-01":"2018-12-01"].index
sec_columns_R2 = ['SM', 'NLS', 'AR1', 'AR2', 'AR_cay']
multi_columns_R2 = pd.MultiIndex.from_product([['co1', 'co2', 'co3', 'co4'], sec_columns_R2],
                                                  names=['Variable', 'Model'])      
cumulate_R2 = pd.DataFrame(index = multi_columns_R2, columns = rows)

In [None]:
cumulate_R2

In [None]:
# Cumulative out-of-sample R^2 of the CLS model against each benchmark:
# 1 - (sum of the first t CLS errors) / (sum of the first t benchmark errors).
# One Excel sheet per link function.
#
# The context manager saves and closes the workbook; ExcelWriter.save() was
# removed in pandas 2.0.
with pd.ExcelWriter('R2_taylor_newcay.xlsx', engine='xlsxwriter') as writer_R2:
    for f in fun_list:
        oos_MSE = pd.read_excel('MSE_taylor_newcay.xlsx', header=[0,1], index_col=0, sheet_name = f.__name__)
        for i, j in itertools.product(base, co_list):
            k = i + '_MSE'
            target_mse = oos_MSE[j]['CLS_MSE']
            benchmark_col = oos_MSE[j][k]
            n_obs = len(benchmark_col)
            target_sum = [target_mse[:(R+1)].sum() for R in range(n_obs)]
            cumu_sum = [benchmark_col[:(R+1)].sum() for R in range(n_obs)]
            cumu_R2 = [1 - model_total/bm_total for model_total, bm_total in zip(target_sum, cumu_sum)]
            # Single .loc call with a (pair, benchmark) row key; the chained
            # `.loc[j].loc[i] = ...` form could write into a temporary copy.
            cumulate_R2.loc[(j, i), :] = cumu_R2
        cumulate_R2.T.to_excel(writer_R2, sheet_name=f.__name__)

In [None]:
# One PNG per (link function, predictor pair, benchmark) showing the
# cumulative out-of-sample R^2 from 1989-03-01 onwards.
parent = os.getcwd()
folder = 'OOS_plots/taylor_newacy'
path = os.path.join(parent, folder)
if not os.path.exists(path):
    os.makedirs(path)

for k in fun_list:
    R2 = pd.read_excel('R2_taylor_newcay.xlsx', header=[0,1], index_col=0, sheet_name = k.__name__)['1989-03-01':]
    for i,j in itertools.product(co_list, base):
        fig = plt.figure(figsize = (12,8))
        plt.plot(R2[i][j])
        # Zero line: above it the model beats the benchmark.
        plt.axhline(y=0, color='r', linestyle='--')
        # k.__name__[:-5] strips the trailing '_func'.
        plt.title('1-Step OOS: '+ co_dict[i] + ' (' + 'Model: '+ k.__name__[:-5] + '; BM:' + j + ')', fontsize=20)
        plt.ylabel("$R^2_{OOS}$", fontsize=16)
        plt.savefig(os.path.join(path, k.__name__[:-5] + '_' + i + '_' + j))
        # Close the figure once it is on disk; otherwise every figure of the
        # loop stays open at the same time. Closed figures are not rendered
        # inline -- open the saved PNGs instead.
        plt.close(fig)

## Regenerate results for project 2

In [None]:
# Cumulative out-of-sample R^2 of the NLS model against each benchmark,
# accumulated from the end of the sample backwards:
# 1 - (sum of the last t NLS errors) / (sum of the last t benchmark errors).
#
# The context manager saves and closes the workbook; ExcelWriter.save() was
# removed in pandas 2.0.
with pd.ExcelWriter('results/OOS_R2_project2.xlsx', engine='xlsxwriter') as writer_R2:
    for f in fun_list:
        # fun_list holds callables in some cells and plain names in others;
        # accept both so the sheet name is always a string.
        sheet = f if isinstance(f, str) else f.__name__
        oos_MSE = pd.read_excel('results/OOS_MSE_start0.xlsx', header=[0,1], index_col=0, sheet_name = sheet)
        for i, j in itertools.product(base, co_list):
            k = i + '_MSE'
            target_mse = oos_MSE[j]['NLS_MSE']
            benchmark_col = oos_MSE[j][k]
            n_obs = len(benchmark_col)
            target_sum = [target_mse[-(R+1):].sum() for R in range(n_obs)]
            cumu_sum = [benchmark_col[-(R+1):].sum() for R in range(n_obs)]
            cumu_R2 = [1 - model_total/bm_total for model_total, bm_total in zip(target_sum, cumu_sum)]
            # Single .loc call with a (pair, benchmark) row key; the chained
            # form could write into a temporary copy.
            cumulate_R2.loc[(j, i), :] = cumu_R2
        cumulate_R2.T.to_excel(writer_R2, sheet_name=sheet)

In [None]:
# Link functions by name (strings, not callables): the cells that follow use
# the entries as Excel sheet names and slice them with k[:-5].
# This rebinds `fun_list`, which earlier cells fill with function objects.
fun_list = ['sin_func',
            'cos_func',
            'scaled_sin_func',
            'scaled_cos_func',
            'exp_shift_func',
            'exp_func',
            'poly_func']

In [None]:
# One PNG per (link function, predictor pair): cumulative out-of-sample R^2
# against the sample-mean benchmark, read from the project-2 workbook.
# `fun_list` must hold function names (strings) here.
parent = os.getcwd()
folder = 'OOS_plots/project2'
path = os.path.join(parent, folder)
if not os.path.exists(path):
    os.makedirs(path)

for k in fun_list:
    R2 = pd.read_excel('results/OOS_R2_project2.xlsx', header=[0,1], index_col=0, sheet_name = k)
    for i in co_list:
        fig = plt.figure(figsize = (12,8))
        plt.plot(R2[i]['SM'])
        # Zero line: above it the model beats the benchmark.
        plt.axhline(y=0, color='r', linestyle='--')

        # k[:-5] strips the trailing '_func'.
        plt.title('1-Step OOS: '+ i + ' (' + 'Model: '+ k[:-5] + '; BM:' + 'SM' + ', start:0)', fontsize=20)
        plt.ylabel("$R^2_{OOS}$", fontsize=16)
        plt.savefig(os.path.join(path, k[:-5] + '_' + i + '_' + 'SM' + '_0'))
        # Close the figure once it is on disk so the loop does not keep every
        # figure open at once. Closed figures are not rendered inline.
        plt.close(fig)

# Plot U

In [None]:
results = pd.read_excel('results/full_sample.xlsx', header=[0,1], index_col=[0,1])
results_taylor = pd.read_excel('results/Taylor_fullsample.xlsx', header=[0,1], index_col=[0,1])

In [None]:
# Plot the single-index series for every function name and every entry of
# cointe_ar2, using the stored 'CLS' param_1/param_2 values, one figure per pair.
parent = os.getcwd()
folder = 'single_index'
path = os.path.join(parent, folder)
os.makedirs(path, exist_ok=True)  # no-op when the directory already exists

for k in fun_list:
    for i,j in enumerate(cointe_ar2):
        fig = plt.figure(figsize = (8,6))
        # Parameters stored for function k and the i-th variable set.
        # NOTE(review): assumes co_list and cointe_ar2 are in the same order — confirm.
        params = results_taylor['CLS'][['param_1', 'param_2']].loc[k].loc[co_list[i]]
        plt.plot(single_index(j)(params))
        plt.title('single-index: '+ co_list[i] + ' (' + 'Model: '+ k[:-5] + ')', fontsize=20)
        plt.savefig(os.path.join(path, k[:-5] + '_' + co_list[i]))
        # Render the figure now, then release it so the loop does not
        # accumulate open figures.
        plt.show()
        plt.close(fig)

# In-sample $R^2$

In [None]:
# Empty frame that later cells pass as the second argument of dimensions().
# NOTE(review): presumably stands for "no stationary regressors" — confirm.
station_n = pd.DataFrame()

In [None]:
# In-sample R^2 of the constrained least-squares fit for every
# (function, variable set) pair, collected in one frame indexed by
# ("function", "variables") and written to a single workbook.
R2_frames = []
for i, j in itertools.product(fun_list, cointe_ar1):
    # Set up dimensions.
    # NOTE(review): d1/d2/extra/initial_len do not feed the fit below (x0 comes
    # from Taylor_init); kept so the kernel-level values match the previous
    # behaviour — confirm whether they can be dropped.
    d1, d2, extra= dimensions(j,station_n, i.__name__)
    initial_len = d1+d2+extra[-1]+1
    # Prepare X
    X_ = j.join(station_ar1)
    # Fit models
    cls = CLS_Estimator(obj_func = i, x0 = Taylor_init(j, station_ar1, y, i), constraints = constraint_func(X_))
    cls.fit(X_, y)
    # 1 - SSR / SST on the estimation sample
    R2_ins = 1 - np.sum((cls.predict(X_) - np.array(y))**2)/np.sum((np.array(y)-np.mean(y))**2)
    # One-row frame built with its value in place; the chained
    # .loc[...].loc[...] = ... form may assign into a temporary copy.
    multi_index = pd.MultiIndex.from_product([[i.__name__], [j.name]], names=["function", "variables"])
    R2_frames.append(pd.DataFrame({'in_sample R2': [R2_ins]}, index = multi_index))

# Concatenate once: DataFrame.append no longer exists in pandas 2.x and
# re-copied the accumulated frame on every iteration.
R2_insample = pd.concat(R2_frames, sort = False) if R2_frames else pd.DataFrame()

# Write the finished table once; the context manager saves and closes the file.
with pd.ExcelWriter('insample_R2_newcay.xlsx', engine='xlsxwriter') as writer_R3:
    R2_insample.to_excel(writer_R3)

# NLS significance of $R^2$

In [None]:
# Output directory for the significance results: <cwd>/result/P2_signif.
# NOTE(review): other cells read and write under 'results/' — confirm that
# 'result' (singular) is the intended parent folder.
parent = os.getcwd()
folder = 'P2_signif'
path = os.path.join(parent, 'result', folder)
already_there = os.path.exists(path)
if not already_there:
    os.makedirs(path)

In [None]:
# Candidate objective functions as callables (defined outside this cell);
# the cells below key their results by each function's __name__.
fun_list = [
    sin_func, cos_func,
    scaled_sin_func, scaled_cos_func,
    exp_func, exp_shift_func,
    poly_func, linear_func,
]

In [None]:
# Out-of-sample predictions for every (function, variable set) pair.
# For each split produced by cv_outer a fresh estimator is fitted on the
# training rows and its predictions for the test rows are stacked in split
# order; result_dict maps (function name, variable-set name) -> 1-D array.
result_dict = {}
station_n = pd.DataFrame()  # empty second argument for dimensions()
for i,j in itertools.product(fun_list, cointe_ar1):
    # The parameter count depends only on (i, j), not on the split, so it is
    # computed once per pair instead of once per fold.
    d1, d2, extra= dimensions(j,station_n, i.__name__)
    n_params = d1+d2+extra[-1]+1
    fold_preds = []
    for train, test in cv_outer.split(j):
        # CLS without linear part
        nlr = CLS_Estimator(obj_func = i, x0 = [0.001]*n_params, constraints = constraint_func(j))
        nlr.fit(j.iloc[train,:],y.iloc[train])
        fold_preds.append(np.ravel(nlr.predict(j.iloc[test,:])))
    # One concatenation instead of np.append per fold (which re-copies the
    # whole array each time); the leading empty array keeps the result a
    # float 1-D array even when there are no splits.
    pred = np.concatenate([np.empty(0)] + fold_preds)
    result_dict[(i.__name__, j.name)] = pred

In [None]:
# One column per (function name, variable-set name) key of result_dict;
# the table is also written to the directory held in `path`.
p2_pred = pd.DataFrame(result_dict)
p2_pred_file = path+'/p2_pred.xlsx'
p2_pred.to_excel(p2_pred_file, engine='xlsxwriter')

In [None]:
# Give the stacked predictions the index of the last len(p2_pred) observations
# of y and store those observations next to them as 'EQP'.
# The window length is taken from p2_pred itself rather than a hard-coded 124,
# so the cell keeps working when the sample or the CV scheme changes.
n_oos = len(p2_pred)
y_oos = y.iloc[len(y) - n_oos:]  # positional tail; also correct when n_oos == 0
p2_pred.index = y_oos.index
p2_pred['EQP'] = y_oos

In [None]:
# Benchmark predictions from bench.sample_mean, stored as the 'SM' column.
# NOTE(review): "1988-01-01" is presumably the start of the out-of-sample
# window — confirm that sm_pred lines up with the rows of p2_pred.
# sm_mse is returned alongside but not used in this cell.
sm_pred, sm_mse = bench.sample_mean(y, "1988-01-01", cv_outer = cv_outer)
p2_pred['SM'] = sm_pred

In [None]:
# Per-period loss differential between the 'SM' benchmark and each model:
#   (y - y_sm)^2 - (y - y_hat)^2 + (y_sm - y_hat)^2
# (this has the form of an MSPE-adjusted, Clark-West-style differential).
sig_dict = {}
for i,j in itertools.product(fun_list, cointe_ar1):
    y_true = p2_pred['EQP']
    y_sm = p2_pred['SM']
    y_hat = p2_pred[i.__name__][j.name]
    benchmark_error = y_true - y_sm
    model_error = y_true - y_hat
    forecast_gap = y_sm - y_hat
    key = (i.__name__, j.name)
    sig_dict[key] = benchmark_error**2 - model_error**2 + forecast_gap**2

In [None]:
# Loss differentials as a frame: one column per (function name, variable-set name).
sig_df = pd.DataFrame(sig_dict)
sig_df.head()

In [None]:
import statsmodels.api as sm 

In [None]:
# Expanding-window significance of the mean loss differential.
# For every (function, variable set) column of sig_df, the first k rows are
# regressed on a constant for k = 4 .. n, and the p-value that OLS reports for
# that constant is recorded; tp_dict maps (function name, variable-set name)
# to the list of p-values.
# NOTE(review): OLS p-values are two-sided — confirm that is the intended test.
tp_dict = {}
n_obs = sig_df.shape[0]
for i,j in itertools.product(fun_list, cointe_ar1):
    series = sig_df[i.__name__][j.name]
    p_val = []
    # The range ends at n_obs + 1 so the last window uses every observation;
    # range(4, n_obs) stopped one row short of the full sample.
    for k in range(4, n_obs + 1):
        window = series.iloc[:k]  # first k rows, by position
        result = sm.OLS(window, np.ones((k, 1))).fit()
        # np.asarray makes the positional lookup independent of whether
        # statsmodels returns a labelled Series or a plain array.
        p_val.append(np.asarray(result.pvalues)[0])
    tp_dict[i.__name__, j.name] = p_val

In [None]:
# p-values as a frame: one row per window, one column per (function, variable set).
p_df = pd.DataFrame(tp_dict)
p_df.tail()

In [None]:
# tp_df.to_excel('tp_vals.xlsx')

In [None]:
# For each row of p_df, count how many p-values are at or below 0.1.
small_ps = [
    np.sum([1 if p <= 0.1 else 0 for p in row.values])
    for _, row in p_df.iterrows()
]

In [None]:
# Row position followed by its count of p-values <= 0.1.
for position, n_small in enumerate(small_ps):
    print(position, n_small)

In [None]:
# Write the last row of p_df (the p-values of the final window) to its own workbook.
final_window_p = p_df.iloc[-1]
final_window_p.to_excel('sig_20181201.xlsx')

In [None]:
# Series plotted in the figure cells below, read with the first column as index.
# NOTE(review): the provenance of 'frustrated.xlsx' is not visible here — document how it is produced.
dfp = pd.read_excel('frustrated.xlsx', index_col = 0)
dfp.head()

In [None]:
def plot_oos_r2(frame, column, filename, xmax=130):
    """Plot one column of ``frame`` with a red zero line and save the figure.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the series to plot.
    column : str
        Name of the column to draw.
    filename : str
        Path of the image file to write.
    xmax : float, default 130
        Right end of the horizontal zero line (the left end is 0).

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was saved; the caller decides when to close it.
    """
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,8))
    # Draw on the axes created above instead of relying on the implicit current axes.
    frame[column].plot(ax=ax)
    ax.hlines(y = 0, xmin = 0, xmax = xmax, colors='r')
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_ylabel('$R^2_{OOS}$', fontsize=18)
    fig.savefig(fname = filename)
    return fig


# (column of frustrated.xlsx, output image) for each figure that used to be
# produced by its own copy-pasted cell.
OOS_R2_FIGURES = [
    ('Figure3', 'cwy(f4).png'),
    ('b', 'co2f4.png'),
    ('c', 'co2f5.png'),
    ('d', 'co1f3.png'),
    ('e', 'co3f3.png'),
    ('f', 'co4f3.png'),
]

dfp = pd.read_excel('frustrated.xlsx', index_col = 0)

for column, filename in OOS_R2_FIGURES:
    fig = plot_oos_r2(dfp, column, filename)
    # Render the figure in the notebook, then release it.
    plt.show()
    plt.close(fig)

In [None]:
|