In [632]:
# Load dependencies
import csv
import pandas as pd
import numpy as np
import os
import gc
import warnings
import itertools as it
import DataProcessFunctions as DP
import PredictionStep1 as pred
import SupportFunctions as supp
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression as lr
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras import callbacks
from matplotlib.pyplot import cm
import time as time 


%load_ext autoreload
%autoreload 2

np.set_printoptions(suppress=True)
warnings.simplefilter(action='ignore', category=FutureWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Container that collects warning-log entries; starts out empty
log = []

In [501]:
# Load pre-processed data produced by "Data Processing PCA.ipynb".
# The CSV files are expected in the parent of the current working directory.
# Paths are built with os.path.join instead of a hard-coded '\\' separator so
# the notebook also runs on non-Windows systems.
data_dir = os.path.dirname(os.getcwd())

FM_data = pd.read_csv(os.path.join(data_dir, 'FM2_data.csv'))
returns = pd.read_csv(os.path.join(data_dir, 'returns2_data.csv')).set_index(["permno", "date"])
industry_code = pd.read_csv(os.path.join(data_dir, 'industry2_codes.csv')).set_index(["permno", "date"])

# Reduce memory footprint. The return value is not used, so supp.downcast
# presumably modifies the frame in place (it prints the before/after usage).
supp.downcast(FM_data)
supp.downcast(returns)
supp.downcast(industry_code)

Before downcast: 1.769 GB and float64    102
int64        3
dtype: int64
After downcast: 0.878 GB and float32    102
int32        2
int8         1
dtype: int64
Before downcast: 0.026 GB and float64    1
dtype: int64
After downcast: 0.018 GB and float32    1
dtype: int64
Before downcast: 0.026 GB and float64    1
dtype: int64
After downcast: 0.018 GB and float32    1
dtype: int64


In [4]:
# Set number of iterations
# NOTE(review): the original note reads "30 splits --> iteration = [0;29]", yet
# range(ite) with ite = 29 only runs iterations 0..28 -- confirm whether the
# last split is left out on purpose before changing this value.
ite = 29

# Initialize arrays to store loss and explained variation
# (one row per model, one column per iteration)
loss = pd.DataFrame(columns = range(ite), index = ["LR", "Lasso", "NN"])
xplained_variation = pd.DataFrame(columns = range(ite), index = ["LR", "Lasso", "NN"])

# Initialize arrays to store annual loss and explained variation
loss_annual = pd.DataFrame(columns = range(ite), index = ["LR", "Lasso", "NN"])
xplained_variation_annual = pd.DataFrame(columns = range(ite), index = ["LR", "Lasso", "NN"])

# Per-iteration frames of predicted and actual returns. They are collected in
# plain lists and concatenated once after the loop: DataFrame.append copied the
# whole accumulated frame on every call and no longer exists in pandas >= 2.0.
# If the loop is interrupted, the partial results remain available in these lists.
LR_frames = []
lasso_frames = []
NN_frames = []

for i in range(ite):

    # Compute training, validation, and test set
    training, validation, test = DP.complete_data_process(industry_code, returns, FM_data, iteration = i)

    # Split in X and Y
    training_x, training_y = pred.XY_split(training)
    validation_x, validation_y = pred.XY_split(validation)
    test_x, test_y = pred.XY_split(test)

    # ---------------------------

    # Algorithm 1: Simple Linear (PCR in Gu, Kelly, and Xiu (2020) due to PCA)
    # Fit model on training set & predict on test set
    LR = lr().fit(training_x, training_y)
    LR_pred = LR.predict(test_x)

    # Algorithm 1: Compute loss and explained variation, and combine
    # actual and predicted returns. Store all.
    LR_loss, LR_explained_var, LR_pred_actual_temp, LR_loss_annual, LR_explained_var_annual = pred.to_append(LR_pred, test_y)
    loss.iloc[0, i] = LR_loss
    loss_annual.iloc[0, i] = LR_loss_annual
    xplained_variation.iloc[0, i] = LR_explained_var
    xplained_variation_annual.iloc[0, i] = LR_explained_var_annual
    LR_frames.append(LR_pred_actual_temp)

    # ---------------------------

    # Algorithm 2: LASSO
    # Fit model on training set and select tuning parameter based on validation set
    lambda_grid = pred.lambda_grid(training_x, training_y)
    loss_validation = []

    for lamb in lambda_grid:
        lasso = Lasso(alpha = lamb, tol = 0.001).fit(training_x, training_y)
        lasso_pred = lasso.predict(validation_x)
        loss_validation.append(pred.loss_function(lasso_pred, validation_y))

    # Fit model with error minimizing tuning parameter
    # NOTE(review): the grid search above uses tol = 0.001 while this refit uses
    # the estimator's default tol -- confirm the difference is intended.
    lambda_min = lambda_grid[loss_validation.index(min(loss_validation))]
    lasso_min = Lasso(alpha = lambda_min).fit(training_x, training_y)
    lasso_min_pred = lasso_min.predict(test_x)

    # Algorithm 2: Storing
    lasso_loss, lasso_explained_var, lasso_pred_actual_temp, lasso_loss_annual, lasso_explained_var_annual = pred.to_append(lasso_min_pred, test_y)
    loss.iloc[1, i] = lasso_loss
    loss_annual.iloc[1, i] = lasso_loss_annual
    xplained_variation.iloc[1, i] = lasso_explained_var
    xplained_variation_annual.iloc[1, i] = lasso_explained_var_annual
    lasso_frames.append(lasso_pred_actual_temp)

    # ---------------------------

    # Algorithm 3: NN
    # Drop the Keras global state left behind by the previous iteration's model;
    # building many models in a loop otherwise keeps growing memory usage.
    tf.keras.backend.clear_session()

    # Build NN architecture (L1 regularization and batch normalization)
    model = pred.NN(training_x)

    # Define callback for early stopping
    # NOTE(review): restore_best_weights is left at its default, so predictions
    # come from the weights of the last epoch run rather than the epoch with the
    # lowest val_loss -- confirm this is intended.
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

    # Fit model
    model.fit(training_x, training_y, epochs = 50, batch_size = 500, verbose = 0, validation_data = (validation_x, validation_y), callbacks = [callback])

    # Compute predictions
    NN_pred = model.predict(test_x)

    # Algorithm 3: Storing
    NN_loss, NN_explained_var, NN_pred_actual_temp, NN_loss_annual, NN_explained_var_annual = pred.to_append(NN_pred, test_y)
    loss.iloc[2, i] = NN_loss
    loss_annual.iloc[2, i] = NN_loss_annual
    xplained_variation.iloc[2, i] = NN_explained_var
    xplained_variation_annual.iloc[2, i] = NN_explained_var_annual
    NN_frames.append(NN_pred_actual_temp)

    # ---------------------------

    # Free memory
    del training
    del validation
    del test
    del training_x
    del training_y
    del validation_x
    del validation_y
    del test_x
    del test_y

    gc.collect()

# Predicted and actual returns of all iterations, used in portfolio sorts later.
# Assumes pred.to_append returns a DataFrame as its third value -- TODO confirm.
# Fall back to an empty frame when no iteration ran (pd.concat rejects an empty list).
LR_pred_actual = pd.concat(LR_frames) if LR_frames else pd.DataFrame()
lasso_pred_actual = pd.concat(lasso_frames) if lasso_frames else pd.DataFrame()
NN_pred_actual = pd.concat(NN_frames) if NN_frames else pd.DataFrame()

In [329]:
# Return for each decile at all points in time, one frame per model
LR_step_1, lasso_step_1, NN_step_1 = (
    pred.portfolio_sorts_1(pred_actual)
    for pred_actual in (LR_pred_actual, lasso_pred_actual, NN_pred_actual)
)


def _as_log_returns(step_1):
    """Return a copy of `step_1` whose `ret` column holds log(1 + ret) and is named `log_ret`."""
    log_frame = step_1.assign(ret = np.log(1 + step_1.ret))
    return log_frame.rename(columns = {"ret": "log_ret"})


# Log returns (percentage returns cannot simply be accumulated)
LR_step_1_log = _as_log_returns(LR_step_1)
lasso_step_1_log = _as_log_returns(lasso_step_1)
NN_step_1_log = _as_log_returns(NN_step_1)

# Accumulated return of all deciles for both
# predicted and actual returns
LR_cum = pred.portfolio_sorts_acc_return(LR_step_1_log)
lasso_cum = pred.portfolio_sorts_acc_return(lasso_step_1_log)
NN_cum = pred.portfolio_sorts_acc_return(NN_step_1_log)

# Monthly average return and std. deviation, and annualized SR
# of both predicted and actual returns
LR_mean, LR_std, LR_sr = pred.portfolio_sorts_SR(LR_step_1)
lasso_mean, lasso_std, lasso_sr = pred.portfolio_sorts_SR(lasso_step_1)
NN_mean, NN_std, NN_sr = pred.portfolio_sorts_SR(NN_step_1)


In [697]:
# Figure: cumulative return of all deciles, one figure per ML model.
# Every figure is requested with the same save/hide options.
cumulative_fig_options = {"save_fig": True, "hide": True}
pred.cumulative_ret_fig(data = LR_step_1_log, name = "LR_cumulative_ret", **cumulative_fig_options)
pred.cumulative_ret_fig(data = lasso_step_1_log, name = "lasso_cumulative_ret", **cumulative_fig_options)
pred.cumulative_ret_fig(data = NN_step_1_log, name = "NN_cumulative_ret", **cumulative_fig_options)

In [698]:
# Figure: cumulative return of the 1st and 10th decile for the
# three models passed as data1..data3 (LR, Lasso, NN)
pred.deciles_10_1_fig(
    name = "deciles_10_1",
    save_fig = True,
    hide = True,
    data1 = LR_step_1_log,
    data2 = lasso_step_1_log,
    data3 = NN_step_1_log,
)

In [726]:
# Prepare data for export to R, where the tables are customized.
# Output paths are built with os.path.join instead of a hard-coded '\\'
# separator so the export also works on non-Windows systems.
output_dir = os.path.dirname(os.getcwd())

# Table 1: monthly loss and explained variation (multiply by 100 to get pct.)
monthly_pricing_error = loss.mean(axis = 1) * 100
monthly_xplained_variation = xplained_variation.mean(axis = 1) * 100
table1 = pd.concat([monthly_pricing_error, monthly_xplained_variation], axis = 1)
table1.columns = ["Squared Pricing Error", "Explained Variation"]
table1 = table1.transpose()
table1.columns = ["Linear Regression", "Lasso", "Neural Network"]
table1.to_csv(os.path.join(output_dir, 'table1_data.csv'), header = True, index = True)

# Table 2 (DISABLED): annual loss and explained variation (multiply by 100 to get pct.)
# Incorrect as is. Consult PredictionStep1.py, to_append function, before re-enabling.
# annual_pricing_error = loss_annual.mean(axis = 1) * 100
# annual_xplained_variation = xplained_variation_annual.mean(axis = 1) * 100
# table2 = pd.concat([annual_pricing_error, annual_xplained_variation], axis = 1)
# table2.columns = ["Squared Pricing Error", "Explained Variation"]
# table2 = table2.transpose()
# table2.columns = ["Linear Regression", "Lasso", "Neural Network"]
# table2.to_csv(os.path.join(output_dir, 'table2_data.csv'), header = True, index = True)


def _build_table3(mean, std, sr):
    """Combine mean (in pct.), std of `ret` (in pct.) and SR of `ret` into one table rounded to 2 decimals."""
    table = pd.concat([mean * 100, std.ret * 100, sr.ret], axis = 1).round(2)
    # Four labels are assigned, so `mean` is expected to contribute two columns;
    # presumably actual ("Avg") and predicted ("Pred") returns -- TODO confirm the order.
    table.columns = ["Avg", "Pred", "Std", "SR"]
    return table


# Table 3: (multiply by 100 to get pct.)
# NOTE(review): the table3_* frames are built here but not written to disk in
# this cell -- confirm whether an export is missing.
table3_LR = _build_table3(LR_mean, LR_std, LR_sr)
table3_lasso = _build_table3(lasso_mean, lasso_std, lasso_sr)
table3_NN = _build_table3(NN_mean, NN_std, NN_sr)

# Table 4: table for the appendix figures. Uses only the log_ret column of
# LR_cum, lasso_cum and NN_cum, i.e. the end points of the cumulative-return
# figures (this table goes in the appendix).
table4 = pd.concat([LR_cum.log_ret, lasso_cum.log_ret, NN_cum.log_ret], axis = 1).round(3)
table4.columns = ["Linear Regression", "Lasso", "Neural Network"]
table4 = table4.transpose()
table4.to_csv(os.path.join(output_dir, 'table4_data.csv'), header = True, index = True)

In [None]:
# TODO: Should a plain OLS be run on the non-PCA data as an overall benchmark? I.e. run it on FM_data + industry codes, with no interaction terms and no PCA.