In [2]:
# Import the usual suspects
import csv
import pandas as pd
import numpy as np
import os
import Toolbox as tb
import Lossfunction as lf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression as lr
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
import tensorflow as tf

%load_ext autoreload
%autoreload 2

np.set_printoptions(suppress=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Empty container that will collect warning messages
log = []

In [3]:
# Import data
# The CSV files live in a "Data" folder next to the current working directory's parent.
# os.path.join picks the separator of the host OS; the previous hard-coded '\\'
# separators only resolved on Windows.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'Data')
panel_index = ["permno", "date"]

training = pd.read_csv(os.path.join(data_dir, 'data_t_01.csv'), index_col = panel_index)
validation = pd.read_csv(os.path.join(data_dir, 'data_v_01.csv'), index_col = panel_index)
test = pd.read_csv(os.path.join(data_dir, 'data_tt_01.csv'), index_col = panel_index)

# Downcast
# NOTE(review): the return value is discarded, so tb.downcast presumably shrinks the
# dtypes in place - confirm in Toolbox.
tb.downcast(training)
tb.downcast(validation)
tb.downcast(test)

Before downcast: 0.502 GB and float64    72
int64      68
dtype: int64
After downcast: 0.161 GB and float32    72
int8       68
dtype: int64
Before downcast: 0.185 GB and float64    72
int64      68
dtype: int64
After downcast: 0.059 GB and float32    72
int8       68
dtype: int64
Before downcast: 2.625 GB and float64    72
int64      68
dtype: int64
After downcast: 0.841 GB and float32    72
int8       68
dtype: int64


In [4]:
# Sanity check: every split must expose the same columns in the same order.
# Index.equals returns a plain bool even when the column counts differ, whereas an
# element-wise == between indexes of different length raises ValueError before the
# assert can report anything useful.
for split_name, split_frame in (("validation", validation), ("test", test)):
    assert training.columns.equals(split_frame.columns), (
        f"{split_name} columns differ from the training columns"
    )
# validation vs. test needs no separate check: both were just matched against training.

In [5]:
# Split into Y and X
# The positional slice below treats the last column as the target. Verify that "ret"
# really is the last column of every split; otherwise the target would stay inside X
# and some other column would be dropped instead.
for split_frame in (training, validation, test):
    assert split_frame.columns[-1] == "ret", "expected 'ret' to be the last column"

training_x = training.iloc[:, :-1]
training_y = training["ret"]

validation_x = validation.iloc[:, :-1]
validation_y = validation["ret"]

test_x = test.iloc[:, :-1]
test_y = test["ret"]

In [139]:
# Algorithm 1: Simple Linear (PCR in Gu, Kelly, and Xiu (2020) due to PCA)

# Estimate the regression on the training sample
linear_reg = lr()
linear_reg.fit(training_x, training_y)

# Predictions for the test sample
linear_reg_pred = linear_reg.predict(test_x)

# Test-sample evaluation: loss first, then explained variation
loss_linear_reg, xplained_var_linear_reg = (
    lf.loss_function(linear_reg_pred, test_y),
    lf.explained_variation(linear_reg_pred, test_y),
)

# Portfolio sorts

# Stock characteristics

In [140]:
# Display the test-set loss of the linear regression
loss_linear_reg

44.87910449429597

In [141]:
# Display the test-set explained variation of the linear regression
xplained_var_linear_reg

array([-0.01487567])

In [137]:
# Algorithm 2: Lasso

# Candidate penalties; the grid itself is built by Lossfunction.lambda_grid
lambda_grid = lf.lambda_grid(training_x, training_y)
loss_validation = []
lasso_fits = []

# Fit one model per penalty on the training set and score it on the validation set.
# Every fitted model is kept so the winner can be reused instead of being refit.
for lamb in lambda_grid:
    lasso = Lasso(alpha = lamb).fit(training_x, training_y)
    lasso_pred = lasso.predict(validation_x)
    loss_validation.append(lf.loss_function(lasso_pred, validation_y))
    lasso_fits.append(lasso)

# Select the penalty with the smallest validation loss (the first one on ties).
# The model fitted in the loop for that penalty was trained on exactly the same data
# with the same alpha, so it replaces the former second fit.
best_idx = loss_validation.index(min(loss_validation))
lambda_min = lambda_grid[best_idx]
lasso_min = lasso_fits[best_idx]
lasso_min_pred = lasso_min.predict(test_x)

# Loss function
loss_lasso = lf.loss_function(lasso_min_pred, test_y)
loss = loss_lasso  # the original generic name, kept for any cell that still reads `loss`

# Explained variation
xplained_var_lasso = lf.explained_variation(lasso_min_pred, test_y)

# Portfolio sorts

# Stock characteristics

In [15]:
import tensorflow as tf
from tensorflow.keras.layers import Dense

In [None]:
# Gu, Kelly Xiu (2020) NN:
'''

* Gu, Kelly and Xiu do not really tune hyperparameters. That is too computationally intensive, so they instead
select a few NN architectures ex ante and use them as a reasonable estimate of the lower bound on NN performance. 

* They use the validation set for their early-stopping algorithm instead: after each epoch, predictions are made on the validation set,
and if the validation errors start to increase the algorithm is stopped - the idea being that the model has started to overfit. model.fit() has 
an argument called validation_data, where the validation set can be specified. Video where they do this: 
https://www.youtube.com/watch?v=qFJeN9V1ZsI&t=34s&ab_channel=freeCodeCamp.org

* I cannot immediately see how many epochs they use, nor what their batch size is. Nor what the initial learning rate
is or what the regularization parameter is. 


To-Do:
* Should I also learn to build NNs the advanced way? 
* The objective function should be regularized
* ReLU only yields positive Y's; they must be able to be negative as well. I think the solution could be batch normalization.
* The remaining items are less important, but they would be learning rate shrinkage, early stopping (to make use of the validation set) 
and ensembling 
* Indirectly fix the data partitioning so that I can make predictions for each split 
* Should I build the same NN architectures as they do? Or should I settle for just one? 

'''

In [6]:
# Algorithm 3: Neural Network

# Build NN architecture: four ReLU hidden layers that halve in width, one linear output
model = tf.keras.models.Sequential()
model.add(Dense(units = 32, activation = 'relu', input_dim = len(training_x.columns))) # 1. Hidden layer
model.add(Dense(units = 16, activation = 'relu')) # 2. Hidden layer
model.add(Dense(units = 8, activation = 'relu')) # 3. Hidden layer
model.add(Dense(units = 4, activation = 'relu')) # 4. Hidden layer
model.add(Dense(units = 1, activation = 'linear')) # Output layer - linear activation allows negative predictions

# Compile model
model.compile(optimizer = 'adam', loss = 'mean_squared_error') # Huber loss ('huber') is an alternative

# Fit model
model.fit(training_x, training_y, epochs = 1, batch_size = 40, verbose = 1)

# Compute predictions
# With a single output unit Keras returns shape (n_samples, 1). Flatten to 1-D so the
# metrics receive the same shape as the sklearn predictions of the other algorithms
# and nothing can broadcast against the 1-D target.
y_hat = model.predict(test_x).ravel()

# Compute loss (stored now; it used to be computed and thrown away)
loss_nn = lf.loss_function(y_hat, test_y)

# Variation explained
xplained_var_nn = lf.explained_variation(y_hat, test_y)

# Show both metrics as the cell output
loss_nn, xplained_var_nn

In [None]:
# Algo 4 + 5: Gradient Boosting + Random Forest?

In [23]:
# To code:
# Should I build a pipeline for all my ML models, like the guy who coded for that competition did? 
# Need to create a folder for my scripts
# Need to link the predictions to stocks so I can see which ones are high and low
    # I could run into a problem here when it comes to macroeconomic predictors. But it is probably the firm
    # characteristics that are more interesting to know about anyway 
# Need to code portfolio sorts -- here I should perhaps look at the assignment in AME2
    # 1) Rank into deciles by predicted return. 2) Form portfolios each period based on the prediction. 
    # 3) See what the return is for each portfolio each period and take the average over the period 
# Code something that collects the results and puts them in a LaTeX table, e.g. explained variation and MSE
    
# Algorithms: OLS, Lasso / EN, NN, RF (classification?), GBRT 