In [37]:
import glob
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

In [2]:
files = glob.glob("./csv/*.csv")
df = pd.concat((pd.read_csv(f) for f in files))
df["date"] = pd.to_datetime(df["date"])
df = df.set_index(["date", "ticker"]).sort_index()

In [3]:
def trim_99(column):
    """
    Trims data to between 1st and 99th percentile of original.
    Sets values lower than 1 percentile and greater than 99 percentile
    to nan.
    """
    upper = np.percentile(column, 99)
    lower = np.percentile(column, 1)
    return column.mask((column > upper) | (column < lower), other=np.nan)

In [4]:
# Filters
df = df.replace([np.inf, -np.inf], np.nan)
df = df.apply(trim_99, axis=0)
df = df.dropna()

In [5]:
num_dates = len(df.index.levels[0])
break_point = int(0.75 * num_dates)
train_index = df.index.levels[0][:break_point]
test_index = df.index.levels[0][break_point:]

train_data = df.loc[train_index]
test_data = df.loc[test_index]

In [6]:
def train_test_iter(df, n_splits):
    """
    Yield train and test indexes for cross validation.
    Expects multiindex dataframe with dates as first level of index.
    Intended for panel data and creates training data with expanding window.
    """
    # Extract dates
    dates = df.index.levels[0]
    # From number of splits, determine number of samples in each dataset
    window_size = int(len(dates) / n_splits)
    for i in range(1, n_splits):
        # Find where training window ends, multiply by i to create expanding window
        break_point = window_size * i
        train_index = dates[:break_point]
        # If we're on the last iteration, return all remaining data as test set
        # to account for remainders in window size across n_splits
        # otherwise return test data of size 'window_size'
        if i == (n_splits - 1):
            test_index = dates[break_point:]
        else:
            test_index = dates[break_point:(break_point + window_size)]
        yield train_index, test_index

In [7]:
x = train_data.drop(columns=["monthly_ret", "forward_ret"])
y = train_data["forward_ret"]

x_test = test_data.drop(columns=["monthly_ret", "forward_ret"])
y_test = test_data["forward_ret"]

In [8]:
import torch
import torch.nn as nn

In [10]:
class MLP(nn.Module):
    def __init__(self, in_dim, out_dim, hidden_dim):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ELU(),
            nn.Dropout(),
            nn.Linear(hidden_dim, out_dim)
        )
    
    def forward(self, x):
        return self.network(x)

In [29]:
torch_x = torch.Tensor(x.values)
torch_y = torch.Tensor(y.values).reshape(-1, 1)

in_dim = torch_x.shape[1]

In [41]:
model = MLP(in_dim, 1, 8)
optim = torch.optim.AdamW(model.parameters(), lr=0.001)
loss = nn.MSELoss()

train_losses = []
validation_losses = []
model.train()
for epoch in range(500):
    optim.zero_grad()
    output = model(torch_x)
    
    train_loss = loss(output, torch_y)
    print(train_loss)
    train_losses.append(train_loss.detach())
    
    train_loss.backward()
    optim.step()

tensor(8107.9336, grad_fn=<MseLossBackward>)
tensor(7030.9507, grad_fn=<MseLossBackward>)
tensor(16076.0059, grad_fn=<MseLossBackward>)
tensor(20786.2441, grad_fn=<MseLossBackward>)
tensor(10461.3740, grad_fn=<MseLossBackward>)
tensor(11928.0723, grad_fn=<MseLossBackward>)
tensor(18499.4082, grad_fn=<MseLossBackward>)
tensor(12235.5947, grad_fn=<MseLossBackward>)
tensor(13116.7002, grad_fn=<MseLossBackward>)
tensor(9900.9932, grad_fn=<MseLossBackward>)
tensor(26076.9824, grad_fn=<MseLossBackward>)
tensor(16006.7686, grad_fn=<MseLossBackward>)
tensor(3585.1030, grad_fn=<MseLossBackward>)
tensor(7987.2202, grad_fn=<MseLossBackward>)
tensor(3069.7007, grad_fn=<MseLossBackward>)
tensor(8170.0957, grad_fn=<MseLossBackward>)
tensor(14759.1934, grad_fn=<MseLossBackward>)
tensor(8616.8252, grad_fn=<MseLossBackward>)
tensor(19823.6660, grad_fn=<MseLossBackward>)
tensor(3370.7878, grad_fn=<MseLossBackward>)
tensor(2842.7896, grad_fn=<MseLossBackward>)
tensor(17061.6992, grad_fn=<MseLossBackward>

tensor(843.7716, grad_fn=<MseLossBackward>)
tensor(5.0834, grad_fn=<MseLossBackward>)
tensor(924.3154, grad_fn=<MseLossBackward>)
tensor(486.9303, grad_fn=<MseLossBackward>)
tensor(105.8371, grad_fn=<MseLossBackward>)
tensor(475.9043, grad_fn=<MseLossBackward>)
tensor(874.7952, grad_fn=<MseLossBackward>)
tensor(12.5432, grad_fn=<MseLossBackward>)
tensor(652.9130, grad_fn=<MseLossBackward>)
tensor(525.4619, grad_fn=<MseLossBackward>)
tensor(264.1717, grad_fn=<MseLossBackward>)
tensor(23.7083, grad_fn=<MseLossBackward>)
tensor(66.9349, grad_fn=<MseLossBackward>)
tensor(105.9178, grad_fn=<MseLossBackward>)
tensor(55.9922, grad_fn=<MseLossBackward>)
tensor(394.4296, grad_fn=<MseLossBackward>)
tensor(409.2939, grad_fn=<MseLossBackward>)
tensor(13.3341, grad_fn=<MseLossBackward>)
tensor(106.4465, grad_fn=<MseLossBackward>)
tensor(413.3377, grad_fn=<MseLossBackward>)
tensor(247.5684, grad_fn=<MseLossBackward>)
tensor(609.1597, grad_fn=<MseLossBackward>)
tensor(550.2241, grad_fn=<MseLossBackwa

tensor(9.4082, grad_fn=<MseLossBackward>)
tensor(8.8189, grad_fn=<MseLossBackward>)
tensor(7.2468, grad_fn=<MseLossBackward>)
tensor(32.5001, grad_fn=<MseLossBackward>)
tensor(7.0834, grad_fn=<MseLossBackward>)
tensor(18.9287, grad_fn=<MseLossBackward>)
tensor(17.2562, grad_fn=<MseLossBackward>)
tensor(44.0542, grad_fn=<MseLossBackward>)
tensor(43.4187, grad_fn=<MseLossBackward>)
tensor(6.6265, grad_fn=<MseLossBackward>)
tensor(38.2261, grad_fn=<MseLossBackward>)
tensor(13.4132, grad_fn=<MseLossBackward>)
tensor(4.0274, grad_fn=<MseLossBackward>)
tensor(17.8399, grad_fn=<MseLossBackward>)
tensor(45.1118, grad_fn=<MseLossBackward>)
tensor(7.3565, grad_fn=<MseLossBackward>)
tensor(2.0567, grad_fn=<MseLossBackward>)
tensor(26.8823, grad_fn=<MseLossBackward>)
tensor(8.0639, grad_fn=<MseLossBackward>)
tensor(27.0844, grad_fn=<MseLossBackward>)
tensor(24.0540, grad_fn=<MseLossBackward>)
tensor(17.2152, grad_fn=<MseLossBackward>)
tensor(44.0373, grad_fn=<MseLossBackward>)
tensor(5.7744, grad_

tensor(0.6371, grad_fn=<MseLossBackward>)
tensor(5.6191, grad_fn=<MseLossBackward>)
tensor(2.6840, grad_fn=<MseLossBackward>)
tensor(2.4459, grad_fn=<MseLossBackward>)
tensor(1.1758, grad_fn=<MseLossBackward>)
tensor(2.4187, grad_fn=<MseLossBackward>)
tensor(2.9304, grad_fn=<MseLossBackward>)
tensor(4.6966, grad_fn=<MseLossBackward>)
tensor(0.9147, grad_fn=<MseLossBackward>)
tensor(5.4922, grad_fn=<MseLossBackward>)
tensor(2.1336, grad_fn=<MseLossBackward>)
tensor(1.0608, grad_fn=<MseLossBackward>)
tensor(4.6026, grad_fn=<MseLossBackward>)
tensor(0.8658, grad_fn=<MseLossBackward>)
tensor(4.6816, grad_fn=<MseLossBackward>)
tensor(0.8158, grad_fn=<MseLossBackward>)
tensor(2.1329, grad_fn=<MseLossBackward>)
tensor(1.0350, grad_fn=<MseLossBackward>)
tensor(2.4261, grad_fn=<MseLossBackward>)
tensor(2.3417, grad_fn=<MseLossBackward>)
tensor(6.6944, grad_fn=<MseLossBackward>)
tensor(2.1012, grad_fn=<MseLossBackward>)
tensor(5.2931, grad_fn=<MseLossBackward>)
tensor(1.8198, grad_fn=<MseLossBac

tensor(2.0079, grad_fn=<MseLossBackward>)
tensor(0.4376, grad_fn=<MseLossBackward>)
tensor(0.4107, grad_fn=<MseLossBackward>)
tensor(2.3810, grad_fn=<MseLossBackward>)
tensor(1.2981, grad_fn=<MseLossBackward>)
tensor(0.5393, grad_fn=<MseLossBackward>)
tensor(0.5154, grad_fn=<MseLossBackward>)
tensor(2.5830, grad_fn=<MseLossBackward>)
tensor(1.1969, grad_fn=<MseLossBackward>)
tensor(0.9420, grad_fn=<MseLossBackward>)
tensor(0.3132, grad_fn=<MseLossBackward>)
tensor(1.1835, grad_fn=<MseLossBackward>)
tensor(0.5260, grad_fn=<MseLossBackward>)
tensor(0.4042, grad_fn=<MseLossBackward>)
tensor(0.3261, grad_fn=<MseLossBackward>)
tensor(1.5075, grad_fn=<MseLossBackward>)
tensor(0.5267, grad_fn=<MseLossBackward>)
tensor(0.8650, grad_fn=<MseLossBackward>)
tensor(0.6873, grad_fn=<MseLossBackward>)
tensor(0.6437, grad_fn=<MseLossBackward>)
tensor(1.1859, grad_fn=<MseLossBackward>)
tensor(2.8784, grad_fn=<MseLossBackward>)
tensor(0.3403, grad_fn=<MseLossBackward>)
tensor(0.6587, grad_fn=<MseLossBac

tensor(0.3410, grad_fn=<MseLossBackward>)
tensor(0.6083, grad_fn=<MseLossBackward>)
tensor(1.1246, grad_fn=<MseLossBackward>)
tensor(0.2116, grad_fn=<MseLossBackward>)
tensor(0.8537, grad_fn=<MseLossBackward>)
tensor(0.3393, grad_fn=<MseLossBackward>)
tensor(0.1856, grad_fn=<MseLossBackward>)
tensor(0.3821, grad_fn=<MseLossBackward>)
tensor(0.3901, grad_fn=<MseLossBackward>)
tensor(0.7566, grad_fn=<MseLossBackward>)
tensor(0.1640, grad_fn=<MseLossBackward>)
tensor(0.2060, grad_fn=<MseLossBackward>)
tensor(0.1651, grad_fn=<MseLossBackward>)
tensor(0.0956, grad_fn=<MseLossBackward>)
tensor(1.5325, grad_fn=<MseLossBackward>)
tensor(0.2437, grad_fn=<MseLossBackward>)
tensor(1.2120, grad_fn=<MseLossBackward>)
tensor(0.8369, grad_fn=<MseLossBackward>)
tensor(0.5098, grad_fn=<MseLossBackward>)
tensor(0.3089, grad_fn=<MseLossBackward>)
tensor(0.9492, grad_fn=<MseLossBackward>)
tensor(0.7469, grad_fn=<MseLossBackward>)
tensor(0.8485, grad_fn=<MseLossBackward>)
tensor(0.2207, grad_fn=<MseLossBac

In [47]:
model(torch_x).detach().numpy()

array([[ 0.07550856],
       [-0.05987364],
       [ 0.19688344],
       ...,
       [-0.07334246],
       [-0.07782627],
       [-0.21096814]], dtype=float32)

In [46]:
from sklearn.metrics import r2_score