## PCA + ANN Baseline
* Reference paper: https://jfin-swufe.springeropen.com/track/pdf/10.1186/s40854-019-0138-0.pdf

In [66]:
import pandas as pd
import torch.utils.data as Data
import torch
import torch.nn as nn
from sklearn.decomposition import PCA
from sklearn import metrics
pd.set_option('display.max_columns', 500)
DATA_PATH = '../data/Processed_S&P.csv'

In [67]:
# Load the processed S&P table referenced by DATA_PATH into the working frame.
base_df = pd.read_csv(filepath_or_buffer=DATA_PATH)

## Preprocessing

In [68]:
# Parse the dates, order the rows chronologically (the dataset class later
# splits by row position), then drop the two non-feature columns.
base_df = (
    base_df
    .assign(Date=pd.to_datetime(base_df['Date'], format='%Y-%m-%d'))
    .sort_values(by='Date')
    .drop(columns=['Name', 'Date'])
)

In [69]:
# Replace every missing value with 0 so later statistics and PCA see no NaNs.
base_df = base_df.fillna(value=0)

In [70]:
# Clip outliers: cap every column to the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
#
# The clipping is done with a vectorised Series.clip that is assigned back to
# the column. Element-wise chained writes such as `base_df[col][i] = value`
# may be applied to a temporary copy (and are a no-op under pandas
# Copy-on-Write), besides being very slow for a per-cell Python loop.
#
# NOTE(review): the quantiles are computed over the whole frame, i.e. before
# the train/validation/test split that happens later in the notebook.
for col in base_df.columns:
    q1, q3 = base_df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lb, rb = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    base_df[col] = base_df[col].clip(lower=lb, upper=rb)

## Dataloader
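* PCA is fit on the training split with the default `n_components` (all components are kept); the model below expects 82 input features.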

In [71]:
class MarketDataset(Data.Dataset):
    """Positional train/validation/test split of ``base_df`` projected with PCA.

    Rows are split by position: the first ``TRAIN_RATIO`` of the rows are the
    training split, the last ``TEST_RATIO`` are the test split, and the rows in
    between are the validation split.

    The training split computes the column means / standard deviations and
    fits the PCA; the other two splits reuse those statistics and that fitted
    PCA from the ``train_ds`` instance they are given.

    Each item is ``(features, label)`` where ``label`` is a one-element float
    tensor: 1.0 when the next row's 'Close' is strictly greater than the
    current row's, otherwise 0.0.
    """

    TRAIN_RATIO = 0.7
    TEST_RATIO = 0.15
    # Split names accepted by the constructor.
    SPLITS = ('Train', 'Validation', 'Test')

    def __init__(self, type, train_ds=None):
        """Build one split.

        Args:
            type: One of 'Train', 'Validation' or 'Test'. (The name shadows the
                builtin but is kept because callers pass it by keyword.)
            train_ds: The already-built 'Train' dataset; required for the
                'Validation' and 'Test' splits, ignored for 'Train'.

        Raises:
            ValueError: If ``type`` is not a known split name.
            AssertionError: If a non-training split is requested without
                ``train_ds``.
        """
        # Fail early with a clear message instead of falling through every
        # branch and crashing later on a missing ``self.X``.
        if type not in MarketDataset.SPLITS:
            raise ValueError(
                f"Unknown dataset type {type!r}; expected one of {MarketDataset.SPLITS}"
            )

        n = len(base_df)
        train_end = int(MarketDataset.TRAIN_RATIO * n)
        test_start = int((1 - MarketDataset.TEST_RATIO) * n)

        if type == 'Train':
            df = base_df.iloc[:train_end]
            # Statistics and PCA are estimated on the training rows only.
            self.mean = df.mean()
            self.std = df.std()
            df = (df - self.mean) / self.std
            self.pca = PCA()
            self.pca.fit(df)
            self.X = self.pca.transform(df)
        else:
            assert train_ds is not None, \
                f"train_ds is required to build the {type!r} split"
            if type == 'Validation':
                df = base_df.iloc[train_end:test_start]
            else:  # 'Test'
                df = base_df.iloc[test_start:]
            # Reuse the training statistics and the PCA fitted on training data.
            df = (df - train_ds.mean) / train_ds.std
            self.X = train_ds.pca.transform(df)

        self.X = torch.tensor(self.X).float()
        # Keep the standardised frame, re-indexed from 0 so labels can be
        # looked up by item position.
        self.df = df.reset_index(drop=True)

    def get_label(self, idx):
        """Return tensor([1.0]) if 'Close' rises from row ``idx`` to ``idx + 1``, else tensor([0.0])."""
        close = self.df['Close']
        return torch.tensor([float(close[idx + 1] > close[idx])])

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair for row ``idx``."""
        return self.X[idx], self.get_label(idx)

    def __len__(self):
        """Number of usable items: the last row has no following row to label it."""
        return len(self.X) - 1

In [72]:
# The training split is built first; the validation and test splits (in that
# order) are then built from it via the train_ds argument.
train_ds = MarketDataset('Train')
validation_ds, test_ds = (
    MarketDataset(split_name, train_ds=train_ds)
    for split_name in ('Validation', 'Test')
)

## Model

In [73]:
# Feed-forward binary classifier: 82 inputs -> 41 -> 20 -> 10 -> 5 -> 1,
# with dropout on the first three hidden layers and a sigmoid output.
# Layers are listed in the exact order they are stacked so the positional
# names in the state dict ('0.weight', '3.weight', ...) are unchanged.
layers = [
    nn.Linear(82, 41), nn.Dropout(0.5), nn.ReLU(),
    nn.Linear(41, 20), nn.Dropout(0.2), nn.ReLU(),
    nn.Linear(20, 10), nn.Dropout(0.2), nn.ReLU(),
    nn.Linear(10, 5), nn.ReLU(),
    nn.Linear(5, 1), nn.Sigmoid(),
]
model = nn.Sequential(*layers)

# Binary cross-entropy on the sigmoid output, optimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [74]:
# Training configuration.
epochs = 100
validation_interval = 3          # validate every N epochs
validation_accuracies = []
previous_model_state = {}        # weights snapshot taken at the last validation
early_stopping_threshold = 0.015 # stop when validation accuracy drops by at least this much
train_accuracies = []
train_losses = []
train_f_scores = []

# Make sure dropout is active even when this cell is re-run after a cell that
# switched the model to eval mode.
model.train()

for e in range(1, epochs + 1):
    running_loss = 0
    labels = []
    predicted = []
    # One optimisation step per sample (batch size 1), in chronological order.
    for i in range(len(train_ds)):
        X, Y = train_ds[i]
        optimizer.zero_grad()

        output = model(X)
        predicted.append(int(output[0] >= 0.5))
        labels.append(int(Y[0]))
        loss = criterion(output, Y)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_acc = metrics.accuracy_score(labels, predicted)
    f_score = metrics.f1_score(labels, predicted)
    running_loss /= len(train_ds)

    print(f'Epoch {e} ---')
    print(f'Training loss: {running_loss}')
    print(f'Accuracy: {train_acc}')
    print(f'F-score: {f_score}')

    train_accuracies.append(train_acc)
    train_f_scores.append(f_score)
    train_losses.append(running_loss)

    if e % validation_interval == 0:
        model.eval()
        labels = []
        predicted = []
        # Never accumulated; retained only so cells that reference it keep working.
        validation_loss = 0
        for i in range(len(validation_ds)):
            X, Y = validation_ds[i]
            with torch.no_grad():
                predicted.append(int(model(X)[0] >= 0.5))
            labels.append(int(Y[0]))
        validation_acc = metrics.accuracy_score(labels, predicted)
        print(f'Validation accuracy: {validation_acc}')
        validation_accuracies.append(validation_acc)
        model.train()
        # Early stopping: if accuracy fell by at least the threshold since the
        # previous validation, roll back to the weights saved at that point.
        if len(validation_accuracies) > 1 and (validation_accuracies[-2] - validation_accuracies[-1] >= early_stopping_threshold):
            model.load_state_dict(previous_model_state)
            break
        # state_dict() returns references to the live parameter tensors, so the
        # snapshot must clone them; otherwise later optimizer steps would
        # silently overwrite the "previous" state and the rollback above would
        # restore nothing.
        previous_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

Epoch 1 ---
Training loss: 0.6887976384541072
Accuracy: 0.55155010814708
F-score: 0.7109665427509294
Epoch 2 ---
Training loss: 0.6883623382988471
Accuracy: 0.5508291276135544
F-score: 0.710367271036727
Epoch 3 ---
Training loss: 0.6881081785499061
Accuracy: 0.55155010814708
F-score: 0.7109665427509294
Validation accuracy: 0.5084175084175084
Epoch 4 ---
Training loss: 0.6876298680353612
Accuracy: 0.55155010814708
F-score: 0.7109665427509294
Epoch 5 ---
Training loss: 0.6865515817217287
Accuracy: 0.55155010814708
F-score: 0.7109665427509294
Epoch 6 ---
Training loss: 0.6872744930246019
Accuracy: 0.55155010814708
F-score: 0.7109665427509294
Validation accuracy: 0.5084175084175084
Epoch 7 ---
Training loss: 0.685885303189794
Accuracy: 0.55155010814708
F-score: 0.7109665427509294
Epoch 8 ---
Training loss: 0.6856187265728907
Accuracy: 0.55155010814708
F-score: 0.7109665427509294
Epoch 9 ---
Training loss: 0.6862942188569944
Accuracy: 0.55155010814708
F-score: 0.7109665427509294
Validation 

In [75]:
# Evaluate on the held-out test split with dropout disabled and no gradients.
model.eval()
test_labels, test_predicted = [], []
with torch.no_grad():
    for i in range(len(test_ds)):
        X, Y = test_ds[i]
        # Threshold the sigmoid output at 0.5 to get the predicted class.
        test_predicted.append(int(model(X)[0] >= 0.5))
        test_labels.append(int(Y[0]))

test_accuracy = metrics.accuracy_score(test_labels, test_predicted)
test_f_score = metrics.f1_score(test_labels, test_predicted)
print(f'Test accuracy: {test_accuracy}')
print(f'F-score on test set: {test_f_score}')

Test accuracy: 0.5589225589225589
F-score on test set: 0.7120879120879122


In [76]:
# Persist only the model's state dict (its parameters), not the module object.
params_path = 'model-params.pt'
torch.save(model.state_dict(), params_path)