## PCA + ANN Baseline
* Reference paper: https://jfin-swufe.springeropen.com/track/pdf/10.1186/s40854-019-0138-0.pdf

In [428]:
import pandas as pd
import torch.utils.data as Data
import torch
import torch.nn as nn
from sklearn.decomposition import PCA
from sklearn import metrics
pd.set_option('display.max_columns', 500)
DATA_PATH = '../data/Processed_S&P.csv'

In [429]:
base_df = pd.read_csv(DATA_PATH)

## Preprocessing

In [430]:
base_df['Date'] = pd.to_datetime(base_df['Date'], format='%Y-%m-%d')
base_df.sort_values(by='Date', inplace=True)
base_df.drop(columns=['Name', 'Date'], axis=1, inplace=True)

In [431]:
base_df.fillna(0, inplace=True)

In [432]:
# detect outliers
for col in base_df.columns:
    q1, q3 = base_df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lb, rb = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    for i in range(len(base_df)):
        if base_df[col][i] > rb:
            base_df[col][i] = rb
        if base_df[col][i] < lb:
            base_df[col][i] = lb

## Dataloader
* PCA with n_components = 82

In [433]:
class MarketDataset(Data.Dataset):
    TRAIN_RATIO = 0.7
    VALIDATION_RATIO = 0.15
    TEST_RATIO = 0.15

    def __init__(self, test=False, train_ds=None):
        global base_df
        if test:
            df = base_df.iloc[int((1 - MarketDataset.TEST_RATIO) * len(base_df)):]
            assert (train_ds is not None)
            df = (df - train_ds.mean) / train_ds.std
            self.X = train_ds.pca.transform(df)
        else:
            df = base_df.iloc[:int((1 - MarketDataset.TEST_RATIO) * len(base_df))]
            self.mean = df.mean()
            self.std = df.std()
            df = (df - self.mean) / self.std
            self.pca = PCA()
            self.pca.fit(df)
            self.X = self.pca.transform(df)
            self.train_len = int((MarketDataset.TRAIN_RATIO * len(df)) / (MarketDataset.TRAIN_RATIO + MarketDataset.VALIDATION_RATIO))
            self.validation_len = len(df) - self.train_len - 1
            self.validation_range = range(self.train_len + 1, self.train_len + self.validation_len)

        self.X = torch.tensor(self.X).float()
        self.df = df.reset_index(drop=True)

    def get_label(self, idx):
        return torch.tensor([float(self.df['Close'][idx + 1] > self.df['Close'][idx])])

    def __getitem__(self, idx):
        return self.X[idx], self.get_label(idx)

    def __len__(self):
        return len(self.X) - 1
    
    def reference_accuracy(self):
        cnt_up, cnt_down = 0, 0
        labels = []
        predicted = []
        for i in range(len(self)):
            label = int(self.get_label(i)[0])
            cnt_up += label == 1
            cnt_down += label == 0
            labels.append(label)
            predicted.append(cnt_up > cnt_down)
        return metrics.accuracy_score(labels, predicted)

train_ds = MarketDataset(test=False)

## Model

In [434]:
model = nn.Sequential(
    nn.Linear(82, 41),
    nn.Dropout(0.5),
    nn.ReLU(),
    nn.Linear(41, 10),
    nn.Dropout(0.2),
    nn.ReLU(),
    nn.Linear(10, 5),
    nn.ReLU(),
    nn.Linear(5, 1),
    nn.Sigmoid()
)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [435]:
epochs = 100
validation_interval = 3
validation_accuracies = []
previous_model_state = {}
early_stopping_threshold = 0.015
train_accuracies = []
train_losses = []
train_f_scores = []

for e in range(1, epochs + 1):
    running_loss = 0
    labels = []
    predicted = []
    for i in range(train_ds.train_len):
        X, Y = train_ds[i]
        optimizer.zero_grad()

        output = model(X)
        predicted.append(int(output[0] >= 0.5))
        labels.append(int(Y[0]))
        loss = criterion(output, Y)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_acc = metrics.accuracy_score(labels, predicted)
    f_score = metrics.f1_score(labels, predicted)
    running_loss /= len(train_ds)

    print(f'Epoch {e} ---')
    print(f'Training loss: {running_loss}')
    print(f'Accuracy: {train_acc}') 
    print(f'F-score: {f_score}')

    train_accuracies.append(train_acc)
    train_f_scores.append(f_score)
    train_losses.append(running_loss)

    if e % validation_interval == 0:
        model.eval()
        labels = []
        predicted = []
        validation_loss = 0
        for i in train_ds.validation_range:
            X, Y = train_ds[i]
            with torch.no_grad():
                predicted.append(int(model(X)[0] >= 0.5))
            labels.append(int(Y[0]))
        validation_acc = metrics.accuracy_score(labels, predicted)
        print(f'Validation accuracy: {validation_acc}')
        validation_accuracies.append(validation_acc)
        model.train()
        if len(validation_accuracies) > 1 and (validation_accuracies[-2] - validation_accuracies[-1] >= early_stopping_threshold): #to check
            model.load_state_dict(previous_model_state)
            break
        previous_model_state = model.state_dict()

Epoch 1 ---
Training loss: 0.5679429599014871
Accuracy: 0.5468299711815562
F-score: 0.7014712861888941
Epoch 2 ---
Training loss: 0.5678136614026938
Accuracy: 0.542507204610951
F-score: 0.7022972339428036
Epoch 3 ---
Training loss: 0.5668293122721709
Accuracy: 0.5511527377521613
F-score: 0.7098276665114113
Validation accuracy: 0.5067567567567568
Epoch 4 ---
Training loss: 0.5666305873797273
Accuracy: 0.5540345821325648
F-score: 0.7111525898273449
Epoch 5 ---
Training loss: 0.5654647447235153
Accuracy: 0.5540345821325648
F-score: 0.7119590507212658
Epoch 6 ---
Training loss: 0.5651674222521683
Accuracy: 0.5461095100864554
F-score: 0.705607476635514
Validation accuracy: 0.5067567567567568
Epoch 7 ---
Training loss: 0.5649329563984178
Accuracy: 0.5590778097982709
F-score: 0.7121354656632174
Epoch 8 ---
Training loss: 0.5626739416348828
Accuracy: 0.5655619596541787
F-score: 0.7129938124702522
Epoch 9 ---
Training loss: 0.5641679624775396
Accuracy: 0.55835734870317
F-score: 0.70543008169149

In [436]:
test_ds = MarketDataset(test=True, train_ds=train_ds)

In [437]:
model.eval()
test_labels = []
test_predicted = []
for i in range(len(test_ds)):
    X, Y = test_ds[i]
    with torch.no_grad():
        test_predicted.append(int(model(X)[0] >= 0.5))
    test_labels.append(int(Y[0]))
print(f'Test accuracy: {metrics.accuracy_score(test_labels, test_predicted)}')
print(f'F-score on test set: {metrics.f1_score(test_labels, test_predicted)}')
print(f'Reference accuracy: {test_ds.reference_accuracy()}')

Test accuracy: 0.5555555555555556
F-score on test set: 0.6764705882352942
Reference accuracy: 0.6161616161616161


In [438]:
torch.save(model.state_dict(), 'model-params.pt')