## PCA + ANN Baseline
* Reference paper: https://jfin-swufe.springeropen.com/track/pdf/10.1186/s40854-019-0138-0.pdf

In [99]:
import pandas as pd
import pandas as pd
import torch.utils.data as Data
import torch
import torch.nn as nn
from sklearn.decomposition import PCA
from sklearn import metrics
pd.set_option('display.max_columns', 500)

In [100]:
base_df = pd.read_csv('data/Processed_S&P.csv')

## Preprocessing

In [101]:
base_df['Date'] = pd.to_datetime(base_df['Date'], format='%Y-%m-%d')
base_df.sort_values(by='Date', inplace=True)
base_df.drop(columns=['Name', 'Date'], axis=1, inplace=True)

In [102]:
base_df.fillna(0, inplace=True)

In [103]:
# detect outliers
for col in base_df.columns:
    q1, q3 = base_df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lb, rb = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    for i in range(len(base_df)):
        if base_df[col][i] > rb:
            base_df[col][i] = rb
        if base_df[col][i] < lb:
            base_df[col][i] = lb

## Dataloader
* PCA with n_components = 82

In [104]:
class MarketDataset(Data.Dataset):
    DATA_PATH = 'data/Processed_S&P.csv'
    TRAIN_RATIO = 0.7
    VALIDATION_RATIO = 0.15
    TEST_RATIO = 0.15

    def __init__(self, test=False, train_ds=None):
        global base_df
        if test:
            df = base_df.iloc[int((1 - MarketDataset.TEST_RATIO) * len(base_df)):]
            assert (train_ds is not None)
            df = (df - train_ds.mean) / train_ds.std
            self.X = train_ds.pca.transform(df)
        else:
            df = base_df.iloc[:int((1 - MarketDataset.TEST_RATIO) * len(base_df))]
            self.mean = df.mean()
            self.std = df.std()
            df = (df - self.mean) / self.std
            self.pca = PCA()
            self.pca.fit(df)
            self.X = self.pca.transform(df)
            self.train_len = int((MarketDataset.TRAIN_RATIO * len(df)) / (MarketDataset.TRAIN_RATIO + MarketDataset.VALIDATION_RATIO))
            self.validation_len = len(df) - self.train_len - 1
            self.validation_range = range(self.train_len + 1, self.train_len + self.validation_len)

        self.X = torch.tensor(self.X).float()
        self.df = df.reset_index(drop=True)

    def get_label(self, idx):
        return torch.tensor([float(self.df['Close'][idx + 1] > self.df['Close'][idx])])

    def __getitem__(self, idx):
        return self.X[idx], self.get_label(idx)

    def __len__(self):
        return len(self.X) - 1
    

train_ds = MarketDataset(test=False)

In [105]:
model = nn.Sequential(
    nn.Linear(82, 41),
    nn.Dropout(0.3),
    nn.ReLU(),
    nn.Linear(41, 20),
    nn.Dropout(0.2),
    nn.ReLU(),
    nn.Linear(20, 10),
    nn.Dropout(0.2),
    nn.ReLU(),
    nn.Linear(10, 5),
    nn.ReLU(),
    nn.Linear(5, 1),
    nn.Sigmoid()
)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [106]:
epochs = 100
validation_interval = 4
validation_accuracies = []
train_accuracies = []
train_losses = []
train_f_scores = []

for e in range(1, epochs + 1):
    running_loss = 0
    labels = []
    predicted = []
    for i in range(train_ds.train_len):
        X, Y = train_ds[i]
        optimizer.zero_grad()

        output = model(X)
        labels.append(int(output[0] >= 0.5))
        predicted.append(int(Y[0]))
        loss = criterion(output, Y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    train_acc = metrics.accuracy_score(labels, predicted)
    f_score = metrics.f1_score(labels, predicted)
    running_loss /= len(train_ds)

    print(f'Epoch {e} ---')
    print(f'Training loss: {running_loss}')
    print(f'Accuracy: {train_acc}') 
    print(f'F-score: {f_score}')

    train_accuracies.append(train_acc)
    train_f_scores.append(f_score)
    train_losses.append(running_loss)

    if e % validation_interval == 0:
        model.eval()
        labels = []
        predicted = []
        for i in train_ds.validation_range:
            X, Y = train_ds[i]
            with torch.no_grad():
                labels.append(int(model(X)[0] >= 0.5))
            predicted.append(int(Y[0]))
        validation_acc = metrics.accuracy_score(labels, predicted)
        print(f'Validation accuracy: {validation_acc}')
        validation_accuracies.append(validation_acc)
        model.train()
        if len(validation_accuracies) > 1 and validation_accuracies[-1] < validation_accuracies[-2]:
            break

Epoch 1 ---
Training loss: 0.5723672087185461
Accuracy: 0.5180115273775217
F-score: 0.5962582981291491
Epoch 2 ---
Training loss: 0.5662601844847025
Accuracy: 0.5324207492795389
F-score: 0.6673500768836494
Epoch 3 ---
Training loss: 0.5607976587572154
Accuracy: 0.5612391930835735
F-score: 0.6518010291595198
Epoch 4 ---
Training loss: 0.5596082985401154
Accuracy: 0.5857348703170029
F-score: 0.6859639541234298
Validation accuracy: 0.5304054054054054
Epoch 5 ---
Training loss: 0.5450985162831202
Accuracy: 0.6001440922190202
F-score: 0.6790052053209948
Epoch 6 ---
Training loss: 0.5392789073310549
Accuracy: 0.6260806916426513
F-score: 0.6916221033868094
Epoch 7 ---
Training loss: 0.5282102182971794
Accuracy: 0.6311239193083573
F-score: 0.6893203883495146
Epoch 8 ---
Training loss: 0.5059435451077159
Accuracy: 0.6693083573487032
F-score: 0.7209726443768998
Validation accuracy: 0.5405405405405406
Epoch 9 ---
Training loss: 0.506416118106299
Accuracy: 0.6621037463976945
F-score: 0.71314984709

In [107]:
test_ds = MarketDataset(test=True, train_ds=train_ds)

In [110]:
model.eval()
test_labels = []
test_predicted = []
for i in range(len(test_ds)):
    X, Y = test_ds[i]
    with torch.no_grad():
        test_labels.append(int(model(X)[0] >= 0.5))
    test_predicted.append(int(Y[0]))
print(f'Test accuracy: {metrics.accuracy_score(test_labels, test_predicted)}')
print(f'F-score on test set: {metrics.f1_score(test_labels, test_predicted)}')

Test accuracy: 0.5723905723905723
F-score on test set: 0.6894865525672371


In [112]:
torch.save(model.state_dict(), 'model-params.pt')