## PCA + ANN Baseline
* Reference paper: https://jfin-swufe.springeropen.com/track/pdf/10.1186/s40854-019-0138-0.pdf

In [29]:
import pandas as pd
import pandas as pd
import torch.utils.data as Data
import torch
import torch.nn as nn
from sklearn.decomposition import PCA
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

In [30]:
# Load the processed S&P table; every later cell works on this `base_df` frame.
base_df = pd.read_csv(filepath_or_buffer='data/Processed_S&P.csv')

## Preprocessing

In [31]:
# Parse the date strings so the rows can be ordered chronologically.
base_df['Date'] = pd.to_datetime(base_df['Date'], format='%Y-%m-%d')
base_df.sort_values(by='Date', inplace=True)
# Renumber the rows 0..n-1 after sorting: later cells look rows up by index
# label (row i, row idx + 1), so the labels must follow the sorted order
# instead of the original file order.
base_df.reset_index(drop=True, inplace=True)
# 'Name' and 'Date' are removed so that every remaining column can go through
# the per-column outlier clipping and the PCA below.
base_df.drop(columns=['Name', 'Date'], inplace=True)

In [32]:
# Replace every missing value with 0, modifying `base_df` in place.
base_df.fillna(value=0, inplace=True)

In [33]:
# Clip outliers column by column with 1.5 * IQR fences: values above
# q3 + 1.5 * iqr are set to that upper fence, values below q1 - 1.5 * iqr are
# set to the lower fence.
#
# Series.clip replaces the former element-wise `base_df[col][i] = ...` loop,
# which relied on chained assignment (it triggers SettingWithCopyWarning and
# does not write through when pandas copy-on-write is active) and addressed
# rows by index label rather than by position.
#
# NOTE(review): the fences are computed over every row of base_df, including
# the rows MarketDataset later holds out as the test split.
for col in base_df.columns:
    q1, q3 = base_df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lb, rb = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    base_df[col] = base_df[col].clip(lower=lb, upper=rb)

## Dataloader
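* PCA is fitted with the default `PCA()` (no `n_components` argument), so all components are kept; for this dataset that yields 82 features, matching the input width of the network below.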

In [34]:
class MarketDataset(Data.Dataset):
    """Next-row direction dataset built from PCA-projected rows of ``base_df``.

    The module-level ``base_df`` is split by row position: the first
    ``1 - TEST_RATIO`` of the rows form the training split, the remaining rows
    the test split. Sample ``idx`` pairs the PCA features of row ``idx`` with a
    one-element label that is 1.0 when ``Close`` of row ``idx + 1`` is greater
    than ``Close`` of row ``idx`` and 0.0 otherwise. The last row of a split
    therefore has no label and is excluded from ``len()``.

    Both splits are standardised with the mean and standard deviation of the
    training rows before being projected, so the test split is transformed on
    the same scale the PCA was fitted on.

    Args:
        test: build the test split instead of the training split.
        pca: PCA fitted by the training dataset (its ``pca`` attribute).
            Required when ``test`` is true; ignored otherwise.

    Attributes:
        X: float tensor of PCA-transformed features, one row per source row.
        df: the standardised frame of this split; ``get_label`` reads its
            ``Close`` column.
        mean, std: per-column statistics of the training rows.
        pca: the PCA object used for the projection.
    """
    # Not read anywhere in this class; the data comes from the global base_df.
    DATA_PATH = 'data/Processed_S&P.csv'
    # TRAIN_RATIO and VALIDATION_RATIO are not read in this class: everything
    # before the test cut-off is used as the training split.
    TRAIN_RATIO = 0.7
    VALIDATION_RATIO = 0.15
    TEST_RATIO = 0.15

    def __init__(self, test=False, pca=None):
        split = int((1 - MarketDataset.TEST_RATIO) * len(base_df))
        train_df = base_df.iloc[:split]
        # Statistics always come from the training rows, so the test split is
        # scaled exactly like the data the PCA was fitted on and no test-row
        # information enters the scaling.
        self.mean = train_df.mean()
        self.std = train_df.std()

        if test:
            assert pca is not None, 'test=True requires the PCA fitted on the training split'
            df = (base_df.iloc[split:] - self.mean) / self.std
            self.pca = pca
        else:
            df = (train_df - self.mean) / self.std
            self.pca = PCA()
            self.pca.fit(df)

        self.X = torch.tensor(self.pca.transform(df)).float()
        self.df = df

    def get_label(self, idx):
        """Return a 1-element float tensor: 1.0 if Close rises from row idx to idx + 1, else 0.0."""
        close = self.df['Close']
        # Positional lookup: the test split keeps the index labels of base_df,
        # which do not start at 0, so label-based `close[idx]` would fail there.
        return torch.tensor([float(close.iloc[idx + 1] > close.iloc[idx])])

    def __getitem__(self, idx):
        """Return the (features, label) pair for row ``idx``."""
        return self.X[idx], self.get_label(idx)

    def __len__(self):
        """Number of labelled samples: one fewer than the rows, never negative."""
        # The last row has no following row to compare against.
        return max(len(self.X) - 1, 0)

# Training split (the default, test=False); constructing it also fits the PCA
# that is stored on the dataset as `train_ds.pca`.
train_ds = MarketDataset()

In [35]:
# Fix the RNG so the weight initialisation repeats on every
# Restart & Run All; the samples themselves are visited in a fixed order.
torch.manual_seed(0)

# The input width follows the PCA output instead of being hard-coded, so the
# network still fits if the number of components changes.
n_features = train_ds.X.shape[1]

# Fully connected classifier that narrows layer by layer down to a single
# sigmoid output, read as the probability that Close rises on the next row.
model = nn.Sequential(
    nn.Linear(n_features, 82),
    nn.ReLU(),
    nn.Linear(82, 41),
    nn.ReLU(),
    nn.Linear(41, 20),
    nn.ReLU(),
    nn.Linear(20, 10),
    nn.ReLU(),
    nn.Linear(10, 5),
    nn.ReLU(),
    nn.Linear(5, 1),
    nn.Sigmoid()
)
# BCELoss expects probabilities, which is why the model ends in Sigmoid.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 100
n_samples = len(train_ds)
for e in range(epochs):
    running_loss = 0
    # Running count of correct predictions; divided by n_samples when printed.
    accuracy = 0
    # One optimizer step per sample, in dataset order.
    for i in range(n_samples):
        X, Y = train_ds[i]
        optimizer.zero_grad()
        output = model(X)
        # The prediction is scored before this sample's weight update.
        accuracy += int(output[0] >= 0.5) == int(Y[0])
        loss = criterion(output, Y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {e}')
    print(f'Training loss: {running_loss / n_samples}')
    print(f'Accuracy: {accuracy / n_samples}')

Epoch 0
Training loss: 0.705431390445381
Accuracy: 0.45578635014836794
Epoch 1
Training loss: 0.69120494860393
Accuracy: 0.5465875370919882
Epoch 2
Training loss: 0.681382850979839
Accuracy: 0.5691394658753709
Epoch 3
Training loss: 0.6657873709009027
Accuracy: 0.6160237388724036
Epoch 4
Training loss: 0.6462225256979642
Accuracy: 0.6356083086053412
Epoch 5
Training loss: 0.6129042134352183
Accuracy: 0.6777448071216617
Epoch 6
Training loss: 0.5754708726788134
Accuracy: 0.7050445103857567
Epoch 7
Training loss: 0.5230725060360788
Accuracy: 0.7359050445103857
Epoch 8
Training loss: 0.46779226195816787
Accuracy: 0.7774480712166172
Epoch 9
Training loss: 0.4115459620569744
Accuracy: 0.8112759643916914
Epoch 10
Training loss: 0.3657210823491981
Accuracy: 0.8427299703264095
Epoch 11
Training loss: 0.3333219807252474
Accuracy: 0.855192878338279


KeyboardInterrupt: 