## PCA + ANN Baseline
* Reference paper: https://jfin-swufe.springeropen.com/track/pdf/10.1186/s40854-019-0138-0.pdf

In [22]:
import pandas as pd
import pandas as pd
import torch.utils.data as Data
import torch
import numpy as np
from sklearn.decomposition import PCA
pd.set_option('display.max_columns', 500)

In [23]:
# Load the S&P CSV into the module-level DataFrame that the cells below
# preprocess and split.
base_df = pd.read_csv('data/Processed_S&P.csv')

## Preprocessing

In [24]:
# Parse the Date strings as datetimes and order the rows chronologically.
base_df['Date'] = pd.to_datetime(base_df['Date'], format='%Y-%m-%d')
base_df.sort_values(by='Date', inplace=True)
# sort_values keeps the original index labels attached to the moved rows.
# Renumber them 0..n-1 so that label-based and positional lookups agree for
# every later consumer of base_df.
base_df.reset_index(drop=True, inplace=True)
# 'Name' and 'Date' are not used as model features; `columns=` already
# implies the column axis, so no separate `axis` argument is needed.
base_df.drop(columns=['Name', 'Date'], inplace=True)

In [25]:
# Replace every missing value with 0, in place, so no NaNs remain in base_df.
base_df.fillna(0, inplace=True)

In [26]:
# Clip outliers column by column: any value outside the fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] is replaced by the nearest fence.
for col in base_df.columns:
    q1, q3 = base_df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lb, rb = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Assign the clipped column back as a whole. Element-wise chained
    # assignment (base_df[col][i] = ...) may write to a temporary copy and is
    # a no-op under pandas copy-on-write; Series.clip is also vectorised.
    base_df[col] = base_df[col].clip(lower=lb, upper=rb)

## Dataloader
* PCA is fit on the standardized training split with the default `n_components`, which yields 82 components here

In [28]:
class MarketDataset(Data.Dataset):
    """Row-wise dataset over the module-level ``base_df`` with up/down labels.

    The first ``1 - TEST_RATIO`` fraction of rows forms the non-test split and
    the remainder forms the test split. Item ``idx`` is the feature row at
    position ``idx`` together with a binary label that is 1 when the next
    row's ``'Close'`` is strictly greater than the current row's, else 0.

    For the non-test split the rows are standardized with that split's own
    mean and standard deviation (stored as ``self.mean`` / ``self.std``) and a
    PCA is fit on the result (stored as ``self.pca``). The fitted PCA is not
    applied to the items returned by ``__getitem__``.
    """
    DATA_PATH = 'data/Processed_S&P.csv'
    TRAIN_RATIO = 0.7
    VALIDATION_RATIO = 0.15
    TEST_RATIO = 0.15

    def __init__(self, test=False):
        """Select the requested split of ``base_df``.

        Args:
            test: When True use the trailing ``TEST_RATIO`` fraction of rows;
                otherwise use the leading rows, standardize them and fit PCA.
        """
        # Position of the first test row; everything before it is non-test.
        split = int((1 - MarketDataset.TEST_RATIO) * len(base_df))
        if test:
            # NOTE(review): the test split is kept unstandardized here and no
            # mean/std/pca attributes are set -- confirm the caller applies
            # the training statistics before evaluation.
            df = base_df.iloc[split:]
        else:
            df = base_df.iloc[:split]
            self.mean = df.mean()
            self.std = df.std()
            df = (df - self.mean) / self.std
            self.pca = PCA()
            self.pca.fit(df)
            print(len(self.pca.singular_values_))

        self.df = df

    def get_label(self, idx):
        """Return 1 if 'Close' rises from position ``idx`` to ``idx + 1``, else 0.

        Lookups are positional so that they match ``__getitem__`` and work for
        a split whose index labels do not start at 0 (e.g. the test split).

        Raises:
            IndexError: If ``idx + 1`` is past the last row.
        """
        close = self.df['Close']
        return int(close.iloc[idx + 1] > close.iloc[idx])

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row at position ``idx``.

        ``features`` is a float32 tensor holding every column of the row.
        """
        # Go through NumPy so torch receives a plain numeric array rather
        # than a pandas Series.
        row = self.df.iloc[idx].to_numpy()
        return torch.tensor(row).float(), self.get_label(idx)

    def __len__(self):
        """Number of usable items: the last row has no successor to label it."""
        return len(self.df) - 1

# Build the non-test split; the constructor fits a PCA on it and prints the
# number of singular values (the cell output below).
train_ds = MarketDataset(test=False)

82
