In [213]:
import pandas as pd
import torch.utils.data as Data
import torch.nn as nn
import numpy as np
import torch
from sklearn.decomposition import PCA
from tqdm import tqdm

In [214]:
TOTAL_FEATURES = 6
DAYS = 60
BATCH_SIZE = 32

In [215]:
class StockMarketDataReg(Data.Dataset):
    def __init__(self, train=None, split=0.2, target_market="S&P", days=60):
        self.days = days + 1 # as last entry is to be predicted

        df = pd.read_csv(f"../data/Processed_{target_market}.csv")
        df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
        df = df.sort_values(by='Date')
        df = df.drop(columns=['Name', 'Date'])
        df = df.fillna(0)
        
        # outlier detection
        for col in df.columns:
            q1, q3 = df[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            lb, rb = q1 - 1.5 * iqr, q3 + 1.5 * iqr
            for i in range(len(df)):
                if df[col][i] > rb:
                    df[col][i] = rb
                if df[col][i] < lb:
                    df[col][i] = lb

        self.feature_list = list(set(df.columns) - set(['Close']))
        if train is None:
            u = df.iloc[0: int(len(df) * (1 - split))]
            # pca
            self.pca = PCA(TOTAL_FEATURES - 1)
            self.pca_data = self.pca.fit_transform(u[self.feature_list])
            self.closing_prices = u['Close'].to_numpy()
            self.data = np.hstack(( self.closing_prices.reshape((-1, 1)), self.pca_data ))
        else:
            u = df.iloc[int(len(df) * (1 - split)) - self.days:]
            # pca
            self.pca_data = train.pca.transform(u[self.feature_list])
            self.closing_prices = u['Close'].to_numpy()
            self.data = np.hstack(( self.closing_prices.reshape((-1, 1)), self.pca_data ))

        self.num_rows = u.shape[0]
    
    def __getitem__(self, idx):
        end_idx = self.days + idx - 1
        market_tens = torch.from_numpy(self.data[idx: end_idx + 1])
        return market_tens.float()
    
    def __len__(self):
        return self.num_rows - self.days

In [216]:
train_ds = StockMarketDataReg(target_market='S&P', split=0.2, days=DAYS)
test_ds = StockMarketDataReg(target_market='S&P', split=0.2, days=DAYS, train=train_ds)

In [217]:
train_dataloader = Data.DataLoader(train_ds, batch_size=BATCH_SIZE)

In [218]:
sample = next(iter(train_dataloader))
sample.shape

torch.Size([32, 61, 6])

In [219]:
class Generator(nn.Module):
    '''
    Generates fake data using features of past DAYS days
    '''
    def __init__(self, num_features, days, hidden_size=300):
        super().__init__()
        self.num_features = num_features
        self.days = days
        self.hidden_size = hidden_size
        # model
        self.lstm = nn.LSTM(input_size=self.num_features, hidden_size=self.hidden_size, num_layers=1, batch_first=True)
        self.decoder = nn.Sequential(
                nn.Linear(self.hidden_size * self.days, self.num_features),
                nn.Tanh()
            )
    def forward(self, inp):
        out1, _ = self.lstm(inp)
        out1 = out1.reshape((-1, self.hidden_size * self.days))
        out2 = self.decoder(out1)
        out2 = out2.squeeze()
        return out2

In [220]:
class Discriminator(nn.Module):
    '''
    Discriminates between real data and fake data of DAYS + 1 days
    '''
    def __init__(self, num_features, c=5):
        # c denotes flatenning constant
        super().__init__()
        self.c = c
        self.num_features = num_features
        self.convs = nn.Sequential(
            nn.Conv1d(self.num_features, 32, kernel_size=5, stride=2),
            nn.LeakyReLU(0.01),
            nn.Conv1d(32, 64, kernel_size=5, stride=2),
            nn.LeakyReLU(0.01),
            nn.BatchNorm1d(64, momentum=0.9, eps=1e-05), 
            nn.Conv1d(64, 128, kernel_size=5, stride=2),
            nn.LeakyReLU(0.01),
            nn.BatchNorm1d(128, momentum=0.9, eps=1e-05),
        )
        self.dense = nn.Sequential(
            nn.Linear(128 * self.c, 220),
            nn.BatchNorm1d(220, momentum=0.9, eps=1e-05),
            nn.LeakyReLU(0.01),
            nn.Linear(220, 220),
            nn.ReLU(),
            nn.Linear(220, 1),
            nn.Sigmoid()
        )
    def forward(self, inp):
        inp = torch.transpose(inp, 2, 1)
        out1 = self.convs(inp)
        out1 = out1.reshape(-1, 128 * self.c)
        out2 = self.dense(out1)
        out2 = out2.squeeze()
        return out2

In [221]:
criterion = torch.nn.BCELoss()

generator = Generator(TOTAL_FEATURES, DAYS)
discriminator = Discriminator(TOTAL_FEATURES)
    
optimizer_G = torch.optim.Adam(generator.parameters(), lr=0.002)
optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=0.002)

In [222]:
NUM_EPOCHS = 100

In [223]:
for epoch in range(NUM_EPOCHS):
    running_G_loss, running_D_loss = 0, 0
    mae, cnt = 0, 0
    for data in tqdm(train_dataloader):
        #### discriminator
        discriminator.zero_grad()
        real_prev, real_now = data[:, :-1], data[:, -1]
        labels = torch.ones(len(data)).float()
        
        ## real
        # forward
        output = discriminator(data)
        # loss
        error_D_real = criterion(output, labels)
        error_D_real.backward()

        ## fake
        labels.fill_(0)
        # forward on generator
        fake_now = generator(real_prev)
        fake = torch.hstack((real_prev, fake_now.unsqueeze(1)))
        # forward on discriminator
        output = discriminator(fake.detach())
        # loss
        error_D_fake = criterion(output, labels)
        error_D_fake.backward()
        
        # step
        optimizer_D.step()

        #### generator
        labels.fill_(1)
        optimizer_G.zero_grad()
        # calculate discriminator output
        output = discriminator(fake)
        # loss
        error_G = criterion(output, labels)
        error_G.backward()
        # step
        optimizer_G.step()
        
        error_D = error_D_real + error_D_fake
        running_D_loss += error_D.item()
        running_G_loss += error_G.item()
        mae += abs(real_now[:, 0] - fake_now[:, 0]).sum().item()
        cnt += 1

    mae /= cnt        
    print(f'epoch {epoch + 1}: g_loss={running_G_loss:.4f}, d_loss={running_D_loss:.4f}, mae={mae:.4f}')        
    print()

100%|██████████| 48/48 [00:15<00:00,  3.09it/s]


epoch 1: g_loss=113.8384, d_loss=35.4277, mae=51087.7522



100%|██████████| 48/48 [00:08<00:00,  5.58it/s]


epoch 2: g_loss=259.8348, d_loss=9.2406, mae=51087.0350



100%|██████████| 48/48 [00:08<00:00,  5.44it/s]


epoch 3: g_loss=287.1808, d_loss=12.0325, mae=51087.0350



100%|██████████| 48/48 [00:08<00:00,  5.66it/s]


epoch 4: g_loss=250.8051, d_loss=7.8515, mae=51087.0350



100%|██████████| 48/48 [00:08<00:00,  5.84it/s]


epoch 5: g_loss=302.1500, d_loss=7.9688, mae=51087.0350



100%|██████████| 48/48 [00:08<00:00,  5.84it/s]


epoch 6: g_loss=234.4207, d_loss=7.7787, mae=51087.0350



100%|██████████| 48/48 [00:08<00:00,  6.00it/s]


epoch 7: g_loss=360.6979, d_loss=2.2033, mae=51087.0350



100%|██████████| 48/48 [00:07<00:00,  6.03it/s]


epoch 8: g_loss=277.1443, d_loss=2.7217, mae=51087.0350



100%|██████████| 48/48 [00:07<00:00,  6.02it/s]


epoch 9: g_loss=389.9433, d_loss=5.1676, mae=51087.0350



100%|██████████| 48/48 [00:07<00:00,  6.07it/s]


epoch 10: g_loss=294.4085, d_loss=3.9021, mae=51087.0350



100%|██████████| 48/48 [00:08<00:00,  5.66it/s]


epoch 11: g_loss=408.3199, d_loss=0.9840, mae=51087.0350



100%|██████████| 48/48 [00:07<00:00,  6.14it/s]


epoch 12: g_loss=419.9713, d_loss=0.1993, mae=51087.0350



100%|██████████| 48/48 [00:07<00:00,  6.30it/s]


epoch 13: g_loss=438.6294, d_loss=0.0953, mae=51087.0350



100%|██████████| 48/48 [00:07<00:00,  6.07it/s]


epoch 14: g_loss=456.5538, d_loss=0.0653, mae=51087.0350



100%|██████████| 48/48 [00:07<00:00,  6.12it/s]


epoch 15: g_loss=469.9033, d_loss=0.0506, mae=51087.0350



100%|██████████| 48/48 [00:09<00:00,  5.16it/s]


epoch 16: g_loss=480.9430, d_loss=0.0411, mae=51087.0350



100%|██████████| 48/48 [00:09<00:00,  5.22it/s]


epoch 17: g_loss=490.4641, d_loss=0.0344, mae=51087.0350



100%|██████████| 48/48 [00:08<00:00,  5.47it/s]


epoch 18: g_loss=498.8964, d_loss=0.0294, mae=51087.0350



100%|██████████| 48/48 [00:08<00:00,  5.55it/s]


epoch 19: g_loss=506.5244, d_loss=0.0256, mae=51087.0350



100%|██████████| 48/48 [00:08<00:00,  5.38it/s]


epoch 20: g_loss=513.5109, d_loss=0.0225, mae=51087.0350



100%|██████████| 48/48 [00:07<00:00,  6.31it/s]


epoch 21: g_loss=519.9827, d_loss=0.0199, mae=51087.0350



100%|██████████| 48/48 [00:09<00:00,  5.12it/s]


epoch 22: g_loss=526.0050, d_loss=0.0178, mae=51087.0350



 44%|████▍     | 21/48 [00:04<00:05,  5.04it/s]


KeyboardInterrupt: 