In [257]:
import pandas as pd
import torch.utils.data as Data
import torch.nn as nn
import numpy as np
import torch
from sklearn.decomposition import PCA
from tqdm import tqdm

In [258]:
TOTAL_FEATURES = 6
DAYS = 60
BATCH_SIZE = 128

In [259]:
class StockMarketDataReg(Data.Dataset):
    def __init__(self, train=None, split=0.2, target_market="S&P", days=60):
        self.days = days + 1 # as last entry is to be predicted

        df = pd.read_csv(f"../data/Processed_{target_market}.csv")
        df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
        df = df.sort_values(by='Date')
        df = df.drop(columns=['Name', 'Date'])
        df = df.fillna(0)
        
        # outlier detection
        for col in df.columns:
            q1, q3 = df[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            lb, rb = q1 - 1.5 * iqr, q3 + 1.5 * iqr
            for i in range(len(df)):
                if df[col][i] > rb:
                    df[col][i] = rb
                if df[col][i] < lb:
                    df[col][i] = lb

        self.feature_list = list(set(df.columns) - set(['Close']))
        if train is None:
            u = df.iloc[0: int(len(df) * (1 - split))]
            # pca
            self.pca = PCA(TOTAL_FEATURES - 1)
            self.pca_data = self.pca.fit_transform(u[self.feature_list])
            self.closing_prices = u['Close'].to_numpy()
            self.data = np.hstack(( self.closing_prices.reshape((-1, 1)), self.pca_data ))
        else:
            u = df.iloc[int(len(df) * (1 - split)) - self.days:]
            # pca
            self.pca_data = train.pca.transform(u[self.feature_list])
            self.closing_prices = u['Close'].to_numpy()
            self.data = np.hstack(( self.closing_prices.reshape((-1, 1)), self.pca_data ))

        self.num_rows = u.shape[0]
    
    def __getitem__(self, idx):
        end_idx = self.days + idx - 1
        market_tens = torch.from_numpy(self.data[idx: end_idx + 1])
        return market_tens.float()
    
    def __len__(self):
        return self.num_rows - self.days

In [260]:
train_ds = StockMarketDataReg(target_market='S&P', split=0.2, days=DAYS)
test_ds = StockMarketDataReg(target_market='S&P', split=0.2, days=DAYS, train=train_ds)

In [261]:
train_dataloader = Data.DataLoader(train_ds, batch_size=BATCH_SIZE)

In [262]:
sample = next(iter(train_dataloader))
sample.shape

torch.Size([128, 61, 6])

In [263]:
class Generator(nn.Module):
    '''
    Generates fake data using features of past DAYS days
    '''
    def __init__(self, num_features, days, hidden_size=300):
        super().__init__()
        self.num_features = num_features
        self.days = days
        self.hidden_size = hidden_size
        # model
        self.lstm = nn.LSTM(input_size=self.num_features, hidden_size=self.hidden_size, num_layers=1, batch_first=True)
        self.decoder = nn.Sequential(
                nn.Linear(self.hidden_size * self.days, self.num_features),
                nn.Tanh()
            )
    def forward(self, inp):
        out1, _ = self.lstm(inp)
        out1 = out1.reshape((-1, self.hidden_size * self.days))
        out2 = self.decoder(out1)
        out2 = out2.squeeze()
        return out2

In [264]:
class Discriminator(nn.Module):
    '''
    Discriminates between real data and fake data of DAYS + 1 days
    '''
    def __init__(self, num_features, c=5):
        # c denotes flatenning constant
        super().__init__()
        self.c = c
        self.num_features = num_features
        self.convs = nn.Sequential(
            nn.Conv1d(self.num_features, 32, kernel_size=5, stride=2),
            nn.LeakyReLU(0.01),
            nn.Conv1d(32, 64, kernel_size=5, stride=2),
            nn.LeakyReLU(0.01),
            nn.BatchNorm1d(64, momentum=0.9, eps=1e-05), 
            nn.Conv1d(64, 128, kernel_size=5, stride=2),
            nn.LeakyReLU(0.01),
            nn.BatchNorm1d(128, momentum=0.9, eps=1e-05),
        )
        self.dense = nn.Sequential(
            nn.Linear(128 * self.c, 220),
            nn.BatchNorm1d(220, momentum=0.9, eps=1e-05),
            nn.LeakyReLU(0.01),
            nn.Linear(220, 220),
            nn.LeakyReLU(0.01),
            nn.Linear(220, 1),
            nn.Sigmoid()
        )
    def forward(self, inp):
        inp = torch.transpose(inp, 2, 1)
        out1 = self.convs(inp)
        out1 = out1.reshape(-1, 128 * self.c)
        out2 = self.dense(out1)
        out2 = out2.squeeze()
        return out2

In [265]:
criterion = torch.nn.BCELoss()

generator = Generator(TOTAL_FEATURES, DAYS)
discriminator = Discriminator(TOTAL_FEATURES)
    
optimizer_G = torch.optim.Adam(generator.parameters(), lr=0.001)
optimizer_D = torch.optim.SGD(discriminator.parameters(), lr=0.001)

In [266]:
NUM_EPOCHS = 100

In [267]:
for epoch in range(NUM_EPOCHS):
    running_G_loss, running_D_loss = 0, 0
    mae, cnt = 0, 0
    for data in tqdm(train_dataloader):
        #### discriminator
        discriminator.zero_grad()
        real_prev, real_now = data[:, :-1], data[:, -1]
        labels = torch.ones(len(data)).float()
        
        ## real
        # forward
        output = discriminator(data)
        # loss
        error_D_real = criterion(output, labels)
        error_D_real.backward()

        ## fake
        labels.fill_(0)
        # forward on generator
        fake_now = generator(real_prev)
        fake = torch.hstack((real_prev, fake_now.unsqueeze(1)))
        # forward on discriminator
        output = discriminator(fake.detach())
        # loss
        error_D_fake = criterion(output, labels)
        error_D_fake.backward()
        
        # step
        optimizer_D.step()

        #### generator
        labels.fill_(1)
        optimizer_G.zero_grad()
        # calculate discriminator output
        output = discriminator(fake)
        # loss
        error_G = criterion(output, labels)
        error_G.backward()
        # step
        optimizer_G.step()
        
        error_D = error_D_real + error_D_fake
        running_D_loss += error_D.item()
        running_G_loss += error_G.item()
        mae += abs(real_now[:, 0] - fake_now[:, 0]).sum().item()
        cnt += len(data)

    mae /= cnt        
    print(f'epoch {epoch + 1}: g_loss={running_G_loss:.4f}, d_loss={running_D_loss:.4f}, mae={mae:.4f}')        
    print()

100%|██████████| 12/12 [00:20<00:00,  1.72s/it]


epoch 1: g_loss=8.7490, d_loss=16.6505, mae=1608.8606



100%|██████████| 12/12 [00:20<00:00,  1.74s/it]


epoch 2: g_loss=8.7342, d_loss=16.6267, mae=1608.9316



100%|██████████| 12/12 [00:19<00:00,  1.60s/it]


epoch 3: g_loss=8.7143, d_loss=16.5887, mae=1608.9316



100%|██████████| 12/12 [00:18<00:00,  1.54s/it]


epoch 4: g_loss=8.6937, d_loss=16.5652, mae=1608.9316



100%|██████████| 12/12 [00:19<00:00,  1.63s/it]


epoch 5: g_loss=8.6778, d_loss=16.5474, mae=1608.9316



100%|██████████| 12/12 [00:12<00:00,  1.06s/it]


epoch 6: g_loss=8.6696, d_loss=16.5189, mae=1608.9316



100%|██████████| 12/12 [00:10<00:00,  1.11it/s]


epoch 7: g_loss=8.6641, d_loss=16.5000, mae=1608.9316



100%|██████████| 12/12 [00:12<00:00,  1.01s/it]


epoch 8: g_loss=8.6524, d_loss=16.4800, mae=1608.9316



100%|██████████| 12/12 [00:12<00:00,  1.06s/it]


epoch 9: g_loss=8.6423, d_loss=16.4615, mae=1608.9316



 42%|████▏     | 5/12 [00:04<00:06,  1.08it/s]


KeyboardInterrupt: 