In [None]:
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle

!rm -r sample_data
!kaggle competitions download -c jpx-tokyo-stock-exchange-prediction
!unzip ./jpx-tokyo-stock-exchange-prediction.zip -d jpx-tokyo-stock-exchange-prediction

In [None]:
!pip install wandb
!wandb login

In [3]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
class TSDataset(Dataset):
  """Windows of closing prices, each window belonging to a single security.

  The frame is cut every `seq_len` rows and additionally wherever the
  'SecuritiesCode' value changes between consecutive rows, so no window mixes
  two securities. Windows shorter than `seq_len` are left-padded.

  Args:
    df: frame with 'SecuritiesCode', 'Close' and 'Target' columns (plus a
      datetime 'Date' column when `vec_dates` is True). Rows of one security
      are expected to be contiguous; otherwise every run of equal codes
      becomes its own window.
    seq_len: length of every returned sequence (must be >= 1).
    padding_token: value used to left-pad short windows.
    vec_dates: when True each item also carries a (year, month, day) array.
    normalize: when True 'Close' is min-max scaled per security using the
      extremes found in `df`.

  Each item is a dict with:
    'sequence': float array of shape (seq_len, 1).
    'date': array of shape (seq_len, 3), only when `vec_dates` is True.
    'mask': bool array of shape (seq_len, 1), True on padded positions.
    'target': the 'Target' value of the window's last row.
  """

  def __init__(self, df, seq_len=128, padding_token=0, vec_dates=True, normalize=True):
    if seq_len < 1:
      raise ValueError('seq_len must be a positive integer')

    self.vec_dates = vec_dates

    self.df = df
    self.seq_len = seq_len
    self.normalize = normalize
    self.padding_token = padding_token

    self.indices = self._build_indices(df['SecuritiesCode'].to_numpy(), seq_len)

    # Per-security (max, min) of 'Close', taken from the frame handed to this
    # dataset rather than from notebook-level globals.
    if normalize:
      close_stats = df.groupby('SecuritiesCode')['Close'].agg(['max', 'min'])
      self.norm_values = {
        stock: (stock_max, stock_min)
        for stock, stock_max, stock_min in zip(close_stats.index, close_stats['max'], close_stats['min'])
      }

  @staticmethod
  def _build_indices(codes, seq_len):
    """Return (start, end) row spans cut every `seq_len` rows and at every code change."""
    n_rows = len(codes)
    if n_rows == 0:
      return []

    cuts = set(range(0, n_rows, seq_len))
    # Row positions whose code differs from the previous row's code.
    cuts.update((np.flatnonzero(codes[1:] != codes[:-1]) + 1).tolist())
    cuts.add(n_rows)

    ordered = sorted(cuts)
    return list(zip(ordered[:-1], ordered[1:]))

  def __len__(self):
    return len(self.indices)

  def __getitem__(self, idx):
    start, end = self.indices[idx]
    seq_df = self.df.iloc[start:end]

    target = seq_df['Target'].values[-1]

    sequence = np.expand_dims(seq_df['Close'].values, 1)

    # Min-max scaling with the security's own extremes.
    if self.normalize:
      stock_max, stock_min = self.norm_values[seq_df['SecuritiesCode'].iloc[0]]
      price_range = stock_max - stock_min
      if price_range == 0:
        # Constant price: avoid 0/0 -> NaN.
        sequence = np.zeros_like(sequence, dtype=float)
      else:
        sequence = (sequence - stock_min) / price_range

    # Left padding, so the most recent observation stays in the last position.
    pad_len = self.seq_len - sequence.shape[0]
    if pad_len > 0:
      sequence = np.pad(sequence, pad_width=[(pad_len, 0), (0, 0)], mode='constant', constant_values=self.padding_token)

    # The mask is derived from the pad length, not from the values: a genuine
    # observation may equal the padding token (e.g. the normalised minimum is 0).
    padding_mask = np.zeros((self.seq_len, 1), dtype=bool)
    padding_mask[:pad_len] = True

    item = {'sequence': sequence}
    if self.vec_dates:
      dates = seq_df['Date'].dt
      date_vec = np.stack([dates.year.values, dates.month.values, dates.day.values], axis=1)
      item['date'] = np.pad(date_vec, pad_width=[(pad_len, 0), (0, 0)], mode='constant', constant_values=self.padding_token)
    item['mask'] = padding_mask
    item['target'] = target
    return item

In [5]:
class time2vec(nn.Module):
  """Embeds the last axis of the input into `out_features` channels.

  Channel 0 is an affine projection of the input; the remaining
  `out_features - 1` channels are the sine of an affine projection.

  Args:
    in_features: size of the input's last dimension.
    out_features: size of the output's last dimension.
  """

  def __init__(self, in_features, out_features):
    super().__init__()
    periodic_dim = out_features - 1

    # All four tensors are drawn with torch.rand and used as-is; no further
    # initialisation scheme is applied.
    self.w_linear = nn.Parameter(data=torch.rand(in_features, 1))
    self.b_linear = nn.Parameter(data=torch.rand(1))
    self.w_function = nn.Parameter(data=torch.rand(in_features, periodic_dim))
    self.b_function = nn.Parameter(data=torch.rand(periodic_dim))

  def forward(self, x):
    """Return the linear channel followed by the sine channels, joined on the last axis."""
    trend = x @ self.w_linear + self.b_linear
    periodic = torch.sin(x @ self.w_function + self.b_function)
    return torch.cat((trend, periodic), dim=-1)

In [6]:
class TSTransformer(nn.Module):
  """Transformer encoder regressor that embeds the input values with time2vec.

  The input sequence is concatenated with its own time2vec embedding, passed
  through a stack of encoder layers, and the encoding of the final time step
  is mapped to a single value by a two-layer head.

  Args:
    in_features: size of the last dimension of `seq`.
    time_features: number of channels produced by time2vec.
    mlp_dim: hidden width of the regression head.
    enc_layers: number of stacked encoder layers.
    enc_heads: attention heads per encoder layer.
  """

  def __init__(self, in_features, time_features=7, mlp_dim=1024, enc_layers=2, enc_heads=2):
    super().__init__()
    d_model = in_features + time_features

    self.time2vec = time2vec(in_features, time_features)
    # NOTE(review): nn.TransformerEncoder appears to clone the layer it is given,
    # so this attribute's own weights may be unused. Removing it would change the
    # state_dict keys, so it is kept; confirm before cleaning up.
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=enc_heads,
                                                    dropout=0, activation=F.gelu, batch_first=True,
                                                    norm_first=True)
    self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=enc_layers)

    self.mlp = nn.Linear(d_model, mlp_dim)
    self.regressor = nn.Linear(mlp_dim, 1)

  def forward(self, seq, mask):
    """Return one prediction per batch element.

    Args:
      seq: input sequences; batch-first because the encoder is built with
        batch_first=True.
      mask: forwarded unchanged as `src_key_padding_mask`.
        NOTE(review): pass a boolean mask (True = padded); PyTorch treats
        float masks as additive — confirm at the call site.

    Returns:
      The head's output for the last sequence position, one value per batch row.
    """
    time_embeddings = self.time2vec(seq)
    x = torch.cat((seq, time_embeddings), dim=-1)
    x = self.encoder(src=x, src_key_padding_mask=mask)

    # Only the last position is returned, so run the head on that position
    # alone instead of on every time step.
    last_step = x[:, -1, :]
    return self.regressor(F.relu(self.mlp(last_step)))

In [7]:
class TSTransformer_VecDates(nn.Module):
  """Transformer encoder regressor whose time2vec embedding is built from date vectors.

  The date vector of every step is embedded with time2vec, concatenated with
  the input sequence, passed through a stack of encoder layers, and the
  encoding of the final time step is mapped to a single value by a two-layer
  head.

  Args:
    in_features: size of the last dimension of `seq`.
    time_features: number of channels produced by time2vec.
    mlp_dim: hidden width of the regression head.
    enc_layers: number of stacked encoder layers.
    enc_heads: attention heads per encoder layer.
    date_features: size of the last dimension of `date_vec`
      (3 by default, the value previously hard-coded).
  """

  def __init__(self, in_features, time_features=7, mlp_dim=1024, enc_layers=2, enc_heads=2,
               date_features=3):
    super().__init__()
    d_model = in_features + time_features

    self.time2vec = time2vec(date_features, time_features)
    # NOTE(review): nn.TransformerEncoder appears to clone the layer it is given,
    # so this attribute's own weights may be unused. Removing it would change the
    # state_dict keys, so it is kept; confirm before cleaning up.
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=enc_heads,
                                                    dropout=0, activation=F.gelu, batch_first=True,
                                                    norm_first=True)
    self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=enc_layers)

    self.mlp = nn.Linear(d_model, mlp_dim)
    self.regressor = nn.Linear(mlp_dim, 1)

  def forward(self, seq, date_vec, mask):
    """Return one prediction per batch element.

    Args:
      seq: input sequences; batch-first because the encoder is built with
        batch_first=True.
      date_vec: per-step date features fed to time2vec; its last dimension
        must equal `date_features`.
      mask: forwarded unchanged as `src_key_padding_mask`.
        NOTE(review): pass a boolean mask (True = padded); PyTorch treats
        float masks as additive — confirm at the call site.

    Returns:
      The head's output for the last sequence position, one value per batch row.
    """
    time_embeddings = self.time2vec(date_vec)
    x = torch.cat((seq, time_embeddings), dim=-1)
    x = self.encoder(src=x, src_key_padding_mask=mask)

    # Only the last position is returned, so run the head on that position
    # alone instead of on every time step.
    last_step = x[:, -1, :]
    return self.regressor(F.relu(self.mlp(last_step)))

In [9]:
# Window length and padding value handed to TSDataset below.
seq_len = 128

padding_token = 0.0
missing_token = -1.0  # not referenced by the cells visible in this notebook


dframe = pd.read_csv('jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv', parse_dates=['Date'])

stock_list = dframe.SecuritiesCode.unique()

# Drop the columns this experiment does not use, discard rows without a closing
# price, and order rows by security then date so each security is contiguous.
dframe_1 = (
    dframe
    .drop(columns=['Open', 'High', 'Low', 'Volume', 'RowId', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag'])
    .dropna(subset=['Close'])
    .sort_values(['SecuritiesCode', 'Date'])
    .reset_index(drop=True)
)

# Use the constants defined above instead of repeating their values.
dset = TSDataset(dframe_1, seq_len=seq_len, padding_token=padding_token, vec_dates=True, normalize=True)
dloader = DataLoader(dset, batch_size=128, shuffle=True, num_workers=1)

In [30]:
import wandb

# Pass the hyper-parameters to wandb.init so they are stored with the run;
# assigning a dict to wandb.config only rebinds the module attribute.
wandb.init(project='TSTransformer_Test',
           config={'learning_rate': 1e-4,
                   'epochs': 5,
                   'batch_size': 128})
config = wandb.config

In [31]:
model = TSTransformer_VecDates(in_features=1, time_features=7).to(device)
wandb.watch(model)

lr = 1e-4
epochs = 5
crit = nn.MSELoss()
optim = torch.optim.Adam(model.parameters(), lr=lr)


# Loss of every optimisation step, in order.
running_loss = []
for epoch in range(epochs):
  for batch in dloader:
    seq, date, mask, target = batch['sequence'].to(device), batch['date'].to(device), batch['mask'].to(device), batch['target'].to(device)

    optim.zero_grad()

    # The key-padding mask must be boolean (True = ignore this position);
    # a float mask is added to the attention scores instead of masking them.
    out = model(seq.float(), date.float(), mask.squeeze(-1).bool())
    loss = crit(out.squeeze(-1), target.float())

    loss.backward()
    optim.step()

    # Log a plain Python float rather than a tensor attached to the graph.
    loss_value = loss.item()
    wandb.log({'loss': loss_value})
    running_loss.append(loss_value)

In [32]:
# Predictions left over from the last batch processed by the training loop.
out.squeeze(-1)

tensor([-0.0359, -0.0789,  0.0096,  0.0549,  0.0800, -0.0903,  0.0997,  0.0520,
        -0.0604,  0.0010, -0.0508, -0.0192, -0.0420, -0.0677, -0.0727, -0.0129,
         0.0014, -0.0856,  0.0680,  0.0275, -0.0184, -0.0192, -0.0032, -0.1767,
         0.0130, -0.0544,  0.0218, -0.0156,  0.0273, -0.1254,  0.0143, -0.0763,
        -0.0121, -0.1134, -0.0624, -0.0720,  0.0016, -0.0235, -0.0270, -0.0370,
        -0.0714,  0.0058,  0.0135,  0.0694,  0.0078, -0.0067], device='cuda:0',
       grad_fn=<SqueezeBackward1>)

In [33]:
# Targets of that same last batch, for a side-by-side look at the predictions.
target

tensor([-0.0117, -0.0125, -0.0114, -0.0038, -0.0248, -0.0023, -0.0421, -0.0057,
         0.0076,  0.0322, -0.0071,  0.0196, -0.0095,  0.0262, -0.0224,  0.0255,
         0.0062,  0.0065,  0.0016,  0.0061, -0.0120, -0.0082,  0.0090, -0.0110,
         0.0194,  0.0233,  0.0447,  0.0140, -0.0057, -0.0079, -0.0053, -0.0059,
        -0.0107,  0.0169,  0.0000, -0.0067,  0.0326,  0.0048,  0.0252,  0.0180,
         0.0050,  0.0429, -0.0091,  0.0095, -0.0377,  0.0045], device='cuda:0',
       dtype=torch.float64)