In [None]:
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle

!rm -r sample_data
!kaggle competitions download -c jpx-tokyo-stock-exchange-prediction
!unzip ./jpx-tokyo-stock-exchange-prediction.zip -d jpx-tokyo-stock-exchange-prediction

In [2]:
import torch
import numpy as np
import pandas as pd

from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [3]:
class time2vec(nn.Module):
  """Embed the input with one affine channel plus ``out_features - 1`` sine channels.

  All weights and biases are drawn from ``torch.rand`` (uniform on [0, 1)).

  Args:
    in_features: size of the last dimension of the input.
    out_features: size of the last dimension of the output; the first
      channel is affine, the remaining ``out_features - 1`` are sinusoidal.
  """
  def __init__(self, in_features, out_features):
    super().__init__()
    periodic_dim = out_features - 1

    # Affine ("trend") channel.
    self.w_linear = nn.Parameter(torch.rand(in_features, 1))
    self.b_linear = nn.Parameter(torch.rand(1))
    # Sinusoidal ("periodic") channels.
    self.w_function = nn.Parameter(torch.rand(in_features, periodic_dim))
    self.b_function = nn.Parameter(torch.rand(periodic_dim))

    # Possible follow-ups: express both projections as nn.Linear layers,
    # and/or use an explicit initialisation scheme instead of torch.rand.

  def forward(self, x):
    """Return ``[x @ w_linear + b_linear, sin(x @ w_function + b_function)]`` joined on the last dim."""
    trend = x @ self.w_linear + self.b_linear
    periodic = (x @ self.w_function + self.b_function).sin()
    return torch.cat([trend, periodic], dim=-1)

In [4]:
class TSTransformer(nn.Module):
  """Transformer-encoder regressor over sequences augmented with time2vec features.

  Pipeline: time2vec(x) is concatenated to x on the last dim, passed through a
  stack of ``nn.TransformerEncoderLayer`` blocks, then through a ReLU MLP and a
  linear head with a single output unit.

  Args:
    in_features: size of the last dimension of the input.
    time_features: number of channels produced by time2vec.
    mlp_dim: hidden width of the MLP that follows the encoder.
    enc_layers: number of stacked encoder layers.
    enc_heads: attention heads per encoder layer.
  """
  def __init__(self, in_features, time_features=1, mlp_dim=1024, enc_layers=2, enc_heads=2):
    super().__init__()
    d_model = in_features + time_features

    self.time2vec = time2vec(in_features, time_features)

    # The template layer is kept local on purpose: nn.TransformerEncoder
    # deep-copies it num_layers times, so storing the template on `self` would
    # register a second set of parameters that never take part in forward()
    # (dead weights in state_dict/optimizer, "unused parameter" errors in DDP).
    # NOTE: state dicts saved by the earlier version carry extra
    # `encoder_layer.*` keys; load those with strict=False.
    encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=enc_heads,
                                               dropout=0, activation=F.gelu, batch_first=True, norm_first=False)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=enc_layers)

    self.mlp = nn.Linear(d_model, mlp_dim)
    self.regressor = nn.Linear(mlp_dim, 1)

  def forward(self, x):
    """Return one regression value per position; the last dim of the output has size 1.

    Assumes x is (batch, seq_len, in_features), matching batch_first=True — TODO confirm with the caller.
    """
    time_embeddings = self.time2vec(x)
    x = torch.concat((x, time_embeddings), dim=-1)
    x = self.encoder(x)

    x = F.relu(self.mlp(x))
    x = self.regressor(x)

    return x

In [None]:
# Disabled alternative for this experiment: drop rows whose Close is null, plus the columns that are not needed.
#dframe_1 = dframe[~dframe['Close'].isnull()].drop(['RowId', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag'], axis=1)

In [5]:
# Number of rows per training window.
seq_len = 128

# Fill values; presumably for padded positions and for missing prices — TODO confirm once padding is wired in.
padding_token = 0.0
missing_token = -1.0


# Daily price table from the competition archive; 'Date' is parsed to datetime on load.
dframe = pd.read_csv(
  'jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv',
  parse_dates=['Date'],
)
# Distinct security codes, in order of first appearance.
stock_list = dframe['SecuritiesCode'].unique()

In [6]:
# Keep Date, SecuritiesCode, Close and Target only, then order the rows by
# security and date so that each security occupies one contiguous block.
dframe_1 = (
  dframe
  .drop(columns=['Open', 'High', 'Low', 'Volume', 'RowId', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag'])
  .sort_values(by=['SecuritiesCode', 'Date'])
  .reset_index(drop=True)
)
dframe_1

Unnamed: 0,Date,SecuritiesCode,Close,Target
0,2017-01-04,1301,2742.0,0.000730
1,2017-01-05,1301,2738.0,0.002920
2,2017-01-06,1301,2740.0,-0.001092
3,2017-01-10,1301,2748.0,-0.005100
4,2017-01-11,1301,2745.0,-0.003295
...,...,...,...,...
2332526,2021-11-29,9997,668.0,0.026987
2332527,2021-11-30,9997,667.0,-0.001460
2332528,2021-12-01,9997,685.0,0.017544
2332529,2021-12-02,9997,684.0,0.014368


In [7]:
####Testing dataset ideas ----------
# Cut the frame into fixed windows of seq_len rows. The negated floor division
# is a ceiling division, so the trailing partial window is kept; its end may
# run past the frame, which slicing tolerates.
start = 0
indices = []
for _ in range(-(len(dframe_1) // -seq_len)):
  indices.append((start, start+seq_len))
  start+= seq_len


#Creates new idx array
new_idxs = []
for start, end in indices:
  # Peel one single-security run off the front at a time, so a window that
  # spans three or more securities is split completely rather than only once.
  while True:
    # sort=False keeps first-appearance order: intervals[0] is the length of the leading run.
    intervals = dframe_1[start:end]['SecuritiesCode'].value_counts(sort=False).values
    if len(intervals) <= 1:
      new_idxs.append((start, end))
      break
    new_idxs.append((start, start+intervals[0]))
    start += intervals[0]

#Works in place
idx = 0
while idx<len(indices):
  start, end = indices[idx]
  intervals = dframe_1[start:end]['SecuritiesCode'].value_counts(sort=False).values
  if len(intervals) > 1:
    # Replace the mixed window by its leading run and the remainder. Only step
    # past the leading run: the remainder is re-examined on the next pass
    # because it can still contain more than one security.
    indices[idx:idx+1] = [(start, start+intervals[0]), (start+intervals[0], end)]
  idx+=1

In [7]:
class TSDataset(Dataset):
  """Map-style dataset of row windows that never mix two securities.

  The frame is cut every ``seq_len`` rows (counted from row 0 of the whole
  frame) and additionally wherever 'SecuritiesCode' changes, so every item
  covers between 1 and ``seq_len`` consecutive rows of a single security.
  Items are NOT padded to a common length.

  Args:
    dataframe: frame with 'SecuritiesCode', 'Close' and 'Target' columns.
      Rows of one security are expected to be contiguous (e.g. sorted by
      SecuritiesCode, then Date).
    seq_len: maximum number of rows per window.

  Raises:
    ValueError: if ``seq_len`` is smaller than 1.
  """
  def __init__(self, dataframe, seq_len=128):
    if seq_len < 1:
      raise ValueError(f"seq_len must be a positive integer, got {seq_len}")
    self.df = dataframe

    # Work on the frame that was passed in, not on a notebook global.
    codes = self.df['SecuritiesCode'].to_numpy()
    n_rows = len(codes)

    # Candidate cut points: the fixed window grid, and every row at which the security code changes.
    window_starts = np.arange(0, n_rows, seq_len)
    run_starts = np.flatnonzero(codes[1:] != codes[:-1]) + 1

    # Sorted union of both sets, closed by the end of the frame. Adjacent
    # pairs are the final (start, end) windows; a window spanning any number
    # of securities is therefore split at every boundary in a single pass.
    edges = np.append(np.union1d(window_starts, run_starts), n_rows)
    self.indices = [(int(lo), int(hi)) for lo, hi in zip(edges[:-1], edges[1:])]

  def __len__(self):
    """Number of windows."""
    return len(self.indices)

  def __getitem__(self, idx):
    """Return ``(sequence, target)`` for window ``idx``.

    sequence: 'Close' values of the window as an array of shape (rows, 1).
    target: 'Target' value of the last row of the window.
    """
    start, end = self.indices[idx]
    window = self.df.iloc[start:end]
    sequence = np.expand_dims(window['Close'].values, 1)
    target = window['Target'].values[-1]
    return sequence, target

In [None]:
#needs to be modified
def pad(seq, seq_len, padding_token):
  """Split a window into features and target, right-padding the features to ``seq_len`` rows.

  Args:
    seq: DataFrame window that contains a 'Target' column.
    seq_len: number of rows the returned feature array must have.
    padding_token: constant written into the appended rows.

  Returns:
    (features, target): every column except 'Target' as an array with
    ``seq_len`` rows, and the 'Target' value of the last row of ``seq``.
  """
  target = seq['Target'].iloc[-1]
  features = seq.drop(columns='Target').values

  # Rows are appended at the bottom only; the column axis is left untouched.
  n_missing = seq_len - features.shape[0]
  padded = np.pad(features, ((0, n_missing), (0, 0)), mode='constant', constant_values=padding_token)
  return padded, target

In [10]:
# Smoke test: print the shape of the first 32 windows. Windows cut at a
# security boundary come out shorter than seq_len, so padding is still needed
# before these can be batched.
testo_dset = TSDataset(dframe_1, seq_len=128)
for sample_idx in range(32):
  sequence, _ = testo_dset[sample_idx]
  print(sample_idx, sequence.shape)

0 (128, 1)
1 (128, 1)
2 (128, 1)
3 (128, 1)
4 (128, 1)
5 (128, 1)
6 (128, 1)
7 (128, 1)
8 (128, 1)
9 (50, 1)
10 (78, 1)
11 (128, 1)
12 (128, 1)
13 (128, 1)
14 (128, 1)
15 (128, 1)
16 (128, 1)
17 (128, 1)
18 (128, 1)
19 (100, 1)
20 (28, 1)
21 (128, 1)
22 (128, 1)
23 (128, 1)
24 (128, 1)
25 (128, 1)
26 (128, 1)
27 (128, 1)
28 (128, 1)
29 (128, 1)
30 (22, 1)
31 (106, 1)
