<a href="https://colab.research.google.com/github/daisuke-8080/kaggle_jpx/blob/main/%E6%A0%AA%E4%BE%A1%E4%BA%88%E6%B8%AC%E7%94%A8_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
df = pd.read_csv('/content/drive/MyDrive/kaggle/informer_train_csv/1301.csv')

In [None]:
from torch.utils.data import Dataset

class src_train(Dataset):
  def __init__(self, df):
    super().__init__()

    pred_len = 30
    self.df = df[:-(pred_len)]
    self.len = len(self.df)

  def __len__(self):
    return self.len
  
  def __getitem__(self, index):
    dataset = self.df[['Open','High','Low','Close']].iloc[index].values

    return dataset
from torch.utils.data import Dataset

class tgt_train(Dataset):
  def __init__(self, df):
    super().__init__()
    self.df = df

    index_list = []
    count = 0
    pred_len = 30
    for i in range(len(df)):
      count += 1
      seq_len = 60
      if count % seq_len == 0 and count <= len(df) - pred_len:
        index_list.append(i)
    
    df_ = pd.DataFrame()
    for i in index_list:
      df_x  = self.df.iloc[i : i + pred_len][['Open','High','Low','Close']]
      df_ = pd.concat((df_, df_x), axis=0)
    
    self.df_ = df_

    self.len = len(df_)

  def __len__(self):
    return self.len
  
  def __getitem__(self, index):
    tgt = self.df_.iloc[index][['Open','High','Low','Close']].values

    return tgt

In [None]:
dataset_ex_src = src_train(df)
dataset_ex_tgt = tgt_train(df)

In [None]:
from torch.utils.data import DataLoader
BATCH_SIZE=60

src_loader = DataLoader(dataset_ex_src, batch_size=BATCH_SIZE, drop_last=True, pin_memory=True)
from torch.utils.data import DataLoader
BATCH_SIZE=30

tgt_loader = DataLoader(dataset_ex_tgt, batch_size=BATCH_SIZE, drop_last=True, pin_memory=True)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
from torch import nn

class PositionalEncoding(nn.Module):
  def __init__(self, d_model,max_len, device):
      super().__init__()
      self.d_model = d_model
      self.max_len= max_len
      self.position_tensor = self.get_position_tensor().to(device=device)

  def get_positional_encoding(self,  pos, i):
      w = pos / ((10000**(2*i))/self.d_model)
      if i % 2  ==  0:
        return np.sin(w)
      else:
        return np.cos(w)

  def get_position_tensor(self):
      position_list = [[self.get_positional_encoding(pos, i) for i in range(1,self.d_model +1)] for pos in range(1,  self.max_len +1)]
      return torch.tensor(np.array(position_list)).float()

  def forward(self, x):
      seq_len = x.size(1)
      return x + self.position_tensor[:seq_len, :].unsqueeze(0).to(device=device)


class Scaled_dot_product_attention(nn.Module):
  def __init__(self, d_k):
      super().__init__()
      self.d_k = d_k

  def forward(self, q, k, v, mask=None):
      scaler = np.sqrt(self.d_k)
      attention_weight = torch.matmul(q, torch.transpose(k,1,2)) / scaler

      if mask is not None:
          if mask.dim() != attention_weight.dim():
              raise ValueError(
                  "mask.dim != attention_weight.dim, mask.dim={}, attention_weight.dim={}".format(
                      mask.dim(), attention_weight.dim()
                    )
                )
          attention_weight = attention_weight.data.masked_fill_(mask, -torch.finfo(torch.float).max)
          attention_weight = attention_weight.to(device=device)
      attention_weight = nn.functional.softmax(attention_weight, dim=2)

      return torch.matmul(attention_weight, v)
      

class Multihead_attention(nn.Module):
  def __init__(self, d_model, heads_num):
      super().__init__()
      self.h = heads_num
      self.d_model = d_model
      self.d_k = d_model // self.h
      self.d_v = d_model // self.h

      self.q_W = nn.Parameter(torch.Tensor(self.h, d_model, self.d_k).to(device=device))
      self.k_W = nn.Parameter(torch.Tensor(self.h, d_model, self.d_k).to(device=device))
      self.v_W = nn.Parameter(torch.Tensor(self.h, d_model, self.d_v).to(device=device))

      self.scaled_dot_product_attention = Scaled_dot_product_attention(self.d_k)
      self.linear = nn.Linear(self.h* self.d_v, d_model)

  def forward(self, q, k, v, mask=None):
      batch_size, seq_len = q.size(0),q.size(1)

      q = q.repeat(self.h, 1,1,1)
      k = k.repeat(self.h, 1,1,1)
      v = v.repeat(self.h, 1,1,1)

      q = torch.einsum('hijk, hkl ->hijl', q, self.q_W)
      k = torch.einsum('hijk, hkl ->hijl', k, self.k_W)
      v = torch.einsum('hijk, hkl ->hijl', v, self.v_W)

      q = q.view(self.h * batch_size, seq_len, self.d_k)
      k = k.view(self.h * batch_size, seq_len, self.d_k)
      v = v.view(self.h * batch_size, seq_len, self.d_v)

      if mask is not None:
        mask = mask.repeat(self.h, 1,1).to(device=device)

      attention_output = self.scaled_dot_product_attention(q, k, v, mask)
      attention_output = attention_output.to(device=device)
      attention_output = torch.chunk(attention_output, self.h, dim=0)
      attention_output = torch.cat(attention_output, dim=2)

      output = self.linear(attention_output)
      return output


class ffn(nn.Module):
  def __init__(self, d_model, d_ff):
      super().__init__()
      self.linear1= nn.Linear(d_model, d_ff)
      self.linear2 = nn.Linear(d_ff, d_model)
    
  def forward(self, x):
      return self.linear2(nn.functional.relu((self.linear1(x))))


class TransformerEncoderLayer(nn.Module):
  def  __init__(self, d_model, d_ff,  heads_num, dropout_rate, layer_norm_eps):
      super().__init__()
      self.multi_head_attention = Multihead_attention(d_model, heads_num)
      self.dropout_self_attention = nn.Dropout(dropout_rate)
      self.layer_norm_self_attention = nn.LayerNorm(d_model, eps=layer_norm_eps)

      self.ffn = ffn(d_model,d_ff)
      self.dropout_ffn = nn.Dropout(dropout_rate)
      self.layer_norm_ffn =  nn.LayerNorm(d_model, eps=layer_norm_eps)

  def __self_attention_block(self, x, mask=None):
      x = self.multi_head_attention(x,x,x, mask)
      x = self.dropout_self_attention(x)
      return x

  def __feed_forward_block(self,x):
      x = self.ffn(x)
      x = self.dropout_ffn(x)
      return x
    
  def forward(self,x, mask=None):
      #x = self.layer_norm_self_attention(self.__self_attention_block(x, mask) + x)
      x = self.__self_attention_block(x, mask) + x
      #x = self.layer_norm_ffn(self.__feed_forward_block(x) + x)
      x = self.__feed_forward_block(x) + x
      return x


class TransformerEncoder(nn.Module):
  def __init__(self, max_len, d_model, d_ff, heads_num, dropout_rate, layer_norm_eps, N, device):
      super().__init__()
      self.positional_encoding = PositionalEncoding(d_model, max_len, device=device) 
      self.encoder_layers = nn.ModuleList(
          [TransformerEncoderLayer(d_model, d_ff,  heads_num, dropout_rate, layer_norm_eps) 
          for _ in range(N)]
      )
  def forward(self, x, mask=None):
        x = self.positional_encoding(x)
        for encoder_layer in self.encoder_layers:
          x = encoder_layer(x, mask)
        return x


from torch.nn.modules.normalization import LayerNorm


class TransformerDecoderLayer(nn.Module):
  def __init__(self, d_model, d_ff,  heads_num, dropout_rate, layer_norm_eps):
      super().__init__() 

      self.self_attention = Multihead_attention(d_model, heads_num)
      self.dropout_self_attention = nn.Dropout(dropout_rate)
      self.layer_norm_self_attention = nn.LayerNorm(d_model, eps=layer_norm_eps)

      self.src_tgt_attention = Multihead_attention(d_model, heads_num)
      self.dropout_src_tgt_attention = nn.Dropout(dropout_rate)
      self.layer_norm_src_tgt_attention = nn.LayerNorm(d_model, eps=layer_norm_eps)

      self.ffn = ffn(d_model, d_ff)
      self.dropout_ffn = nn.Dropout(dropout_rate)
      self.layer_norm_ffn = nn.LayerNorm(d_model, eps=layer_norm_eps)

  def __src_tgt_attention_block(self, src, tgt, mask=None):
      return self.dropout_src_tgt_attention(self.src_tgt_attention(tgt, src, src, mask))

  def __self_attention_block(self, x, mask=None):
      return self.dropout_self_attention(self.self_attention(x, x, x, mask))

  def __feed_forward_block(self,x):
      return self.dropout_ffn(self.ffn(x))

  def forward(self, tgt, src, src_tgt_mask, self_mask=None):
      #tgt = self.layer_norm_self_attention(tgt + self.__self_attention_block(tgt, self_mask))
      tgt = tgt + self.__self_attention_block(tgt, self_mask)
      #x = self.layer_norm_src_tgt_attention(tgt + self.__src_tgt_attention_block(src, tgt, src_tgt_mask))
      x = tgt + self.__src_tgt_attention_block(src, tgt, src_tgt_mask)
      #x = self.layer_norm_ffn(x + self.__feed_forward_block(x))
      x = x + self.__feed_forward_block(x)
      return x


class TransformerDecoder(nn.Module):
  def  __init__(self, max_len, d_model, d_ff, heads_num, dropout_rate, layer_norm_eps, N, device):
      super().__init__()
      self.positional_encoding = PositionalEncoding(d_model, max_len, device)
      self.decoder_layers = nn.ModuleList(
          [TransformerDecoderLayer(d_model, d_ff, heads_num, dropout_rate, layer_norm_eps)
          for _ in range(N)]
      )

  def forward(self, src, tgt, src_tgt_mask, self_mask=None):
      tgt = self.positional_encoding(tgt)
      for decoder_layer in self.decoder_layers:
        tgt = decoder_layer(tgt, src, src_tgt_mask, self_mask=None)
      return tgt


class MyTransformer(nn.Module):
  def __init__(self, tgt_size, max_len_enc, max_len_dec, d_model_enc, d_model_dec, heads_num, d_ff, dropout_rate = 0.1, layer_norm_eps=1e-5, N=6, device=device):
      super().__init__()
      self.max_len_enc = max_len_enc
      self.max_len_dec = max_len_dec
      self.d_model_enc = d_model_enc
      self.d_model_dec = d_model_dec

      self.heads_num = heads_num
      self.d_ff = d_ff
      self.dropout_rate = dropout_rate
      self.layer_norm_eps = layer_norm_eps
      self.N = N
      self.device = device
      self.tgt_size = tgt_size

      self.encoder = TransformerEncoder(max_len_enc, d_model_enc, heads_num, d_ff, N=6, dropout_rate = 0.1, layer_norm_eps=1e-5, device=device)
      self.decoder = TransformerDecoder(max_len_dec, d_model_dec, heads_num, d_ff, N=6, dropout_rate = 0.1, layer_norm_eps=1e-5, device=device)
      self.linear = nn.Linear(d_model_dec, tgt_size)
  
  def _subsequent_mask(self, x):
      batch_size = x.size(0)
      max_len = x.size(1)
      mask =  torch.tril(torch.ones(batch_size, max_len, max_len)).eq(0).to(self.device)
      return  mask

  def _pad_mask(self, x):
      seq_len = x.size(1)
      d_model = x.size(2)
      x = x.squeeze(0)
      mask = [[np.nan for i in range(d_model)] for j in range(seq_len)]
      mask = torch.tensor(np.array(mask)).to(device=device)
      mask = x.eq(mask)
      mask = torch.any(mask == True, axis=1)
      mask.size()
      mask = torch.unsqueeze(mask,0)
      mask = torch.unsqueeze(mask, 0)
      mask = mask.repeat(1, seq_len, 1)
      mask.size()
      return mask.to(device=device)

  def forward(self, src, tgt):
      pad_mask_src = self._pad_mask(src)
      src = self.encoder(src, pad_mask_src)
      mask_self_attn = torch.logical_or(
            self._subsequent_mask(tgt), self._pad_mask(tgt))
      output = self.decoder(src, tgt, mask_self_attn, pad_mask_src)
      y = self.linear(output)
      return  output

In [None]:
class PredError(Exception):
  pass

class LossError(Exception):
  pass

device = "cuda" if torch.cuda.is_available() else "cpu"
model = MyTransformer(tgt_size=4, max_len_enc=60, max_len_dec=30, d_model_enc=4, d_model_dec=4, heads_num=2, d_ff=2048, dropout_rate=0.1, layer_norm_eps=0.0001, N=6, device=device)
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.MSELoss()
epochs = 10

for epoch  in range(epochs):
    model.train()
    model.to(device)
    count  = 0

    for (src, tgt) in zip(src_loader, tgt_loader):
      count += 1
      
      src = src.float().unsqueeze(0).to(device=device)
      tgt = tgt.float().unsqueeze(0).to(device=device)

      optimizer.zero_grad()

      if torch.isnan(src).any() or torch.isnan(tgt).any():
        continue
      
      else:

        if count >1:
          previous_pred = pred

        pred = model(src, tgt)

        if torch.isnan(pred).any():
          print(count)
          print(previous_pred)
          print(pred)
          print(pred.size())
          raise PredError('pred == nan')

        tgt_ = tgt[:, 1:, :]
        pred_ = pred[:, :-1, :]
        
        loss = criterion(tgt_, pred_)

        if torch.isnan(loss).any():
          print(count)
          print(previous_pred)
          print(pred)
          print(pred.size())
          raise LossError('loss == nan')

        loss.backward()

        optimizer.step()

    print(f"[train] epoch: {epoch+1}/{epochs}, epoch_loss: {loss.item():.3f}")

print(pred)



[train] epoch: 1/10, epoch_loss: 143498.547
[train] epoch: 2/10, epoch_loss: 16577.256
[train] epoch: 3/10, epoch_loss: 1213.323
[train] epoch: 4/10, epoch_loss: 860.718
[train] epoch: 5/10, epoch_loss: 841.663
[train] epoch: 6/10, epoch_loss: 849.093
[train] epoch: 7/10, epoch_loss: 843.712
[train] epoch: 8/10, epoch_loss: 841.792
[train] epoch: 9/10, epoch_loss: 846.968
[train] epoch: 10/10, epoch_loss: 840.146
tensor([[[3034.2700, 3043.5879, 3012.5332, 3042.2500],
         [3034.2781, 3058.5615, 3022.7634, 3046.7725],
         [3049.2825, 3063.3105, 3032.6416, 3061.8052],
         [3068.7014, 3093.8037, 3057.2126, 3087.2495],
         [3094.2871, 3092.9004, 3067.8398, 3086.8052],
         [3088.8179, 3088.3054, 3042.7634, 3068.1055],
         [3039.4036, 3099.0679, 3033.5059, 3097.4675],
         [3094.3760, 3128.5798, 3083.4949, 3126.5911],
         [3129.0056, 3138.8037, 3103.1868, 3136.1060],
         [3114.2781, 3118.8330, 3072.4265, 3098.0229],
         [3099.0022, 3124.0679, 3

In [None]:
src_pred = df[['Open','High','Low','Close']].iloc[-90:-30].values
tgt_pred = df[['Open','High','Low','Close']].iloc[-30:].values
src_pred = torch.tensor(src_pred).unsqueeze(0).float().to(device)
tgt_pred = torch.tensor(tgt_pred).unsqueeze(0).float().to(device)
src_of_src = torch.cat((src_pred, tgt_pred), dim=1)

pred_len =20
count = 0
model.eval()
for i in range(pred_len):
  count += 1
  pred_x = model(src_pred, tgt_pred)
  next_1 =  pred_x[:,-1:,:]
  src_of_src = torch.cat((src_of_src, next_1), 1)
  src_pred = src_of_src[:,count:count+60,:]
  tgt_pred = pred_x
  prediction_a = src_of_src[:,-count:,:]
  
prediction_a

tensor([[[2987.0603, 2986.6592, 2962.8809, 2984.4114],
         [2991.1206, 2990.3184, 2960.7617, 2986.8228],
         [2995.1809, 2993.9775, 2958.6426, 2989.2341],
         [2999.2412, 2997.6367, 2956.5234, 2991.6455],
         [3003.3015, 3001.2959, 2954.4043, 2994.0569],
         [3007.3618, 3004.9551, 2952.2852, 2996.4683],
         [3011.4221, 3008.6143, 2950.1660, 2998.8796],
         [3015.4824, 3012.2734, 2948.0469, 3001.2910],
         [3019.5427, 3015.9326, 2945.9277, 3003.7024],
         [3023.6030, 3019.5918, 2943.8086, 3006.1138],
         [3027.6633, 3023.2510, 2941.6895, 3008.5251],
         [3031.7236, 3026.9102, 2939.5703, 3010.9365],
         [3035.7839, 3030.5693, 2937.4512, 3013.3479],
         [3039.8442, 3034.2285, 2935.3320, 3015.7593],
         [3043.9045, 3037.8877, 2933.2129, 3018.1707],
         [3047.9648, 3041.5469, 2931.0938, 3020.5820],
         [3052.0251, 3045.2061, 2928.9746, 3022.9934],
         [3056.0854, 3048.8652, 2926.8555, 3025.4048],
         [

In [None]:
src_pred_b = df[['Open','High','Low','Close']].iloc[-61:-1].values
tgt_pred_b = df[['Open','High','Low','Close']].iloc[-1:].values
src_pred_b = torch.tensor(src_pred_b).unsqueeze(0).float().to(device)
tgt_pred_b = torch.tensor(tgt_pred_b).unsqueeze(0).float().to(device)
src_of_src_b = torch.cat((src_pred_b, tgt_pred_b), dim=1)
nans = np.zeros((1,29,4))
nans[:,:,:] = np.nan
nans = torch.tensor(nans).float().to(device)
tgt_pred_b = torch.cat((tgt_pred_b, nans), dim=1)
tgt_pred_b

In [None]:
pred_len = 20
count = 0
model.eval()
for i in range(pred_len+1):
  count += 1
  prediction_b = model(src_pred_b, tgt_pred_b)
  tgt_pred_b[:,count:count+1,:] = pred_x[:,count-1:count,:]

prediction_b = prediction_b[:,1:pred_len+1,:]

In [None]:
prediction_b

tensor([[[3115.2664, 3116.8428, 2965.4980, 3060.6389],
         [3095.2664, 3116.8428, 2960.4980, 3060.6389],
         [3080.2664, 3081.8428, 2944.4980, 3040.6389],
         [3065.2664, 3096.8428, 2935.4980, 3042.6389],
         [3071.2664, 3091.8428, 2941.4980, 3060.6389],
         [3095.2664, 3131.8428, 2941.4980, 3046.6389],
         [3082.2664, 3126.8428, 2947.4980, 3100.6389],
         [3170.2664, 3161.8428, 3010.4980, 3135.6389],
         [3170.2664, 3176.8428, 3020.4980, 3130.6389],
         [3160.2664, 3326.8428, 3030.4980, 3300.6389],
         [3235.2664, 3236.8428, 3060.4980, 3160.6389],
         [3160.2664, 3196.8428, 3020.4980, 3115.6389],
         [3150.2664, 3161.8428, 2985.4980, 3080.6389],
         [3120.2664, 3126.8428, 2990.4980, 3100.6389],
         [3120.2664, 3131.8428, 2980.4980, 3095.6389],
         [3120.2664, 3186.8428, 2990.4980, 3160.6389],
         [3185.2664, 3206.8428, 3040.4980, 3145.6389],
         [3185.2664, 3196.8428, 3050.4980, 3170.6389],
         [