In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [24]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

In [25]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads 
        self.head_dim = d_model // num_heads

        assert self.head_dim * self.num_heads == self.d_model, "Embedding size needs to be divisible by num_heads!"

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)


        # self.qkv_linear = nn.Linear(d_model, d_model * 3)

        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value):
        # N = query.shape[0]
        # QKV = self.qkv_linear(torch.cat([query, key, value], dim=0))
        # Q, K, V = torch.split(QKV, [N, N, N], dim=0)

        # Q = Q.reshape(N, -1, self.num_heads, self.head_dim)
        # K = K.reshape(N, -1, self.num_heads, self.head_dim)
        # V = V.reshape(N, -1, self.num_heads, self.head_dim)

        Q = self.q_linear(query)
        K = self.k_linear(key)
        V = self.v_linear(value)

        Q = Q.view(Q.shape[0], -1, self.num_heads, self.head_dim)
        K = K.view(K.shape[0], -1, self.num_heads, self.head_dim)
        V = V.view(V.shape[0], -1, self.num_heads, self.head_dim)

        attention = torch.einsum("nqhd,nkhd->nhqk", [Q, K]) / (self.head_dim ** 0.5)
        attention = F.softmax(attention, dim=-1)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, V]).reshape(Q.shape[0], -1, self.d_model)

        return self.fc_out(out), attention



In [50]:
class ProbSparseMultiHeadAttention(MultiHeadAttention):
    def __init__(self, d_model, num_heads, attention_ratio=0.25):
        super().__init__(d_model, num_heads)
        self.attention_ratio = attention_ratio

    def forward(self,query, key, value, mask):
        # N = query.shape[0]
        # QKV = self.qkv_linear(torch.cat([query, key, value], dim=0))
        # Q, K, V = torch.split(QKV, [N, N, N], dim=0)

        # Q = Q.reshape(N, -1, self.num_heads, self.head_dim)
        # K = K.reshape(N, -1, self.num_heads, self.head_dim)
        # V = V.reshape(N, -1, self.num_heads, self.head_dim)

        batch_size = query.size(0)

        Q = self.q_linear(query)
        K = self.k_linear(key)
        V = self.v_linear(value)

        Q = Q.view(Q.shape[0], -1, self.num_heads, self.head_dim)
        K = K.view(K.shape[0], -1, self.num_heads, self.head_dim)
        V = V.view(V.shape[0], -1, self.num_heads, self.head_dim)

        scaled_attention_logits = torch.einsum("nqhd,nkhd->nhqk", [Q, K]) / (self.head_dim ** 0.5)
        top_val, top_idx = torch.topk(scaled_attention_logits,
                                      int(self.attention_ratio * scaled_attention_logits.shape[-1]), 
                                      sorted=False)
        attention = torch.zeros_like(scaled_attention_logits).scatter_(-1, top_idx, top_val)

        # seq_Q, seq_K = attention.size(-2), attention.size(-1)
        # mask = torch.triu(torch.ones(seq_Q, seq_K), diagonal=1)
        attention += (mask * -1e9)

        attention = F.softmax(attention, dim=-1)

        out = torch.einsum("bhql,blhd->bqhd", [attention, V]).reshape(batch_size, -1, self.d_model)
        return self.fc_out(out), attention

In [51]:
mha = ProbSparseMultiHeadAttention(512, 8).to(device)

batch_size = 32
seq_len = 100
d_model = 512

query = torch.rand(batch_size, seq_len, d_model).to(device)
key   = torch.rand(batch_size, seq_len, d_model).to(device)
value = torch.rand(batch_size, seq_len, d_model).to(device)

output, attention_weights = mha(query, key, value)

"""
print(f'Output shape: {output.shape}')
print(f'Attneion weight shape: {attention_weights.shape}')

q_linear = nn.Linear(d_model, d_model)
k_linear = nn.Linear(d_model, d_model)
v_linear = nn.Linear(d_model, d_model)

qkv_linear = nn.Linear(d_model, d_model * 3)

Q = q_linear(query)
K = k_linear(key)
V = v_linear(value)

N = query.shape[0]

QKV = qkv_linear(query)
Q2, K2, V2 = torch.split(QKV, split_size_or_sections=d_model, dim=-1)

n_heads = 8
head_dim = d_model // n_heads

Q = Q.view(Q.shape[0], -1, n_heads, head_dim)
K = K.view(K.shape[0], -1, n_heads, head_dim)
V = V.view(V.shape[0], -1, n_heads, head_dim)

print(f'Q shape: {Q.shape}')
print(f'K shape: {K.shape}')
print(f'V shape: {V.shape}')

scaled_attention_logits = torch.einsum("nqhd,nkhd->nhqk", [Q, K]) / (head_dim ** 0.5)
print(f"scaled_attention_logits shape: {scaled_attention_logits.shape}")

top_val, top_idx = torch.topk(scaled_attention_logits, int(0.25 * scaled_attention_logits.shape[-1]), sorted=False)
print(f'top_val shape: {top_val.shape}')
print(f'top_idx shape: {top_idx.shape}')

attention = torch.zeros_like(scaled_attention_logits).scatter_(-1, top_idx, top_val)
print(f'attention shape: {attention.shape}')

attention = F.softmax(attention, dim=-1)
print(f'softmax attention shape: {attention.shape}')
"""

TypeError: ProbSparseMultiHeadAttention.forward() missing 1 required positional argument: 'mask'

In [52]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

In [53]:
d_model = 512
max_len = 5000
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.)) / d_model))
print(f'pe.shape: {pe.shape}')
print(f'position.shape: {position.shape}')
print(f'div_term.shape: {div_term.shape}')

pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
print(f'pe.shape: {pe.shape}')

pe.shape: torch.Size([5000, 512])
position.shape: torch.Size([5000, 1])
div_term.shape: torch.Size([256])
pe.shape: torch.Size([5000, 1, 512])


In [54]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

d_model = 512
d_ff = 2048
dropout = 0.1

positionwise_feedforward = PositionwiseFeedForward(d_model, d_ff, dropout)
x = torch.rand(32, 100, d_model)

output = positionwise_feedforward(x)
print(output.shape)

torch.Size([32, 100, 512])


In [55]:
class InformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, dropout=0.1):
        super().__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        # self.feed_forward = nn.Sequential(
        #     nn.Linear(d_model, dim_feedforward),
        #     nn.ReLU(),
        #     nn.Linear(dim_feedforward, d_model)
        # )
        self.feed_forward = PositionwiseFeedForward(d_model, dim_feedforward, dropout)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        attn_output, _ = self.self_attn(x, x, x)
        x = x + self.dropout(attn_output)
        x = self.layernorm1(x)

        ff_output = self.feed_forward(x)
        x = x + self.dropout(ff_output)
        x = self.layernorm2(x)

        return x


In [57]:
class InformerEncoder(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, num_layers, dropout=0.1):
        super().__init__()

        self.encoder_layers = nn.ModuleList([
            InformerEncoderLayer(d_model, num_heads, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])

        self.pos_encoder = PositionalEncoding(d_model)

    def forward(self, x, mask):
        x = self.pos_encoder(x)

        for layer in self.encoder_layers:
            x = layer(x)

        return x


num_layers = 3

encoder_layer = InformerEncoderLayer(512, 8, 2048, 0.1)
informer_encoder = InformerEncoder(d_model=512, num_heads=8, dim_feedforward=2048, num_layers=num_layers, dropout=0.1)

src = torch.rand(32, 100, 512)

mask = torch.triu(torch.ones(100, 100), diagonal=1).to(device)
output = informer_encoder(src, mask)
print(output.shape)

torch.Size([32, 100, 512])


In [66]:
class InformerDecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, dropout=0.1):
        super().__init__()
        # self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.self_attn = ProbSparseMultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, dim_feedforward, dropout)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.layernorm3 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, memory):
        attn_output, _ = self.self_attn(x, x, x, mask)
        x = x + self.dropout(attn_output)
        x = self.layernorm1(x)

        attn_output, _ = self.cross_attn(x, memory, memory)
        x = x + self.dropout(attn_output)
        x = self.layernorm2(x)

        ff_output = self.feed_forward(x)
        x = x + self.dropout(ff_output)
        x = self.layernorm3(x)

        return x

In [67]:
class InformerDecoder(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, num_layers, dropout=0.1):
        super().__init__()
        self.decoder_layers = nn.ModuleList([
            InformerDecoderLayer(d_model, num_heads, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])

        self.pos_encoder = PositionalEncoding(d_model)

    def forward(self, x, memory):
        x = self.pos_encoder(x)

        for layer in self.decoder_layers:
            x = layer(x, memory)

        return x

num_layers = 3
informer_decoder = InformerDecoder(d_model=512, num_heads=8, dim_feedforward=2048, num_layers=num_layers, dropout=0.1)

x = torch.rand(32, 100, 512)
memory = torch.rand(32, 100, 512)

output = informer_decoder(x, memory)
print(output.shape)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, mps:0 and cpu!

In [60]:
class Informer(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, num_encoder_layers, num_decoder_layers, dropout=0.1):
        super().__init__()

        self.encoder = InformerEncoder(d_model, num_heads, dim_feedforward, num_encoder_layers, dropout)
        self.decoder = InformerDecoder(d_model, num_heads, dim_feedforward, num_decoder_layers, dropout)
        self.output_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y):
        memory = self.encoder(x)
        output = self.decoder(y, memory)
        output = self.output_layer(output)
        return output

informer_model = Informer(d_model=512, num_heads=8, dim_feedforward=2048, num_encoder_layers=3, num_decoder_layers=3, dropout=0.1)

x = torch.rand(32, 100, 512)
y = torch.rand(32, 100, 512)

output = informer_model(x, y)
print(output.shape)

TypeError: InformerEncoder.forward() missing 1 required positional argument: 'mask'

In [61]:
import numpy as np
import pandas as pd
from typing import Tuple 
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

data_root = Path("~/Work/Crypto/spot/monthly/klines")
yearmon = "2023-07"

def load_raw_data(symbol):
    filename = data_root / symbol / '1m' / f'{symbol}-1m-{yearmon}.csv'
    df = pd.read_csv(filename,
                     header=None,
                     names=[
                        "open_time",
                        "open",
                        "high",
                        "low",
                        "close",
                        "volume",
                        "close_time",
                        "quote_volume",
                        "n_trades",
                        "taker_buy_base_volume",
                        "taker_buy_quote_volume",
                        "ignore",
                     ])

    df.set_index("open_time", inplace=True)
    df.sort_index(inplace=True)
    return df

df1 = load_raw_data("BTCUSDT")
df2 = load_raw_data("ETHUSDT")

df = pd.DataFrame({
    "BTC": np.log(df1["open"]).diff(),
    "ETH": np.log(df2["open"]).diff(),
}).dropna(how="any")

tensor_data: torch.Tensor = torch.FloatTensor(df.values)

class TimeSeriesDataset(Dataset):
    def __init__(self, data: torch.Tensor, src_len: int, tgt_len: int) -> None:
        self.data = data
        self.src_len = src_len
        self.tgt_len = tgt_len 
        self.total_len = src_len + tgt_len 

    def __len__(self) -> int:
        return len(self.data) - self.total_len + 1

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        start = idx 
        end = idx + self.total_len
        sample = self.data[start:end]
        src = sample[:self.src_len]
        tgt = sample[self.src_len:]
        return src, tgt

src_seq_len: int = 60 * 48
tgt_seq_len: int = 60
batch_szie:  int = 32

custom_dataset = TimeSeriesDataset(tensor_data, src_seq_len, tgt_seq_len)
train_loader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)


In [62]:
%%time

model = Informer(d_model=2, num_heads=2, dim_feedforward=2048, num_encoder_layers=3, num_decoder_layers=3, dropout=0.1)
model.to("mps")
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),  lr=1e-4)

epochs = 10
for epoch in range(epochs):
    for i, (x, y) in enumerate(train_loader):
        x, y = x.to("mps"), y.to("mps")
        outputs = model(x, y)

        loss = criterion(outputs, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch [{epoch+1}/{epochs}], Batch [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")


TypeError: InformerEncoder.forward() missing 1 required positional argument: 'mask'

In [17]:
x.shape

torch.Size([32, 2880, 2])