In [429]:
import torch
from torch import nn, FloatTensor, Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from torch.optim import AdamW
import bank_dataset
import importlib
from tqdm.notebook import tqdm
import math

importlib.reload(bank_dataset)

torch.manual_seed(42)

<torch._C.Generator at 0x1243ccb10>

In [430]:
bank_dataset = bank_dataset.BankDataset("./bank_dataset.csv")
bank_dataset_train, bank_dataset_test = random_split(bank_dataset, [0.7, 0.3])
bank_dataloader_train = DataLoader(bank_dataset_train, batch_size=1, shuffle=True)
bank_dataloader_test = DataLoader(bank_dataset_test, batch_size=1, shuffle=True)

In [431]:
class AttentionHead(nn.Module):
    def __init__(self, embedding_size: int,  head_size: int):
        super().__init__()
        self.q = nn.Linear(embedding_size, head_size, bias=False)
        self.k = nn.Linear(embedding_size, head_size, bias=False)
        self.v = nn.Linear(embedding_size, head_size, bias=False)
        self.head_size = head_size

    def forward(self, X: Tensor): 
        keys = self.k(X)
        queries = self.q(X)
        values = self.v(X)

        scores = queries.matmul(keys.T)
        scores = scores / torch.sqrt(torch.tensor(self.head_size))
        scores = F.softmax(scores, dim=-1)
        res = scores.matmul(values)

        return res

In [432]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, n_heads):
        super().__init__()
        assert embedding_dim % n_heads == 0

        head_size = embedding_dim // n_heads
        self.ath = nn.ModuleList([AttentionHead(embedding_dim, head_size) for _ in range(n_heads)])
    
    def forward(self, X):
        X = [ath(X) for ath in self.ath]
        X = torch.cat(X, dim=1)

        return X

In [433]:
class ResidualAdd(nn.Module):
    def __init__(self, inner: nn.Module):
        super().__init__()
        self.inner = inner

    def forward(self, X):
        r = self.inner(X)
        return r+X

In [434]:
class Transformer(nn.Module):
    def __init__(self, context_size, embedding_dim, n_heads):
        super().__init__()
        self.layers = nn.Sequential(
            ResidualAdd(MultiHeadAttention(embedding_dim, n_heads)),
            nn.LayerNorm(embedding_dim),
            ResidualAdd(
                nn.Sequential(
                    nn.Linear(embedding_dim, embedding_dim * 3),
                    nn.SiLU(),
                    nn.Linear(embedding_dim * 3, embedding_dim),
                )
            ),
            nn.LayerNorm(embedding_dim),
        )


    def forward(self, X):
        return self.layers(X)

In [435]:
# https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.one_hot.html
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [436]:
class BertProMax(nn.Module):
    def __init__(self, context_size, n_embeddings, embedding_dim, transformer_count, n_heads, n_classes):
        super().__init__()
        self.embeddings = nn.Embedding(n_embeddings, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim, context_size, 0.1)
        self.layers = nn.Sequential(
            *(Transformer(context_size, embedding_dim, n_heads) for _ in range(transformer_count)),
            nn.Linear(embedding_dim, n_classes),
        )
        
    def forward(self, X):
        X = self.embeddings(X) 
        # X += self.positional_encoding(torch.arange(0, self.context_size))
        X = self.positional_encoding(X)
        X = self.layers(X)
        X = X.mean(dim=0)
        return X

In [437]:
model = BertProMax(
    context_size=bank_dataset.context_window_size,
    n_embeddings=bank_dataset.unique_word_count,
    embedding_dim=4,
    transformer_count=1,
    n_heads=1,
    n_classes=3,
)
f"Num parameters: {len(list(model.parameters()))}"

'Num parameters: 14'

In [438]:
optimizer = AdamW(model.parameters())
loss_fn = nn.CrossEntropyLoss()

In [439]:
for i in tqdm(range(100)):
    loss_total = 0
    for x, y in bank_dataloader_train:
        optimizer.zero_grad()
        y_pred = model(x[0])
        loss = loss_fn(y_pred, y[0])
        loss_total += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {i+1}: {loss_total / len(bank_dataloader_train)}")
model.eval()

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1: 1.131457233239734
Epoch 2: 1.1008698675367568
Epoch 3: 1.0714089069416914
Epoch 4: 0.9306960435456069
Epoch 5: 0.7637306719072281
Epoch 6: 0.6266702467960025
Epoch 7: 0.5917177373770053
Epoch 8: 0.5971649657323878
Epoch 9: 0.5423659943832607
Epoch 10: 0.5082604631820999
Epoch 11: 0.43663948488535076
Epoch 12: 0.4880524722701579
Epoch 13: 0.35839014917257284
Epoch 14: 0.4623039522745385
Epoch 15: 0.3249903495605818
Epoch 16: 0.3417173612951523
Epoch 17: 0.35367207175426224
Epoch 18: 0.3063786306157314
Epoch 19: 0.29107093656800254
Epoch 20: 0.1911649764564736
Epoch 21: 0.23931706377191755
Epoch 22: 0.2590085105876877
Epoch 23: 0.15993791033154126
Epoch 24: 0.13320601594550585
Epoch 25: 0.15863483605620565
Epoch 26: 0.14448339625128678
Epoch 27: 0.18645128893429444
Epoch 28: 0.09056400069129214
Epoch 29: 0.07143258584278916
Epoch 30: 0.08710216058827148
Epoch 31: 0.13949890564708287
Epoch 32: 0.11339520558680373
Epoch 33: 0.19753577679681478
Epoch 34: 0.10590118887186267
Epoch 3

BertProMax(
  (embeddings): Embedding(651, 4)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layers): Sequential(
    (0): Transformer(
      (layers): Sequential(
        (0): ResidualAdd(
          (inner): MultiHeadAttention(
            (ath): ModuleList(
              (0): AttentionHead(
                (q): Linear(in_features=4, out_features=4, bias=False)
                (k): Linear(in_features=4, out_features=4, bias=False)
                (v): Linear(in_features=4, out_features=4, bias=False)
              )
            )
          )
        )
        (1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
        (2): ResidualAdd(
          (inner): Sequential(
            (0): Linear(in_features=4, out_features=12, bias=True)
            (1): SiLU()
            (2): Linear(in_features=12, out_features=4, bias=True)
          )
        )
        (3): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
      )
    )
    (1): L

In [440]:
@torch.no_grad
def eval(index: int):
    x, y = bank_dataset.__getitem__(index)
    y_pred = model(x)
    s = F.softmax(y_pred, dim=-1) 
    highest_class = torch.argmax(s)
    print(bank_dataset.df["Satz"][index])
    print(f"Predicted: {bank_dataset.index2label(highest_class.item())} ({s[highest_class].item()*100}%)")
    print(f"True:      {bank_dataset.index2label(y.item())}")

In [441]:
@torch.no_grad
def loss_eval(index: int):
    x, y = bank_dataset.__getitem__(index)
    y_pred = model(x)
    return loss_fn(y_pred, y)

In [442]:
all_losses = torch.tensor([loss_eval(index) for index in range(len(bank_dataset))])

In [443]:
eval(all_losses.argmax().item())

In der Bank herrschte Hochbetrieb, weil viele ihre Steuerbescheide einzahlen wollten.
Predicted: Sitzbank (99.48650002479553%)
True:      Geldbank


In [444]:
with torch.no_grad():
    total_val_loss = 0
    for x, y in bank_dataloader_test:
        y_pred = model(x[0])
        loss = loss_fn(y_pred, y[0])
        total_val_loss += loss
    avg_val_loss = total_val_loss / len(bank_dataloader_test)

print(f"Val Loss: {avg_val_loss}")

Val Loss: 0.1953512728214264
