In [922]:
import torch
from torch import nn, FloatTensor, Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from torch.optim import AdamW
import bank_dataset
import importlib
from tqdm.notebook import tqdm
from torchinfo import summary
import math

importlib.reload(bank_dataset)

torch.manual_seed(43);

In [923]:
bank_dataset = bank_dataset.BankDataset("./bank_dataset.csv")
bank_dataset_train, bank_dataset_test = random_split(bank_dataset, [0.8, 0.2])
bank_dataloader_train = DataLoader(bank_dataset_train, batch_size=4, shuffle=True)
bank_dataloader_test = DataLoader(bank_dataset_test, batch_size=1, shuffle=True)

In [None]:
class AttentionHead(nn.Module):
    def __init__(self, embedding_size: int, head_size: int, causal_masking=True):
        super().__init__()
        self.q = nn.Linear(embedding_size, head_size, bias=False)
        self.k = nn.Linear(embedding_size, head_size, bias=False)
        self.v = nn.Linear(embedding_size, head_size, bias=False)
        self.head_size = head_size
        self.causal_masking = causal_masking

    def forward(self, X: Tensor):
        b, c, e = X.shape
        keys: Tensor = self.k(X)
        queries: Tensor = self.q(X)
        values = self.v(X)

        scores = queries.matmul(keys.transpose(-2, -1))
        scores = scores / torch.sqrt(torch.tensor(self.head_size))

        if self.causal_masking:
            mask = torch.tril(torch.ones(c, c))
            mask = mask.masked_fill(mask == 0, float("-inf"))
            scores += mask.unsqueeze(0)

        scores = F.softmax(scores, dim=-1)

        res = scores.matmul(values)

        return res

In [925]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, n_heads):
        super().__init__()
        assert embedding_dim % n_heads == 0

        head_size = embedding_dim // n_heads
        self.ath = nn.ModuleList([AttentionHead(embedding_dim, head_size) for _ in range(n_heads)])
    
    def forward(self, X):
        X = [ath(X) for ath in self.ath]
        X = torch.cat(X, dim=-1)

        return X

In [926]:
class ResidualAdd(nn.Module):
    def __init__(self, inner: nn.Module):
        super().__init__()
        self.inner = inner

    def forward(self, X):
        r = self.inner(X)
        return r+X

In [927]:
class Transformer(nn.Module):
    def __init__(self, embedding_dim, n_heads):
        super().__init__()
        self.layers = nn.Sequential(
            nn.LayerNorm(embedding_dim),
            ResidualAdd(MultiHeadAttention(embedding_dim, n_heads)),
            nn.LayerNorm(embedding_dim),
            ResidualAdd(
                nn.Sequential(
                    nn.Linear(embedding_dim, embedding_dim * 3),
                    nn.ReLU(),
                    nn.Linear(embedding_dim * 3, embedding_dim),
                )
            ),
        )


    def forward(self, X):
        return self.layers(X)

In [928]:
# https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.one_hot.html
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor):
        # x shape: (batch_size, seq_len, embedding_dim)
        seq_len = x.size(1)  # Changed from x.size(0) to x.size(1)
        x = x + self.pe[:seq_len, :].unsqueeze(0)  # Add batch dimension
        return self.dropout(x)


In [929]:
class BertProMax(nn.Module):
    def __init__(self, n_embeddings, embedding_dim, transformer_count, n_heads, n_classes):
        super().__init__()
        self.embeddings = nn.Embedding(n_embeddings, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim)
        self.layers = nn.Sequential(
            *(Transformer(embedding_dim, n_heads) for _ in range(transformer_count)),
            nn.Linear(embedding_dim, n_classes),
        )
        
    def forward(self, X):
        X = self.embeddings(X) 
        X = self.positional_encoding(X)
        X = self.layers(X)
        X = X.mean(dim=1)
        return X

In [930]:
model = BertProMax(
    n_embeddings=bank_dataset.unique_word_count,
    embedding_dim=4,
    transformer_count=1,
    n_heads=2,
    n_classes=3,
)

summary(model, input=(26,), dtypes=[torch.int])

Layer (type:depth-idx)                                  Param #
BertProMax                                              --
├─Embedding: 1-1                                        2,604
├─PositionalEncoding: 1-2                               --
│    └─Dropout: 2-1                                     --
├─Sequential: 1-3                                       --
│    └─Transformer: 2-2                                 --
│    │    └─Sequential: 3-1                             176
│    └─Linear: 2-3                                      15
Total params: 2,795
Trainable params: 2,795
Non-trainable params: 0

In [931]:
optimizer = AdamW(model.parameters())
loss_fn = nn.CrossEntropyLoss()

In [932]:
for i in tqdm(range(200)):
    loss_total = 0
    for x, y in bank_dataloader_train:
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss_total += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {i+1}: {loss_total / len(bank_dataloader_train)}")
model.eval();

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch 1: 1.1692737716215629
Epoch 2: 1.1405173065485779
Epoch 3: 1.1227147181828816
Epoch 4: 1.1160998256118209
Epoch 5: 1.1036120178522888
Epoch 6: 1.0951361832795319
Epoch 7: 1.094039919199767
Epoch 8: 1.085152174587603
Epoch 9: 1.0774236343525074
Epoch 10: 1.066128549752412
Epoch 11: 1.0393584812128986
Epoch 12: 1.0183351702160306
Epoch 13: 0.9670636102005288
Epoch 14: 0.9318198374024144
Epoch 15: 0.8665904794578199
Epoch 16: 0.8039727669071268
Epoch 17: 0.7458708480828338
Epoch 18: 0.7052695546989087
Epoch 19: 0.6857538648225643
Epoch 20: 0.6781817903673207
Epoch 21: 0.6434855926092025
Epoch 22: 0.6040636375546455
Epoch 23: 0.5482117403988485
Epoch 24: 0.5985728360160634
Epoch 25: 0.5627950305740038
Epoch 26: 0.5284068460265795
Epoch 27: 0.5116041174365414
Epoch 28: 0.5089780762791634
Epoch 29: 0.48604126302180467
Epoch 30: 0.49497893425049605
Epoch 31: 0.4790869417289893
Epoch 32: 0.45055737194639667
Epoch 33: 0.45813870524849604
Epoch 34: 0.43122081458568573
Epoch 35: 0.441584350

In [933]:
@torch.no_grad
def eval(index: int):
    x, y = bank_dataset.__getitem__(index)
    y_pred = model(x.unsqueeze(0))
    s = F.softmax(y_pred, dim=-1) 
    highest_class = torch.argmax(s)
    print(bank_dataset.df["Satz"][index])
    print(f"Predicted: {bank_dataset.index2label(highest_class.item())} ({s[0][highest_class].item()*100}%)")
    print(f"True:      {bank_dataset.index2label(y.item())}")

In [934]:
@torch.no_grad
def loss_eval(index: int):
    x, y = bank_dataset.__getitem__(index)
    y_pred = model(x.unsqueeze(0))
    return loss_fn(y_pred, y.unsqueeze(0))

In [935]:
all_losses = torch.tensor([loss_eval(index) for index in range(len(bank_dataset))])

In [936]:
eval(all_losses.argmax().item())

Die Bank verschickte ein Schreiben, das über neue Datenschutzrichtlinien informierte.
Predicted: Flussbank (99.8314380645752%)
True:      Geldbank


In [None]:
with torch.no_grad():
    total_val_loss = 0
    for x, y in bank_dataloader_test:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        total_val_loss += loss
    avg_val_loss = total_val_loss / len(bank_dataloader_test)

print(f"Val Loss: {avg_val_loss}")

tensor([197, 558, 410, 118, 388, 549, 523,  20, 113, 372, 617, 193,   2,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])
Val Loss: 0.0
