In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import os

# Switch the working directory to the parent of the directory the kernel
# started in; the relative "./datasets/..." path used further down is resolved
# from there.
# The starting directory is captured only on the first execution, so re-running
# this cell always lands in the same place instead of climbing one more level
# each time it is executed.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_START_DIR))

In [3]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from hcrot import layers, optim
from hcrot.dataset import *
from hcrot.utils import *

In [4]:
# Training hyperparameters, in order:
#   lr_rate     - learning rate handed to the Adam optimizer
#   hidden_size - passed to the model as `hidden_dim` (the transformer's
#                 feed-forward width)
#   epochs      - number of passes of the training loop
lr_rate, hidden_size, epochs = 1e-3, 32, 10

In [5]:
# Read the CSV without treating its first row as a header. The label column
# used to be addressed by the name '7', i.e. the first sample's values were
# being consumed as column names and that sample was silently dropped.
# NOTE(review): this assumes the file has no header row (column 0 = digit
# label, remaining 784 columns = pixel intensities) - confirm against the file.
df = pd.read_csv("./datasets/mnist_test.csv", header=None)
label = df.iloc[:, 0].to_numpy()
df = df.iloc[:, 1:]

# Scale pixel intensities from [0, 255] down to [0, 1].
dat = (df / 255.).to_numpy()

# Reshape each flat 784-value row into a 28x28 image stored as float32.
dataset_len = len(dat)
dat = dat.reshape(dataset_len, 28, 28).astype(np.float32)

In [6]:
# First 5000 rows form the training set; rows 8001..9000 form the test set.
# The same index ranges are applied to images and labels so they stay aligned.
train_image, train_label = (arr[:5000] for arr in (dat, label))
test_image, test_label = (arr[8001:9001] for arr in (dat, label))

In [7]:
# Training batches: 50 samples each, with shuffling enabled.
dataloader = Dataloader(
    train_image,
    train_label,
    batch_size=50,
    shuffle=True,
)

# Evaluation batches: one sample at a time, shuffling disabled.
testloader = Dataloader(
    test_image,
    test_label,
    batch_size=1,
    shuffle=False,
)

In [8]:
def get_sinusoid_encoding_table(n_seq, d_hidn):
    """Build a sinusoidal positional-encoding table.

    Entry ``[pos, i]`` is ``sin(angle)`` for even ``i`` and ``cos(angle)`` for
    odd ``i``, where ``angle = pos / 10000 ** (2 * (i // 2) / d_hidn)``.

    Args:
        n_seq: Number of positions (rows) in the table.
        d_hidn: Encoding width (columns) of the table.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(n_seq, d_hidn)``. An empty
        ``(0, d_hidn)`` table is returned when ``n_seq`` is 0.
    """
    # refs: https://paul-hyun.github.io/transformer-01/
    # Column vector of positions and row vector of per-column divisors; the
    # division broadcasts them to the full (n_seq, d_hidn) grid of angles.
    positions = np.arange(n_seq, dtype=np.float64)[:, np.newaxis]
    column_index = np.arange(d_hidn)
    # Columns 2k and 2k + 1 share the same exponent 2k / d_hidn.
    divisors = np.power(10000.0, 2 * (column_index // 2) / d_hidn)

    sinusoid_table = positions / divisors
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # even index sin
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # odd index cos

    return sinusoid_table

In [9]:
class TransformerForClassification(layers.Module):
    """Transformer followed by a flatten + linear classification head.

    A fixed sinusoidal positional encoding is added to both the source and the
    target sequence before they are passed to ``layers.Transformer``. The
    transformer output is flattened and projected to ``num_classes`` logits.

    Args:
        embed_size: Feature width of each sequence element (``d_model``).
        num_heads: Number of attention heads (``nhead``).
        hidden_dim: Feed-forward width (``dim_feedforward``).
        num_layers: Used for both the encoder and the decoder layer count.
        num_classes: Number of output logits.
        seq_length: Number of rows in the positional-encoding table; the
            linear head expects ``seq_length * embed_size`` flattened inputs.
    """

    def __init__(self, embed_size=28, num_heads=7, hidden_dim=256, num_layers=2, num_classes=10, seq_length=28):
        super().__init__()
        self.embed_size = embed_size
        # Shape (1, seq_length, embed_size): the leading axis of size 1 lets the
        # table broadcast against the first axis of the inputs.
        self.positional_encoding = np.expand_dims(get_sinusoid_encoding_table(seq_length, embed_size), axis=0)
        self.transformer = layers.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_dim,
            batch_first=True
        )
        self.flatten = layers.Flatten()
        self.fc = layers.Linear(seq_length * embed_size, num_classes)

    def forward(self, src, tgt):
        """Return class logits for ``src`` given the decoder input ``tgt``.

        Args:
            src: Source array; axis 1 is treated as the sequence axis.
            tgt: Target (decoder input) array; axis 1 is treated as the
                sequence axis.

        Returns:
            The output of the final linear layer.
        """
        # Add the positional encoding to private copies. Doing `src += ...`
        # directly on the arguments would modify the caller's arrays, so a
        # batch that is a view into the dataset would accumulate encodings on
        # every call. Copy-then-add keeps the dtype and casting behaviour of
        # the in-place operator.
        src = src.copy()
        tgt = tgt.copy()
        src += self.positional_encoding[:, :src.shape[1], :]
        tgt += self.positional_encoding[:, :tgt.shape[1], :]
        output = self.transformer(src, tgt)
        flatted_output = self.flatten(output)
        out = self.fc(flatted_output)
        return out

In [10]:
# Model: only the feed-forward width is overridden; every other constructor
# argument keeps its default.
model = TransformerForClassification(
    hidden_dim=hidden_size,
)

# Loss function applied to the model's logits.
criterion = layers.CrossEntropyLoss()

# Adam optimizer bound to the model, using the configured learning rate.
optimizer = optim.Adam(
    model,
    lr_rate=lr_rate,
)

In [11]:
# Train for `epochs` passes; after each pass, measure accuracy on the test loader.
for epoch in range(epochs):
    # Running sums for this epoch: training loss, correct test predictions,
    # and the number of test samples actually evaluated.
    total_loss, correct, evaluated = 0, 0, 0

    model.train()
    for images, labels in dataloader:
        tgt = np.zeros_like(images)  # dummy decoder input (all zeros)
        logits = model(images, tgt)
        loss = criterion(logits, labels)

        total_loss += loss.item()
        # Backpropagate from the loss and let the optimizer apply the update.
        dz = criterion.backward()
        optimizer.update(dz)

    model.eval()
    for images, labels in testloader:
        tgt = np.zeros_like(images)  # dummy decoder input (all zeros)
        logits = model(images, tgt)
        predictions = np.argmax(logits, axis=-1)
        # Count matches element-wise so the accuracy is correct for any test
        # batch size, not only for batches holding a single sample.
        matches = predictions == labels
        correct += int(matches.sum())
        evaluated += matches.size

    # max(..., 1) avoids a division by zero when the test loader is empty.
    print(f'Epoch [{epoch+1} / {epochs}] | Loss: {total_loss / len(dataloader):.3f} | Acc: {correct / max(evaluated, 1):.3f}')

Epoch [1 / 10] | Loss: 1.693 | Acc: 0.634
Epoch [2 / 10] | Loss: 0.883 | Acc: 0.828
Epoch [3 / 10] | Loss: 0.572 | Acc: 0.890
Epoch [4 / 10] | Loss: 0.437 | Acc: 0.916
Epoch [5 / 10] | Loss: 0.362 | Acc: 0.911
Epoch [6 / 10] | Loss: 0.316 | Acc: 0.921
Epoch [7 / 10] | Loss: 0.282 | Acc: 0.929
Epoch [8 / 10] | Loss: 0.254 | Acc: 0.949
Epoch [9 / 10] | Loss: 0.234 | Acc: 0.949
Epoch [10 / 10] | Loss: 0.219 | Acc: 0.944
