In [3]:
!pip install tiktoken
!gdown 1WTjIveEsM7XpN28xm6F1qgX57QxaoFI_

!gdown 1WbyeG8f-V7VmpKdQam-0tNg4x6XDWoML

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0
Downloading...
From: https://drive.google.com/uc?id=1WTjIveEsM7XpN28xm6F1qgX57QxaoFI_
To: /content/train_set.csv
100% 524k/524k [00:00<00:00, 94.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1WbyeG8f-V7VmpKdQam-0tNg4x6XDWoML
To: /content/validation_set.csv
100% 112k/112k [00:00<00:00, 99.2MB/s]


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tiktoken
import pandas as pd
from sklearn.model_selection import train_test_split

In [14]:
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.titles = dataframe['title'].str.lower().values
        self.labels = dataframe['label_numeric'].values
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        title = self.titles[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode(title)
        input_ids = torch.tensor(encoding, dtype=torch.long)
        return input_ids, label


In [6]:
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    max_length = max(len(ids) for ids in input_ids)
    input_ids = torch.stack([torch.cat([ids, torch.zeros(max_length - len(ids), dtype=torch.long)]) for ids in input_ids])
    labels = torch.tensor(labels, dtype=torch.long)
    return input_ids, labels

In [28]:
# rotary embedding
def apply_rotary_pos_emb(q, k, d_k, seq_len):
    # q, k: (..., seq_len, d_model)
    thetas = 10000 ** (torch.arange(0, d_k, 2).float() / d_k) # (d_k / 2, )
    ms = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1) # (max_len, 1)
    pos = ms * thetas # (max_len, d_model / 2)
    cos = torch.cos(pos) # (max_len, d_k / 2)
    sin = torch.sin(pos) # (max_len, d_k / 2)
    q1, q2 = q[..., 0::2], q[..., 1::2]
    q_rot = torch.cat([q1 * cos - q2 * sin, q2 * cos + q1 * sin], dim=-1)
    k1, k2 = k[..., 0::2], k[..., 1::2]
    k_rot = torch.cat([k1 * cos - k2 * sin, k2 * cos + k1 * sin], dim=-1)
    return q_rot, k_rot

In [30]:
# sparse attention
def create_sparse_mask(seq_len, block_size=3):
    mask = torch.ones(seq_len, seq_len, dtype=torch.bool)
    for i in range(seq_len):
        mask[i, max(0, i - block_size):min(seq_len, i + block_size + 1)] = 0
    return mask

In [42]:
# Transformer Encoder Layer with Rotary Position Embedding
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.dense = nn.Linear(d_model, d_model)

        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x):
        batch_size = x.shape[0]
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(1, 2)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        seq_len = q.size(2)
        matmul_qk = torch.matmul(q, k.transpose(-2, -1))
        dk = torch.tensor(k.size(-1), dtype=torch.float32)
        scaled_attention_logits = matmul_qk / torch.sqrt(dk)

        b = -torch.log(torch.tensor(seq_len, dtype=torch.float32)) # b = -log(n)

        if mask is not None:
            scaled_attention_logits = scaled_attention_logits.masked_fill(mask == 0, -1e9)

        #attention_weights = torch.nn.functional.softmax(scaled_attention_logits, dim=-1)

        attention_weights = torch.sigmoid(scaled_attention_logits + b) # sigmoid attention
        output = torch.matmul(attention_weights, v)

        return output, attention_weights

    def forward(self, x, mask=None):
        batch_size = x.size(0)
        seq_len = x.size(1)

        q = self.split_heads(self.wq(x))
        k = self.split_heads(self.wk(x))
        v = self.split_heads(self.wv(x))

        q_rot, k_rot = apply_rotary_pos_emb(q, k, self.d_model // self.num_heads, seq_len)
        sparse_mask = create_sparse_mask(seq_len)

        scaled_attention, _ = self.scaled_dot_product_attention(q_rot, k_rot, v, sparse_mask)

        scaled_attention = scaled_attention.transpose(1, 2).contiguous()
        concat_attention = scaled_attention.view(batch_size, -1, self.d_model)

        attn_output = self.dense(concat_attention)

        x = self.layernorm1(x + self.dropout(attn_output))

        ff_output = self.feed_forward(x)

        x = self.layernorm2(x + self.dropout(ff_output))

        return x

In [43]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(d_model, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        x = self.embedding(x)  # (batch_size, seq_len, embed_size)
        for layer in self.encoder_layers:
            x = layer(x, mask)  # (batch_size, seq_len, d_model)
        x = x.mean(dim=1)  # (batch_size, d_model)
        x = self.fc(self.dropout(x))  # (batch_size, output_size)
        return x


In [44]:
train_df = pd.read_csv('/content/train_set.csv')
validation_df = pd.read_csv('/content/validation_set.csv')

tokenizer = tiktoken.get_encoding('gpt2')
train_dataset = TextDataset(train_df, tokenizer)
val_dataset = TextDataset(validation_df, tokenizer)


In [45]:
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [46]:
vocab_size = tokenizer.n_vocab
embed_size = 256
d_model = 256
num_heads = 8
d_ff = 512
output_size = len(train_df['label_numeric'].unique())
num_layers = 2
dropout = 0.1

model = TransformerModel(vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout)

num_epochs = 20
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [47]:
for epoch in range(30):
    model.train()
    for input_ids, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, labels in val_dataloader:
            outputs = model(input_ids)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Validation Accuracy after Epoch {epoch+1}: {accuracy:.2f}%")

Epoch 1/20, Loss: 1.1877806186676025
Validation Accuracy after Epoch 1: 68.28%
Epoch 2/20, Loss: 0.5843272805213928
Validation Accuracy after Epoch 2: 77.10%
Epoch 3/20, Loss: 0.4231613874435425
Validation Accuracy after Epoch 3: 77.52%
Epoch 4/20, Loss: 0.3301980197429657
Validation Accuracy after Epoch 4: 77.52%
Epoch 5/20, Loss: 0.3990277945995331
Validation Accuracy after Epoch 5: 74.85%
Epoch 6/20, Loss: 0.5307866334915161
Validation Accuracy after Epoch 6: 77.21%
Epoch 7/20, Loss: 0.3075634241104126
Validation Accuracy after Epoch 7: 75.98%
Epoch 8/20, Loss: 0.20621258020401
Validation Accuracy after Epoch 8: 74.85%
Epoch 9/20, Loss: 0.1858457624912262
Validation Accuracy after Epoch 9: 78.85%
Epoch 10/20, Loss: 0.13559135794639587
Validation Accuracy after Epoch 10: 75.26%
Epoch 11/20, Loss: 0.14967934787273407
Validation Accuracy after Epoch 11: 77.21%


KeyboardInterrupt: 