In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=2100):  # Increased buffer
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        seq_len = x.size(1)
        if seq_len > self.max_len:
            # Instead of error, automatically extend (safer than dynamic)
            warnings.warn(f"Extending positional encodings from {self.max_len} to {seq_len}")
            self.extend_pe(seq_len)
        return x + self.pe[:, :seq_len].to(x.device)
    
    def extend_pe(self, new_max_len):
        """Extend positional embeddings if needed"""
        if new_max_len <= self.max_len:
            return
            
        new_pe = torch.zeros(new_max_len, self.d_model, device=self.pe.device)
        new_pe[:self.max_len] = self.pe[0]
        
        position = torch.arange(self.max_len, new_max_len, device=self.pe.device).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.d_model, 2, device=self.pe.device) * 
                          -(math.log(10000.0) / self.d_model))
        new_pe[self.max_len:, 0::2] = torch.sin(position * div_term)
        new_pe[self.max_len:, 1::2] = torch.cos(position * div_term)
        
        self.pe = new_pe.unsqueeze(0)
        self.max_len = new_max_len

In [4]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v):
        bs = q.size(0)

        # линейные преобразования и деление на головы
        def split(x): return x.view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)

        q, k, v = map(split, (self.q_linear(q), self.k_linear(k), self.v_linear(v)))

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        attn = F.softmax(scores, dim=-1)
        x = torch.matmul(attn, v)

        x = x.transpose(1, 2).contiguous().view(bs, -1, self.num_heads * self.d_k)
        return self.out(x)


In [5]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))


In [6]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        attn_out = self.self_attn(x, x, x)
        x = self.norm1(x + attn_out)
        ff_out = self.feed_forward(x)
        return self.norm2(x + ff_out)


In [7]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model=128, num_heads=4, num_layers=2, num_classes=2, max_len=2100):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, num_heads, d_ff=512)
            for _ in range(num_layers)
        ])
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.embed(x)
        x = self.pos_enc(x)
        for layer in self.layers:
            x = layer(x)
        x = x.mean(dim=1)  # Global Average Pooling
        return self.classifier(x)


In [8]:

# 2. Verify imports work
import torch
from torchtext.datasets import IMDB
print("PyTorch version:", torch.__version__)
print("IMDB dataset loaded successfully")

# 3. Alternative if you need newer versions
# !pip install torch==2.1.0 torchtext==0.16.0

PyTorch version: 2.1.0+cu121
IMDB dataset loaded successfully


In [9]:
import torch
from torch.utils.data import DataLoader
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence


In [10]:
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for label, line in data_iter:
        yield tokenizer(line)

train_iter = IMDB(split='train')
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Перезагрузим после итерирования
train_iter = IMDB(split='train')
test_iter = IMDB(split='test')


In [11]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: 1 if x == "pos" else 0

def collate_batch(batch):
    label_list, text_list = [], []
    max_len = 2100  # Hard truncation to match model's max_len
    for label, text in batch:
        label_list.append(label_pipeline(label))
        processed_text = text_pipeline(text)[:max_len]  # Trim to max_len
        text_list.append(torch.tensor(processed_text, dtype=torch.long))
    text_list = pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"])
    return text_list.to(device), torch.tensor(label_list, dtype=torch.long).to(device)


In [12]:
batch_size = 16
train_loader = DataLoader(list(IMDB(split='train')), batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(list(IMDB(split='test')), batch_size=batch_size, shuffle=False, collate_fn=collate_batch)


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerClassifier(
    vocab_size=len(vocab),
    d_model=128,
    num_heads=4,
    num_layers=2,
    num_classes=2,
    max_len=2100
).to(device)


In [14]:
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
scaler = GradScaler()

def train_epoch(model, dataloader):
    model.train()
    total_loss, correct = 0, 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        with autocast():  # Enable mixed precision
            out = model(x)
            loss = criterion(out, y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
        correct += (out.argmax(1) == y).sum().item()
    return total_loss / len(dataloader), correct / len(dataloader.dataset)


In [15]:
def evaluate(model, dataloader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            correct += (out.argmax(1) == y).sum().item()
    return correct / len(dataloader.dataset)


In [16]:
for epoch in range(5):
    train_loss, train_acc = train_epoch(model, train_loader)
    test_acc = evaluate(model, test_loader)
    print(f"Epoch {epoch+1}: Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, Test Acc={test_acc:.4f}")


Epoch 1: Loss=0.0048, Train Acc=0.9976, Test Acc=1.0000
Epoch 2: Loss=0.0001, Train Acc=1.0000, Test Acc=1.0000
Epoch 3: Loss=0.0000, Train Acc=1.0000, Test Acc=1.0000
Epoch 4: Loss=0.0000, Train Acc=1.0000, Test Acc=1.0000
Epoch 5: Loss=0.0000, Train Acc=1.0000, Test Acc=1.0000
