In [40]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import spacy
from torchtext.vocab import build_vocab_from_iterator
import math

# Model components: the submodules from which the Transformer below is assembled
class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is multiplied by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        # One learnable vector of size d_model per vocabulary id.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.d_model = d_model

    def forward(self, x):
        """Embed every id in ``x`` and scale the vectors by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position signals to its input, then applies dropout.

    The table is precomputed once for ``seq_len`` positions and stored as the
    non-trainable buffer ``pe`` of shape ``(1, seq_len, d_model)``: even feature
    columns hold sines, odd feature columns hold cosines.

    Args:
        d_model: Size of the feature (last) dimension. May be odd.
        seq_len: Number of positions to precompute.
        dropout: Dropout probability applied after the addition.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column, so
        # trim the angles to the number of cosine slots actually available.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading axis of size 1 lets the table broadcast over the batch dimension.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        Raises:
            ValueError: If ``x`` has more positions (dimension 1) than were
                precomputed.
        """
        if x.size(1) > self.pe.size(1):
            raise ValueError(
                f"input has {x.size(1)} positions but only {self.pe.size(1)} were precomputed"
            )
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with learned Q/K/V/output projections.

    Args:
        d_model: Feature size of the inputs and of the output.
        h: Number of attention heads; must divide ``d_model`` evenly.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``h`` is not positive or ``d_model`` is not divisible by ``h``.
    """

    def __init__(self, d_model: int, h: int, dropout: float):
        super().__init__()
        # Without this check the per-head reshape in forward() either fails with an
        # obscure error or silently regroups features across positions.
        if h <= 0 or d_model % h != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by the number of heads h ({h})"
            )
        self.d_k = d_model // h
        self.h = h
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: Tensors whose first dimension is the batch and whose last
                dimension is ``d_model``.
            mask: ``None`` or a boolean tensor broadcastable to the score tensor
                ``(batch, h, q_len, k_len)``; positions that are ``True`` are
                blocked (their scores are replaced by -1e9 before the softmax).

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size = q.size(0)
        # Project, then split the feature axis into heads: (batch, h, len, d_k).
        q = self.w_q(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask, -1e9)
        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        # Merge the heads back into a single feature axis before the output projection.
        output = torch.matmul(attn_weights, v).transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.w_o(output)

class FeedForwardBlock(nn.Module):
    """Position-wise MLP: Linear(d_model -> d_ff), GELU, Dropout, Linear(d_ff -> d_model)."""

    def __init__(self, d_model: int, d_ff: int, dropout: float):
        super().__init__()
        # Layers are created in the order they are applied.
        expand = nn.Linear(d_model, d_ff)
        activation = nn.GELU()
        hidden_dropout = nn.Dropout(dropout)
        contract = nn.Linear(d_ff, d_model)
        self.net = nn.Sequential(expand, activation, hidden_dropout, contract)

    def forward(self, x):
        """Run ``x`` through the four-layer stack and return the result."""
        transformed = self.net(x)
        return transformed

class EncoderBlock(nn.Module):
    """Pre-norm encoder layer: self-attention, then feed-forward, each added back as a residual."""

    def __init__(self, d_model: int, self_attention: MultiHeadAttentionBlock, ff: FeedForwardBlock, dropout: float):
        super().__init__()
        # Sub-layers are supplied by the caller; only the norms and dropout are created here.
        self.self_attention = self_attention
        self.ff = ff
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_mask):
        """Apply both residual sub-layers to ``x``; ``src_mask`` is handed to the attention."""
        normed = self.norm1(x)
        attended = self.self_attention(normed, normed, normed, src_mask)
        x = x + self.dropout(attended)

        transformed = self.ff(self.norm2(x))
        return x + self.dropout(transformed)

class Encoder(nn.Module):
    """Runs the input through every block in order, then applies a final LayerNorm."""

    def __init__(self, d_model: int, blocks: nn.ModuleList):
        super().__init__()
        self.blocks = blocks
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        """Feed ``x`` through the stack (each block receives ``mask``) and normalise the result."""
        hidden = x
        for layer in self.blocks:
            hidden = layer(hidden, mask)
        return self.norm(hidden)

class DecoderBlock(nn.Module):
    """Pre-norm decoder layer: self-attention, cross-attention over the encoder output, feed-forward."""

    def __init__(self, d_model: int, self_attention: MultiHeadAttentionBlock, cross_attention: MultiHeadAttentionBlock, ff: FeedForwardBlock, dropout: float):
        super().__init__()
        # Sub-layers are supplied by the caller; only the norms and dropout are created here.
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.ff = ff
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply the three residual sub-layers to ``x``.

        ``tgt_mask`` is passed to the self-attention and ``src_mask`` to the
        cross-attention, whose keys and values are both ``encoder_output``.
        """
        normed = self.norm1(x)
        x = x + self.dropout(self.self_attention(normed, normed, normed, tgt_mask))

        queries = self.norm2(x)
        x = x + self.dropout(self.cross_attention(queries, encoder_output, encoder_output, src_mask))

        return x + self.dropout(self.ff(self.norm3(x)))

class Decoder(nn.Module):
    """Runs the target through every decoder block in order, then applies a final LayerNorm."""

    def __init__(self, d_model: int, blocks: nn.ModuleList):
        super().__init__()
        self.blocks = blocks
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through the stack; every block receives the same encoder output and masks."""
        hidden = x
        for layer in self.blocks:
            hidden = layer(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

class ProjectionLayer(nn.Module):
    """Linear map from d_model features to vocabulary scores, returned as log-probabilities."""

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        """Project ``x`` and apply log-softmax over the last (vocabulary) dimension."""
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)

# Transformer: the full encoder-decoder model assembled from the components above
class Transformer(nn.Module):
    """Encoder-decoder Transformer built from the component classes defined in this notebook.

    Args:
        src_vocab_size / tgt_vocab_size: Number of rows in the source / target embedding tables.
        src_seq_len / tgt_seq_len: Number of positions precomputed by each positional encoding.
        d_model: Feature size used throughout the model.
        N: Number of encoder blocks and of decoder blocks.
        h: Number of attention heads per attention layer.
        dropout: Dropout probability passed to every sub-module.
        d_ff: Hidden size of each feed-forward block.
    """

    def __init__(self, src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048):
        super().__init__()
        self.src_embed = InputEmbeddings(d_model, src_vocab_size)
        self.tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
        self.src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
        self.tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

        # Each encoder layer owns a fresh attention module and a fresh feed-forward module.
        encoder_layers = [
            EncoderBlock(
                d_model,
                MultiHeadAttentionBlock(d_model, h, dropout),
                FeedForwardBlock(d_model, d_ff, dropout),
                dropout,
            )
            for _ in range(N)
        ]
        self.encoder = Encoder(d_model, nn.ModuleList(encoder_layers))

        # Each decoder layer owns separate self-attention and cross-attention modules.
        decoder_layers = [
            DecoderBlock(
                d_model,
                MultiHeadAttentionBlock(d_model, h, dropout),
                MultiHeadAttentionBlock(d_model, h, dropout),
                FeedForwardBlock(d_model, d_ff, dropout),
                dropout,
            )
            for _ in range(N)
        ]
        self.decoder = Decoder(d_model, nn.ModuleList(decoder_layers))

        self.projection = ProjectionLayer(d_model, tgt_vocab_size)

        # Re-initialise every parameter with more than one dimension using Xavier-uniform;
        # one-dimensional parameters keep the initialisation their layer gave them.
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def encode(self, src, src_mask):
        """Embed ``src``, add positional encodings and run the encoder stack."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed ``tgt``, add positional encodings and run the decoder stack over ``encoder_output``."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Map decoder features to the output of the projection layer."""
        return self.projection(x)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src``, decode ``tgt`` against it and project the decoder output."""
        memory = self.encode(src, src_mask)
        decoded = self.decode(memory, src_mask, tgt, tgt_mask)
        return self.project(decoded)

In [36]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torchtext.vocab import build_vocab_from_iterator
import math
import re

# Custom simple tokenizer: lower-cases the text, then emits each run of word
# characters and each single non-word, non-space character as its own token
def simple_tokenizer(text):
    """Return the list of lower-cased word and punctuation tokens found in ``text``."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+|[^\w\s]', lowered)]

class TranslationDataset(Dataset):
    """Turns English/Italian sentence pairs into fixed-length id tensors.

    Args:
        data: Indexable collection whose items have a ``'translation'`` entry
            with ``'en'`` and ``'it'`` strings.
        src_vocab: Mapping from English token to id (indexed with ``[]``).
        tgt_vocab: Mapping from Italian token to id (indexed with ``[]``).
        max_len: Length of every returned tensor; must be at least 2 so that
            there is room for the ``<bos>``/``<eos>`` markers.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """

    def __init__(self, data, src_vocab, tgt_vocab, max_len=100):
        # A smaller max_len would turn the truncation bound (max_len - 2) negative,
        # producing samples of differing lengths that cannot be batched.
        if max_len < 2:
            raise ValueError(f"max_len must be at least 2, got {max_len}")
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return a dict with ``'src'``, ``'tgt_input'`` and ``'tgt_output'`` long tensors.

        ``src`` is ``<bos> tokens <eos>``; ``tgt_input`` is ``<bos> tokens`` and
        ``tgt_output`` is ``tokens <eos>`` (the same tokens shifted by one).
        Each is right-padded with ``<pad>`` to ``max_len``.
        """
        item = self.data[idx]['translation']
        body_len = self.max_len - 2  # leave room for the boundary markers

        # Source (English) processing
        src_tokens = ['<bos>'] + simple_tokenizer(item['en'])[:body_len] + ['<eos>']

        # Target (Italian) processing
        tgt_tokens = simple_tokenizer(item['it'])[:body_len]
        tgt_input = ['<bos>'] + tgt_tokens
        tgt_output = tgt_tokens + ['<eos>']

        return {
            'src': self._encode(src_tokens, self.src_vocab),
            'tgt_input': self._encode(tgt_input, self.tgt_vocab),
            'tgt_output': self._encode(tgt_output, self.tgt_vocab)
        }

    def _encode(self, tokens, vocab):
        """Right-pad ``tokens`` with ``<pad>`` to ``max_len`` and map them to a long tensor of ids."""
        padded = tokens + ['<pad>'] * (self.max_len - len(tokens))
        return torch.tensor([vocab[tok] for tok in padded], dtype=torch.long)

# Load dataset and hold out 20% of it for validation.
# The split is seeded so that the train/validation partition -- and therefore the
# vocabularies built from the training portion -- is the same on every run.
dataset = load_dataset('opus_books', 'en-it')['train'].train_test_split(test_size=0.2, seed=42)
train_data = dataset['train']
valid_data = dataset['test']

# Build vocabulary from training data
def yield_tokens(data_iter, lang):
    """Lazily yield the token list of the ``lang`` text of every example in ``data_iter``."""
    texts = (example['translation'][lang] for example in data_iter)
    for text in texts:
        yield simple_tokenizer(text)

# '<unk>' is appended after the existing specials so that '<pad>', '<bos>' and
# '<eos>' stay first in the specials list.
src_vocab = build_vocab_from_iterator(
    yield_tokens(train_data, 'en'),
    specials=['<pad>', '<bos>', '<eos>', '<unk>']
)
tgt_vocab = build_vocab_from_iterator(
    yield_tokens(train_data, 'it'),
    specials=['<pad>', '<bos>', '<eos>', '<unk>']
)
# Tokens never seen in the training split map to '<unk>'. Mapping them to '<pad>'
# would make them indistinguishable from padding, which the masks hide from
# attention and the loss ignores.
src_vocab.set_default_index(src_vocab['<unk>'])
tgt_vocab.set_default_index(tgt_vocab['<unk>'])

# Create datasets and the loaders that batch them
max_len = 100
batch_size = 32
train_dataset = TranslationDataset(
    data=train_data, src_vocab=src_vocab, tgt_vocab=tgt_vocab, max_len=max_len
)
valid_dataset = TranslationDataset(
    data=valid_data, src_vocab=src_vocab, tgt_vocab=tgt_vocab, max_len=max_len
)

# Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)

In [44]:
# The model components used below are defined in the first cell of this notebook.

# Pick the device (GPU when available) and build the model on it
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = Transformer(
    len(src_vocab),
    len(tgt_vocab),
    max_len,
    max_len,
    d_model=512,
    N=2,
    h=8,
)
model = model.to(device)


In [45]:

# Loss, optimizer and learning-rate schedule; the training and validation loops follow in a later cell.

# Training setup
# Target positions holding '<pad>' do not contribute to the loss.
criterion = nn.CrossEntropyLoss(
    ignore_index=tgt_vocab['<pad>'],
)
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=1e-4,
    betas=(0.9, 0.98),
    eps=1e-9,
)

class WarmupScheduler:
    """Learning-rate schedule with linear warmup followed by inverse-square-root decay.

    The rate for step ``s`` is
    ``d_model ** -0.5 * min(s ** -0.5, s * warmup_steps ** -1.5)``
    and is written into every parameter group of ``optimizer``.

    The rate for step 1 is applied at construction, so the first optimizer update
    already uses the schedule instead of the rate the optimizer was created with.
    Call :meth:`step` once after each optimizer update.

    Args:
        d_model: Model feature size used to scale the rate.
        warmup_steps: Number of warmup steps; must be positive.
        optimizer: Object exposing a ``param_groups`` list of dicts with an ``'lr'`` entry.

    Raises:
        ValueError: If ``d_model`` or ``warmup_steps`` is not positive.
    """

    def __init__(self, d_model, warmup_steps, optimizer):
        if d_model <= 0 or warmup_steps <= 0:
            raise ValueError(
                f"d_model and warmup_steps must be positive, got {d_model} and {warmup_steps}"
            )
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.step_num = 0
        # Prime the optimizer so its very first update runs at the warmup rate.
        self._apply(self._rate(1))

    def _rate(self, step_num):
        """Return the scheduled learning rate for ``step_num`` (which must be >= 1)."""
        return (self.d_model ** -0.5) * min(step_num ** -0.5, step_num * (self.warmup_steps ** -1.5))

    def _apply(self, lr):
        """Write ``lr`` into every parameter group of the optimizer."""
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def step(self):
        """Advance the step counter by one and apply the rate for the new step."""
        self.step_num += 1
        self._apply(self._rate(self.step_num))
# d_model here must match the d_model the model was built with.
scheduler = WarmupScheduler(d_model=512, warmup_steps=4000, optimizer=optimizer)

In [46]:
# Mask functions
def create_mask(src, tgt_input, src_pad_idx, tgt_pad_idx):
    """Build the boolean attention masks for one batch (``True`` = position is blocked).

    Args:
        src: Source id tensor; assumed to be 2-D ``(batch, src_len)``.
        tgt_input: Decoder-input id tensor; assumed to be 2-D ``(batch, tgt_len)``.
        src_pad_idx: Id that marks padding in ``src``.
        tgt_pad_idx: Id that marks padding in ``tgt_input``.

    Returns:
        ``(src_mask, tgt_mask)`` where ``src_mask`` has shape
        ``(batch, 1, 1, src_len)`` and blocks source padding, and ``tgt_mask``
        has shape ``(batch, 1, tgt_len, tgt_len)`` and blocks target padding as
        well as every position that lies after the query position.
    """
    src_mask = (src == src_pad_idx).unsqueeze(1).unsqueeze(2)
    tgt_pad_mask = (tgt_input == tgt_pad_idx).unsqueeze(1).unsqueeze(2)
    tgt_len = tgt_input.size(1)
    # diagonal=1 keeps the diagonal unblocked so each position can attend to
    # itself. Blocking it would leave the first row fully blocked, which the
    # softmax turns into uniform attention over all positions, future included.
    # The mask is created on the input's own device rather than a global one.
    tgt_sub_mask = torch.triu(
        torch.ones((tgt_len, tgt_len), device=tgt_input.device), diagonal=1
    ).bool()
    tgt_mask = tgt_pad_mask | tgt_sub_mask
    return src_mask, tgt_mask


In [47]:
from tqdm import tqdm

# Training loop
# Runs num_epochs passes over train_loader. Validation (further down) is outside
# the epoch loop, so it runs once, after the last epoch has finished.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    # Wrap train_loader with tqdm for progress bar
    train_iterator = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
    for batch in train_iterator:
        src = batch['src'].to(device)
        tgt_input = batch['tgt_input'].to(device)
        tgt_output = batch['tgt_output'].to(device)
        
        src_mask, tgt_mask = create_mask(src, tgt_input, src_vocab['<pad>'], tgt_vocab['<pad>'])
        
        optimizer.zero_grad()
        output = model(src, tgt_input, src_mask, tgt_mask)
        # Flatten all but the last axis of the output, and the targets to 1-D,
        # before handing them to the criterion.
        loss = criterion(output.view(-1, output.size(-1)), tgt_output.view(-1))
        loss.backward()
        optimizer.step()
        # The learning rate set here takes effect on the next optimizer update.
        scheduler.step()
        
        total_loss += loss.item()
        # Update progress bar with current loss
        train_iterator.set_postfix({"loss": f"{loss.item():.4f}"})
    
    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

# Validation
# Same forward pass and loss as in training, but in eval mode, without gradient
# tracking and without any optimizer or scheduler step.
model.eval()
val_loss = 0
# Wrap valid_loader with tqdm for progress bar
val_iterator = tqdm(valid_loader, desc="Validation", leave=False)
with torch.no_grad():
    for batch in val_iterator:
        src = batch['src'].to(device)
        tgt_input = batch['tgt_input'].to(device)
        tgt_output = batch['tgt_output'].to(device)
        
        src_mask, tgt_mask = create_mask(src, tgt_input, src_vocab['<pad>'], tgt_vocab['<pad>'])
        
        output = model(src, tgt_input, src_mask, tgt_mask)
        loss = criterion(output.view(-1, output.size(-1)), tgt_output.view(-1))
        val_loss += loss.item()
        # Update progress bar with current loss
        val_iterator.set_postfix({"val_loss": f"{loss.item():.4f}"})

# Mean of the per-batch validation losses.
print(f"Validation Loss: {val_loss/len(valid_loader):.4f}")

                                                                           

KeyboardInterrupt: 

In [32]:
# Scratch check: split the training portion again to look at the resulting sizes
dataset['train'].train_test_split(test_size=0.2)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 25865
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 6467
    })
})

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import spacy
from torchtext.vocab import build_vocab_from_iterator
import math

# Model components: the submodules from which the Transformer below is assembled
class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is multiplied by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        # One learnable vector of size d_model per vocabulary id.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.d_model = d_model

    def forward(self, x):
        """Embed every id in ``x`` and scale the vectors by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position signals to its input, then applies dropout.

    The table is precomputed once for ``seq_len`` positions and stored as the
    non-trainable buffer ``pe`` of shape ``(1, seq_len, d_model)``: even feature
    columns hold sines, odd feature columns hold cosines.

    Args:
        d_model: Size of the feature (last) dimension. May be odd.
        seq_len: Number of positions to precompute.
        dropout: Dropout probability applied after the addition.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column, so
        # trim the angles to the number of cosine slots actually available.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading axis of size 1 lets the table broadcast over the batch dimension.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        Raises:
            ValueError: If ``x`` has more positions (dimension 1) than were
                precomputed.
        """
        if x.size(1) > self.pe.size(1):
            raise ValueError(
                f"input has {x.size(1)} positions but only {self.pe.size(1)} were precomputed"
            )
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with learned Q/K/V/output projections.

    Args:
        d_model: Feature size of the inputs and of the output.
        h: Number of attention heads; must divide ``d_model`` evenly.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``h`` is not positive or ``d_model`` is not divisible by ``h``.
    """

    def __init__(self, d_model: int, h: int, dropout: float):
        super().__init__()
        # Without this check the per-head reshape in forward() either fails with an
        # obscure error or silently regroups features across positions.
        if h <= 0 or d_model % h != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by the number of heads h ({h})"
            )
        self.d_k = d_model // h
        self.h = h
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: Tensors whose first dimension is the batch and whose last
                dimension is ``d_model``.
            mask: ``None`` or a boolean tensor broadcastable to the score tensor
                ``(batch, h, q_len, k_len)``; positions that are ``True`` are
                blocked (their scores are replaced by -1e9 before the softmax).

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size = q.size(0)
        # Project, then split the feature axis into heads: (batch, h, len, d_k).
        q = self.w_q(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask, -1e9)
        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        # Merge the heads back into a single feature axis before the output projection.
        output = torch.matmul(attn_weights, v).transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.w_o(output)

class FeedForwardBlock(nn.Module):
    """Position-wise MLP: Linear(d_model -> d_ff), GELU, Dropout, Linear(d_ff -> d_model)."""

    def __init__(self, d_model: int, d_ff: int, dropout: float):
        super().__init__()
        # Layers are created in the order they are applied.
        expand = nn.Linear(d_model, d_ff)
        activation = nn.GELU()
        hidden_dropout = nn.Dropout(dropout)
        contract = nn.Linear(d_ff, d_model)
        self.net = nn.Sequential(expand, activation, hidden_dropout, contract)

    def forward(self, x):
        """Run ``x`` through the four-layer stack and return the result."""
        transformed = self.net(x)
        return transformed

class EncoderBlock(nn.Module):
    """Pre-norm encoder layer: self-attention, then feed-forward, each added back as a residual."""

    def __init__(self, d_model: int, self_attention: MultiHeadAttentionBlock, ff: FeedForwardBlock, dropout: float):
        super().__init__()
        # Sub-layers are supplied by the caller; only the norms and dropout are created here.
        self.self_attention = self_attention
        self.ff = ff
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_mask):
        """Apply both residual sub-layers to ``x``; ``src_mask`` is handed to the attention."""
        normed = self.norm1(x)
        attended = self.self_attention(normed, normed, normed, src_mask)
        x = x + self.dropout(attended)

        transformed = self.ff(self.norm2(x))
        return x + self.dropout(transformed)

class Encoder(nn.Module):
    """Runs the input through every block in order, then applies a final LayerNorm."""

    def __init__(self, d_model: int, blocks: nn.ModuleList):
        super().__init__()
        self.blocks = blocks
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        """Feed ``x`` through the stack (each block receives ``mask``) and normalise the result."""
        hidden = x
        for layer in self.blocks:
            hidden = layer(hidden, mask)
        return self.norm(hidden)

class DecoderBlock(nn.Module):
    """Pre-norm decoder layer: self-attention, cross-attention over the encoder output, feed-forward."""

    def __init__(self, d_model: int, self_attention: MultiHeadAttentionBlock, cross_attention: MultiHeadAttentionBlock, ff: FeedForwardBlock, dropout: float):
        super().__init__()
        # Sub-layers are supplied by the caller; only the norms and dropout are created here.
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.ff = ff
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply the three residual sub-layers to ``x``.

        ``tgt_mask`` is passed to the self-attention and ``src_mask`` to the
        cross-attention, whose keys and values are both ``encoder_output``.
        """
        normed = self.norm1(x)
        x = x + self.dropout(self.self_attention(normed, normed, normed, tgt_mask))

        queries = self.norm2(x)
        x = x + self.dropout(self.cross_attention(queries, encoder_output, encoder_output, src_mask))

        return x + self.dropout(self.ff(self.norm3(x)))

class Decoder(nn.Module):
    """Runs the target through every decoder block in order, then applies a final LayerNorm."""

    def __init__(self, d_model: int, blocks: nn.ModuleList):
        super().__init__()
        self.blocks = blocks
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through the stack; every block receives the same encoder output and masks."""
        hidden = x
        for layer in self.blocks:
            hidden = layer(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

class ProjectionLayer(nn.Module):
    """Linear map from d_model features to vocabulary scores, returned as log-probabilities."""

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        """Project ``x`` and apply log-softmax over the last (vocabulary) dimension."""
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)

# Transformer: the full encoder-decoder model assembled from the components above
class Transformer(nn.Module):
    """Encoder-decoder Transformer built from the component classes defined in this notebook.

    Args:
        src_vocab_size / tgt_vocab_size: Number of rows in the source / target embedding tables.
        src_seq_len / tgt_seq_len: Number of positions precomputed by each positional encoding.
        d_model: Feature size used throughout the model.
        N: Number of encoder blocks and of decoder blocks.
        h: Number of attention heads per attention layer.
        dropout: Dropout probability passed to every sub-module.
        d_ff: Hidden size of each feed-forward block.
    """

    def __init__(self, src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048):
        super().__init__()
        self.src_embed = InputEmbeddings(d_model, src_vocab_size)
        self.tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
        self.src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
        self.tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

        # Each encoder layer owns a fresh attention module and a fresh feed-forward module.
        encoder_layers = [
            EncoderBlock(
                d_model,
                MultiHeadAttentionBlock(d_model, h, dropout),
                FeedForwardBlock(d_model, d_ff, dropout),
                dropout,
            )
            for _ in range(N)
        ]
        self.encoder = Encoder(d_model, nn.ModuleList(encoder_layers))

        # Each decoder layer owns separate self-attention and cross-attention modules.
        decoder_layers = [
            DecoderBlock(
                d_model,
                MultiHeadAttentionBlock(d_model, h, dropout),
                MultiHeadAttentionBlock(d_model, h, dropout),
                FeedForwardBlock(d_model, d_ff, dropout),
                dropout,
            )
            for _ in range(N)
        ]
        self.decoder = Decoder(d_model, nn.ModuleList(decoder_layers))

        self.projection = ProjectionLayer(d_model, tgt_vocab_size)

        # Re-initialise every parameter with more than one dimension using Xavier-uniform;
        # one-dimensional parameters keep the initialisation their layer gave them.
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def encode(self, src, src_mask):
        """Embed ``src``, add positional encodings and run the encoder stack."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed ``tgt``, add positional encodings and run the decoder stack over ``encoder_output``."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Map decoder features to the output of the projection layer."""
        return self.projection(x)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src``, decode ``tgt`` against it and project the decoder output."""
        memory = self.encode(src, src_mask)
        decoded = self.decode(memory, src_mask, tgt, tgt_mask)
        return self.project(decoded)

In [27]:
# Scratch check: display the table object that backs the training split
dataset['train'].data

MemoryMappedTable
id: string
translation: struct<en: string, it: string>
  child 0, en: string
  child 1, it: string
----
id: [["0","1","2","3","4",...,"995","996","997","998","999"],["1000","1001","1002","1003","1004",...,"1995","1996","1997","1998","1999"],...,["31000","31001","31002","31003","31004",...,"31995","31996","31997","31998","31999"],["32000","32001","32002","32003","32004",...,"32327","32328","32329","32330","32331"]]
translation: [
  -- is_valid: all not null
  -- child 0 type: string
["Source: Project Gutenberg","Jane Eyre","Charlotte Bronte","CHAPTER I","There was no possibility of taking a walk that day.",...,""I am afraid I never shall do that."",""Why?"",""Because I have been wrongly accused; and you, ma'am, and everybody else, will now think me wicked."",""We shall think you what you prove yourself to be, my child.","Continue to act as a good girl, and you will satisfy us.""]
  -- child 1 type: string
["Source: www.liberliber.it/Audiobook available here","Jane Eyre