## Fine-Tuning BERT

In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import torch
import math

dataset = load_dataset("coastalcph/tydi_xor_rc")

languages = ['ar', 'ko', 'te']
train_dataset = dataset["train"].filter(lambda example: example['lang'] in languages)
val_dataset = dataset["validation"].filter(lambda example: example['lang'] in languages)

print("Sample from train dataset:")
sample = train_dataset[0]
print(f"Keys: {sample.keys()}")
print(f"Answer structure: {sample['answer']}")
print(f"Answer type: {type(sample['answer'])}")

model_checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_length = 384
doc_stride = 128

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = [c.strip() for c in examples["context"]]
    
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = tokenized_examples.pop("offset_mapping")
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        
        answer_data = examples["answer"][sample_index]
        
        if isinstance(answer_data, dict):
            if "text" in answer_data:
                answer_text = answer_data["text"]
                answer_starts = answer_data.get("answer_start", [])
            else:
                answer_text = answer_data.get("answer_text", answer_data.get("answers", ""))
                answer_starts = answer_data.get("answer_start", answer_data.get("answer_starts", []))
        elif isinstance(answer_data, str):
            answer_text = answer_data
            answer_starts = []
        else:
            answer_text = ""
            answer_starts = []
        
        if isinstance(answer_text, str):
            answer_texts = [answer_text] if answer_text else []
        else:
            answer_texts = answer_text if answer_text else []
        
        if not isinstance(answer_starts, list):
            answer_starts = [answer_starts] if answer_starts else []
        
        if not answer_texts or not answer_texts[0].strip():
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            answer_text = answer_texts[0]
            start_char = answer_starts[0] if answer_starts else 0
            end_char = start_char + len(answer_text)

            token_start_index = 0
            while token_start_index < len(sequence_ids) and sequence_ids[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while token_end_index >= 0 and sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if (token_start_index >= len(offsets) or token_end_index >= len(offsets) or 
                token_start_index > token_end_index or
                not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char)):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                
                while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=25,
    logging_strategy="steps",
    logging_steps=25,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

print("Overall Evaluation")
eval_results = trainer.evaluate()
print(f"Overall Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
print(f"Overall Loss: {eval_results['eval_loss']:.4f}")

languages = ['ar', 'ko', 'te']
print("\nLanguage-specific Evaluations")

for lang in languages:
    print(f"\nEvaluating {lang.upper()}")
    
    lang_val_dataset = val_dataset.filter(lambda example: example['lang'] == lang)
    print(f"Number of {lang} validation examples: {len(lang_val_dataset)}")
    
    if len(lang_val_dataset) == 0:
        print(f"No validation examples found for language: {lang}")
        continue
    
    tokenized_lang_val = lang_val_dataset.map(
        preprocess_function, 
        batched=True, 
        remove_columns=lang_val_dataset.column_names
    )
    
    lang_trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_lang_val,
        tokenizer=tokenizer,
    )
    
    lang_eval_results = lang_trainer.evaluate()
    print(f"{lang.upper()} Perplexity: {math.exp(lang_eval_results['eval_loss']):.2f}")
    print(f"{lang.upper()} Loss: {lang_eval_results['eval_loss']:.4f}")
    
    for key, value in lang_eval_results.items():
        if key not in ['eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch']:
            print(f"{lang.upper()} {key}: {value:.4f}")

print("Evaluation completed for all languages")

Sample from train dataset:
Keys: dict_keys(['question', 'context', 'lang', 'answerable', 'answer_start', 'answer', 'answer_inlang'])
Answer structure: France
Answer type: <class 'str'>


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
wandb: Currently logged in as: aarushsinha60 (chungimungi) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss
25,4.5146,3.173701
50,2.1181,1.284127
75,1.4246,1.214832
100,1.2826,1.151535
125,1.2195,1.121558
150,1.1836,1.111004
175,1.1539,1.115361
200,1.2134,1.081321
225,1.074,1.087571
250,1.1417,1.079575


Overall Evaluation


Overall Perplexity: 2.88
Overall Loss: 1.0582

Language-specific Evaluations

Evaluating AR
Number of ar validation examples: 415


  lang_trainer = Trainer(


AR Perplexity: 2.94
AR Loss: 1.0793
AR eval_model_preparation_time: 0.0020

Evaluating KO
Number of ko validation examples: 356


Map:   0%|          | 0/356 [00:00<?, ? examples/s]

KO Perplexity: 2.86
KO Loss: 1.0503
KO eval_model_preparation_time: 0.0020

Evaluating TE
Number of te validation examples: 384


TE Perplexity: 2.83
TE Loss: 1.0420
TE eval_model_preparation_time: 0.0030
Evaluation completed for all languages


## RNN (LSTM)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
import math
from tqdm import tqdm
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

class LSTMQuestionAnswering(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2, dropout=0.3):
        super(LSTMQuestionAnswering, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, 
                           dropout=dropout, bidirectional=True)
        
        self.hidden_dim = hidden_dim * 2     
        self.question_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, 
                                   dropout=dropout, bidirectional=True)
        self.context_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, 
                                  dropout=dropout, bidirectional=True)
        self.attention = nn.MultiheadAttention(self.hidden_dim, num_heads=8, dropout=dropout)
        self.start_classifier = nn.Linear(self.hidden_dim, 1)
        self.end_classifier = nn.Linear(self.hidden_dim, 1)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, question_ids, context_ids, question_mask=None, context_mask=None):
        batch_size = question_ids.size(0)
        
        question_embedded = self.embedding(question_ids)
        context_embedded = self.embedding(context_ids)
        
        question_output, _ = self.question_lstm(question_embedded)
        context_output, _ = self.context_lstm(context_embedded)
        
        context_output = context_output.transpose(0, 1)  
        question_output = question_output.transpose(0, 1) 
        attended_output, _ = self.attention(context_output, question_output, question_output)
        attended_output = attended_output.transpose(0, 1)  
        
        attended_output = self.dropout(attended_output)
        
        start_logits = self.start_classifier(attended_output).squeeze(-1)  
        end_logits = self.end_classifier(attended_output).squeeze(-1) 
        
        return start_logits, end_logits

class QADataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=384):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        example = self.dataset[idx]
        
        question = example['question'].strip()
        context = example['context'].strip()
        
        question_tokens = self.tokenizer(
            question,
            max_length=64,  
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        context_tokens = self.tokenizer(
            context,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_offsets_mapping=True
        )
        
        answer_data = example.get('answer', '')
        answer_start = example.get('answer_start', [])
        
        if isinstance(answer_data, str):
            answer_text = answer_data
        else:
            answer_text = answer_data if answer_data else ""
            
        if isinstance(answer_start, list) and answer_start:
            answer_start_pos = answer_start[0]
        else:
            answer_start_pos = answer_start if isinstance(answer_start, int) else 0
        
        start_pos = 0
        end_pos = 0
        
        if answer_text and answer_text.strip():
            try:
                offsets = context_tokens['offset_mapping'][0]
                answer_end_pos = answer_start_pos + len(answer_text)
                
                for i, (start_char, end_char) in enumerate(offsets):
                    if start_char <= answer_start_pos < end_char:
                        start_pos = i
                    if start_char < answer_end_pos <= end_char:
                        end_pos = i
                        break
                        
                if end_pos <= start_pos:
                    end_pos = min(start_pos + 1, len(offsets) - 1)
                    
            except:
                start_pos = 0
                end_pos = 0
        
        return {
            'question_ids': question_tokens['input_ids'].squeeze(0),
            'context_ids': context_tokens['input_ids'].squeeze(0),
            'question_mask': question_tokens['attention_mask'].squeeze(0),
            'context_mask': context_tokens['attention_mask'].squeeze(0),
            'start_position': torch.tensor(start_pos, dtype=torch.long),
            'end_position': torch.tensor(end_pos, dtype=torch.long)
        }

def train_epoch(model, train_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    num_batches = 0
    
    pbar = tqdm(train_loader, desc="Training")
    for batch in pbar:
        question_ids = batch['question_ids'].to(device)
        context_ids = batch['context_ids'].to(device)
        start_positions = batch['start_position'].to(device)
        end_positions = batch['end_position'].to(device)
        
        optimizer.zero_grad()
        
        start_logits, end_logits = model(question_ids, context_ids)
        
        start_loss = F.cross_entropy(start_logits, start_positions)
        end_loss = F.cross_entropy(end_logits, end_positions)
        loss = start_loss + end_loss
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        num_batches += 1
        
        pbar.set_postfix({'loss': f'{loss.item():.4f}', 'avg_loss': f'{total_loss/num_batches:.4f}'})
    
    return total_loss / num_batches

def evaluate_model(model, val_loader, device):
    model.eval()
    total_loss = 0
    correct_start = 0
    correct_end = 0
    total_samples = 0
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            question_ids = batch['question_ids'].to(device)
            context_ids = batch['context_ids'].to(device)
            start_positions = batch['start_position'].to(device)
            end_positions = batch['end_position'].to(device)
            
            start_logits, end_logits = model(question_ids, context_ids)
            
            start_loss = F.cross_entropy(start_logits, start_positions)
            end_loss = F.cross_entropy(end_logits, end_positions)
            loss = start_loss + end_loss
            
            total_loss += loss.item()
            
            pred_start = torch.argmax(start_logits, dim=1)
            pred_end = torch.argmax(end_logits, dim=1)
            
            correct_start += (pred_start == start_positions).sum().item()
            correct_end += (pred_end == end_positions).sum().item()
            total_samples += start_positions.size(0)
    
    avg_loss = total_loss / len(val_loader)
    start_accuracy = correct_start / total_samples
    end_accuracy = correct_end / total_samples
    
    return avg_loss, start_accuracy, end_accuracy

def evaluate_by_language(model, dataset, tokenizer, languages, device):
    """Evaluate model performance for each language separately"""
    results = {}
    
    for lang in languages:
        print(f"\n Evaluating {lang.lower()} ")
        
        lang_dataset = dataset.filter(lambda example: example['lang'] == lang)
        print(f"Number of {lang} validation examples: {len(lang_dataset)}")
        
        if len(lang_dataset) == 0:
            print(f"No validation examples found for language: {lang}")
            continue
        
        lang_qa_dataset = QADataset(lang_dataset, tokenizer)
        lang_loader = DataLoader(lang_qa_dataset, batch_size=16, shuffle=False)
        
        avg_loss, start_acc, end_acc = evaluate_model(model, lang_loader, device)
        perplexity = math.exp(avg_loss)
        
        results[lang] = {
            'loss': avg_loss,
            'perplexity': perplexity,
            'start_accuracy': start_acc,
            'end_accuracy': end_acc
        }
        
        print(f"{lang.upper()} Loss: {avg_loss:.4f}")
        print(f"{lang.upper()} Perplexity: {perplexity:.2f}")
        print(f"{lang.upper()} Start Position Accuracy: {start_acc:.4f}")
        print(f"{lang.upper()} End Position Accuracy: {end_acc:.4f}")
    
    return results

def main():
    print("Loading dataset and tokenizer...")
    dataset = load_dataset("coastalcph/tydi_xor_rc")
    languages = ['ar', 'ko', 'te']
    train_dataset = dataset["train"].filter(lambda example: example['lang'] in languages)
    val_dataset = dataset["validation"].filter(lambda example: example['lang'] in languages)

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")


    train_qa_dataset = QADataset(train_dataset, tokenizer)
    val_qa_dataset = QADataset(val_dataset, tokenizer)

    batch_size = 16
    train_loader = DataLoader(train_qa_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_qa_dataset, batch_size=batch_size, shuffle=False)

    vocab_size = tokenizer.vocab_size
    model = LSTMQuestionAnswering(vocab_size=vocab_size, embed_dim=128, hidden_dim=256, num_layers=2).to(device)

    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    num_epochs = 10
    optimizer = optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
    scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.1, 
                                          total_iters=len(train_loader) * num_epochs)

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
        print(f"Training Loss: {train_loss:.4f}")
        
        if (epoch + 1) % 1 == 0: 
            print("\nEvaluating on validation set...")
            val_loss, start_acc, end_acc = evaluate_model(model, val_loader, device)
            val_perplexity = math.exp(val_loss)
            
            print(f"Validation Loss: {val_loss:.4f}")
            print(f"Validation Perplexity: {val_perplexity:.2f}")
            print(f"Start Position Accuracy: {start_acc:.4f}")
            print(f"End Position Accuracy: {end_acc:.4f}")

    print("Evaluation")
    overall_loss, overall_start_acc, overall_end_acc = evaluate_model(model, val_loader, device)
    overall_perplexity = math.exp(overall_loss)
    
    print(f"Overall Loss: {overall_loss:.4f}")
    print(f"Overall Perplexity: {overall_perplexity:.2f}")
    print(f"Overall Start Position Accuracy: {overall_start_acc:.4f}")
    print(f"Overall End Position Accuracy: {overall_end_acc:.4f}")

    print("\nLanguage-specific Evaluations")
    lang_results = evaluate_by_language(model, val_dataset, tokenizer, languages, device)

if __name__ == "__main__":
    main()


Using device: cuda
GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Loading dataset and tokenizer...
Train dataset size: 6335
Validation dataset size: 1155
Model parameters: 12,060,930
Trainable parameters: 12,060,930

Epoch 1/10


Training: 100%|██████████| 396/396 [01:05<00:00,  6.07it/s, loss=8.2154, avg_loss=9.2861]  


Training Loss: 9.2861

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:06<00:00, 10.53it/s]


Validation Loss: 7.8497
Validation Perplexity: 2564.97
Start Position Accuracy: 0.1515
End Position Accuracy: 0.1100

Epoch 2/10


Training: 100%|██████████| 396/396 [01:15<00:00,  5.23it/s, loss=7.4108, avg_loss=7.2681]


Training Loss: 7.2681

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:06<00:00, 10.61it/s]


Validation Loss: 7.1511
Validation Perplexity: 1275.47
Start Position Accuracy: 0.1662
End Position Accuracy: 0.1524

Epoch 3/10


Training: 100%|██████████| 396/396 [01:11<00:00,  5.52it/s, loss=6.2380, avg_loss=6.6507]


Training Loss: 6.6507

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:05<00:00, 12.27it/s]


Validation Loss: 6.7510
Validation Perplexity: 854.92
Start Position Accuracy: 0.2208
End Position Accuracy: 0.2190

Epoch 4/10


Training: 100%|██████████| 396/396 [01:10<00:00,  5.58it/s, loss=5.6223, avg_loss=6.1169]


Training Loss: 6.1169

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:06<00:00, 11.27it/s]


Validation Loss: 6.6770
Validation Perplexity: 793.90
Start Position Accuracy: 0.2355
End Position Accuracy: 0.2199

Epoch 5/10


Training: 100%|██████████| 396/396 [01:09<00:00,  5.73it/s, loss=4.9150, avg_loss=5.7371]


Training Loss: 5.7371

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:06<00:00, 10.53it/s]


Validation Loss: 6.6322
Validation Perplexity: 759.15
Start Position Accuracy: 0.2519
End Position Accuracy: 0.2355

Epoch 6/10


Training: 100%|██████████| 396/396 [01:10<00:00,  5.61it/s, loss=6.8640, avg_loss=5.4078]


Training Loss: 5.4078

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:04<00:00, 16.73it/s]


Validation Loss: 6.6182
Validation Perplexity: 748.63
Start Position Accuracy: 0.2416
End Position Accuracy: 0.2303

Epoch 7/10


Training: 100%|██████████| 396/396 [00:47<00:00,  8.28it/s, loss=3.5647, avg_loss=5.1429]


Training Loss: 5.1429

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:04<00:00, 16.51it/s]


Validation Loss: 6.9190
Validation Perplexity: 1011.27
Start Position Accuracy: 0.2476
End Position Accuracy: 0.2216

Epoch 8/10


Training: 100%|██████████| 396/396 [00:50<00:00,  7.83it/s, loss=4.8291, avg_loss=4.9348]


Training Loss: 4.9348

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:04<00:00, 14.80it/s]


Validation Loss: 6.7802
Validation Perplexity: 880.21
Start Position Accuracy: 0.2468
End Position Accuracy: 0.2268

Epoch 9/10


Training: 100%|██████████| 396/396 [00:51<00:00,  7.72it/s, loss=3.7590, avg_loss=4.6756]


Training Loss: 4.6756

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:04<00:00, 16.11it/s]


Validation Loss: 6.8338
Validation Perplexity: 928.72
Start Position Accuracy: 0.2693
End Position Accuracy: 0.2494

Epoch 10/10


Training: 100%|██████████| 396/396 [00:49<00:00,  7.99it/s, loss=3.4470, avg_loss=4.5349]


Training Loss: 4.5349

Evaluating on validation set...


Evaluating: 100%|██████████| 73/73 [00:04<00:00, 16.04it/s]


Validation Loss: 6.8574
Validation Perplexity: 950.93
Start Position Accuracy: 0.2589
End Position Accuracy: 0.2416
Evaluation


Evaluating: 100%|██████████| 73/73 [00:04<00:00, 16.03it/s]


Overall Loss: 6.8574
Overall Perplexity: 950.93
Overall Start Position Accuracy: 0.2589
Overall End Position Accuracy: 0.2416

Language-specific Evaluations

 Evaluating ar 
Number of ar validation examples: 415


Evaluating: 100%|██████████| 26/26 [00:01<00:00, 16.01it/s]


AR Loss: 6.2708
AR Perplexity: 528.92
AR Start Position Accuracy: 0.3108
AR End Position Accuracy: 0.3133

 Evaluating ko 
Number of ko validation examples: 356


Evaluating: 100%|██████████| 23/23 [00:01<00:00, 16.10it/s]


KO Loss: 6.6723
KO Perplexity: 790.19
KO Start Position Accuracy: 0.2697
KO End Position Accuracy: 0.2472

 Evaluating te 
Number of te validation examples: 384


Evaluating: 100%|██████████| 24/24 [00:01<00:00, 15.94it/s]

TE Loss: 7.5759
TE Perplexity: 1950.63
TE Start Position Accuracy: 0.1927
TE End Position Accuracy: 0.1589



