# **Bi-LSTM**

In [2]:
! pip install gensim -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
import torch
import torch.nn as nn
import math
from gensim.models import Word2Vec
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
import re

def simple_tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())

dataset = load_dataset("coastalcph/tydi_xor_rc")
languages = ['ar', 'ko', 'te']
train_dataset = dataset["train"].filter(lambda example: example['lang'] in languages)
val_dataset = dataset["validation"].filter(lambda example: example['lang'] in languages)

print("Sample from train dataset:")
sample = train_dataset[0]
print(f"Keys: {sample.keys()}")
print(f"Answer structure: {sample['answer']}")
print(f"Answer type: {type(sample['answer'])}")

class Word2VecTokenizer:
    def __init__(self, sentences, vector_size=100, window=5, min_count=1, workers=4):
        self.vector_size = vector_size
        self.word2vec = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
        self.word2index = {word: i + 1 for i, word in enumerate(self.word2vec.wv.index_to_key)}
        self.index2word = {i + 1: word for i, word in enumerate(self.word2vec.wv.index_to_key)}
        self.vocab_size = len(self.word2index) + 1

    def tokenize(self, text):
        return [self.word2index.get(word, 0) for word in simple_tokenize(text.lower())]

    def __call__(self, questions, contexts, truncation, max_length, stride, return_overflowing_tokens, return_offsets_mapping, padding):
        tokenized_examples = {
            "input_ids": [],
            "attention_mask": [],
            "offset_mapping": [],
            "overflow_to_sample_mapping": []
        }
        for i, (question, context) in enumerate(zip(questions, contexts)):
            q_tokens = self.tokenize(question)
            c_tokens = self.tokenize(context)
            input_ids = q_tokens + [0] + c_tokens
            attention_mask = [1] * len(input_ids)
            if len(input_ids) > max_length:
                input_ids = input_ids[:max_length]
                attention_mask = attention_mask[:max_length]
            padding_length = max_length - len(input_ids)
            input_ids += [0] * padding_length
            attention_mask += [0] * padding_length
            tokenized_examples["input_ids"].append(input_ids)
            tokenized_examples["attention_mask"].append(attention_mask)
            tokenized_examples["offset_mapping"].append([(0, 0)] * max_length)
            tokenized_examples["overflow_to_sample_mapping"].append(i)
        return tokenized_examples

class LSTMForQuestionAnswering(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pretrained_embeddings=None):
        super(LSTMForQuestionAnswering, self).__init__()
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.qa_outputs = nn.Linear(hidden_dim * 2, 2)

    def forward(self, input_ids, attention_mask=None):
        embedded = self.embedding(input_ids)
        lstm_out, _ = self.lstm(embedded)
        logits = self.qa_outputs(lstm_out)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        return {"start_logits": start_logits, "end_logits": end_logits}

all_texts = [simple_tokenize(text.lower()) for text in list(train_dataset['context']) + list(train_dataset['question'])]
tokenizer = Word2VecTokenizer(all_texts, vector_size=100)
max_length = 384
doc_stride = 128

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = [c.strip() for c in examples["context"]]
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    offset_mapping = tokenized_examples.pop("offset_mapping")
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    for i, offsets in enumerate(offset_mapping):
        sample_index = sample_mapping[i]
        start_char = examples["answer_start"][sample_index]
        answer_text = examples["answer"][sample_index]
        end_char = start_char + len(answer_text)
        context = contexts[sample_index]
        context_tokens = simple_tokenize(context.lower())
        token_start_index = -1
        token_end_index = -1
        current_char = 0
        for j, token in enumerate(context_tokens):
            if current_char <= start_char < current_char + len(token):
                token_start_index = j
            if current_char < end_char <= current_char + len(token):
                token_end_index = j
                break
            current_char += len(token) + 1
        if token_start_index != -1 and token_end_index != -1:
            tokenized_examples["start_positions"].append(token_start_index + len(simple_tokenize(questions[sample_index].lower())) + 1)
            tokenized_examples["end_positions"].append(token_end_index + len(simple_tokenize(questions[sample_index].lower())) + 1)
        else:
            tokenized_examples["start_positions"].append(0)
            tokenized_examples["end_positions"].append(0)
    return tokenized_examples

print("\nTokenizing datasets...")
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)
tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])
tokenized_val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

embedding_matrix = torch.zeros((tokenizer.vocab_size, tokenizer.vector_size))
for i, word in tokenizer.index2word.items():
    if word in tokenizer.word2vec.wv:
        embedding_matrix[i] = torch.tensor(tokenizer.word2vec.wv[word])

model = LSTMForQuestionAnswering(tokenizer.vocab_size, tokenizer.vector_size, 256, pretrained_embeddings=embedding_matrix)
device = torch.device("cuda")
model.to(device)
batch_size = 16
learning_rate = 2e-5
num_epochs = 1
train_dataloader = DataLoader(tokenized_train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(tokenized_val_dataset, batch_size=batch_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
loss_fct = nn.CrossEntropyLoss()

def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        start_logits = outputs['start_logits']
        end_logits = outputs['end_logits']
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        loss = (start_loss + end_loss) / 2
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask)
            start_logits = outputs['start_logits']
            end_logits = outputs['end_logits']
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            loss = (start_loss + end_loss) / 2
            total_loss += loss.item()
    return total_loss / len(dataloader)

print("\nStarting training...")
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Training Loss: {train_loss:.4f}")

print("\nOverall Evaluation")
eval_loss = evaluate(model, val_dataloader, device)
print(f"Overall Perplexity: {math.exp(eval_loss):.2f}")
print(f"Overall Loss: {eval_loss:.4f}")

print("\nLanguage-specific Evaluations")
for lang in languages:
    print(f"\nEvaluating {lang.upper()}")
    lang_val_dataset = val_dataset.filter(lambda example: example['lang'] == lang)
    print(f"Number of {lang.upper()} validation examples: {len(lang_val_dataset)}")
    if len(lang_val_dataset) == 0:
        print(f"No validation examples found for language: {lang}")
        continue
    tokenized_lang_val = lang_val_dataset.map(preprocess_function, batched=True, remove_columns=lang_val_dataset.column_names)
    tokenized_lang_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])
    lang_dataloader = DataLoader(tokenized_lang_val, batch_size=batch_size)
    lang_eval_loss = evaluate(model, lang_dataloader, device)
    print(f"{lang.upper()} Perplexity: {math.exp(lang_eval_loss):.2f}")
    print(f"{lang.upper()} Loss: {lang_eval_loss:.4f}")

print("\nEvaluating English Contexts Only")
en_contexts = list(val_dataset["context"])
en_context_only_dataset = Dataset.from_dict({
    "question": [""] * len(en_contexts),
    "context": en_contexts,
    "answer": [""] * len(en_contexts),
    "answer_start": [0] * len(en_contexts),
})
tokenized_en_context_val = en_context_only_dataset.map(preprocess_function, batched=True, remove_columns=en_context_only_dataset.column_names)
tokenized_en_context_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'])
en_dataloader = DataLoader(tokenized_en_context_val, batch_size=batch_size)
en_eval_loss = evaluate(model, en_dataloader, device)
print(f"English Context Perplexity: {math.exp(en_eval_loss):.2f}")
print(f"English Context Loss: {en_eval_loss:.4f}")


# **BERT-base**

In [4]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import torch
import math

dataset = load_dataset("coastalcph/tydi_xor_rc")

languages = ['ar', 'ko', 'te']
train_dataset = dataset["train"].filter(lambda example: example['lang'] in languages)
val_dataset = dataset["validation"].filter(lambda example: example['lang'] in languages)

print("Sample from train dataset:")
sample = train_dataset[0]
print(f"Keys: {sample.keys()}")
print(f"Answer structure: {sample['answer']}")
print(f"Answer type: {type(sample['answer'])}")

model_checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_length = 384
doc_stride = 128

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = [c.strip() for c in examples["context"]]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = tokenized_examples.pop("offset_mapping")
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]

        start_char = examples["answer_start"][sample_index]
        answer_text = examples["answer"][sample_index]
        end_char = start_char + len(answer_text)

        token_start_index = 0
        while token_start_index < len(sequence_ids) and sequence_ids[token_start_index] != 1:
            token_start_index += 1

        token_end_index = len(input_ids) - 1
        while token_end_index >= 0 and sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

print("\nTokenizing datasets...")
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=50,
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

print("\nOverall Evaluation")
eval_results = trainer.evaluate()
print(f"Overall Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
print(f"Overall Loss: {eval_results['eval_loss']:.4f}")

print("\nLanguage-specific Evaluations")

for lang in languages:
    print(f"\nEvaluating {lang.upper()}")
    lang_val_dataset = val_dataset.filter(lambda example: example['lang'] == lang)
    print(f"Number of {lang.upper()} validation examples: {len(lang_val_dataset)}")

    if len(lang_val_dataset) == 0:
        print(f"No validation examples found for language: {lang}")
        continue

    tokenized_lang_val = lang_val_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=lang_val_dataset.column_names
    )

    lang_trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_lang_val,
        tokenizer=tokenizer,
    )

    lang_eval_results = lang_trainer.evaluate()
    print(f"{lang.upper()} Perplexity: {math.exp(lang_eval_results['eval_loss']):.2f}")
    print(f"{lang.upper()} Loss: {lang_eval_results['eval_loss']:.4f}")

# English Context Only Evaluation
print("\nEvaluating English Contexts Only")

en_contexts = list(val_dataset["context"])
en_context_only_dataset = Dataset.from_dict({
    "question": [""] * len(en_contexts),  # empty question (no lang)
    "context": en_contexts,
    "answer": [""] * len(en_contexts),
    "answer_start": [0] * len(en_contexts),
})

tokenized_en_context_val = en_context_only_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=en_context_only_dataset.column_names
)

en_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_en_context_val,
    tokenizer=tokenizer,
)

en_eval_results = en_trainer.evaluate()
print(f"English Context Perplexity: {math.exp(en_eval_results['eval_loss']):.2f}")
print(f"English Context Loss: {en_eval_results['eval_loss']:.4f}")

for key, value in en_eval_results.items():
    if key not in ['eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch']:
        print(f"English Context {key}: {value:.4f}")


Sample from train dataset:
Keys: dict_keys(['question', 'context', 'lang', 'answerable', 'answer_start', 'answer', 'answer_inlang'])
Answer structure: France
Answer type: <class 'str'>

Tokenizing datasets...


Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33maarushsinha60[0m ([33mchungimungi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
50,4.1063,3.202258
100,3.0146,2.787781
150,2.7333,2.665639
200,2.5466,2.589059
250,2.5992,2.518849
300,2.3968,2.55164
350,2.493,2.480206
400,2.4509,2.467368



Overall Evaluation


Overall Perplexity: 11.79
Overall Loss: 2.4671

Language-specific Evaluations

Evaluating AR
Number of AR validation examples: 415


Map:   0%|          | 0/415 [00:00<?, ? examples/s]

  lang_trainer = Trainer(


AR Perplexity: 9.70
AR Loss: 2.2723

Evaluating KO
Number of KO validation examples: 356


Map:   0%|          | 0/356 [00:00<?, ? examples/s]

KO Perplexity: 12.36
KO Loss: 2.5149

Evaluating TE
Number of TE validation examples: 384


Map:   0%|          | 0/384 [00:00<?, ? examples/s]

TE Perplexity: 14.03
TE Loss: 2.6414

Evaluating English Contexts Only


Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

  en_trainer = Trainer(


English Context Perplexity: 52.04
English Context Loss: 3.9520
English Context eval_model_preparation_time: 0.0028
