In [None]:
import os
import random
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    BertTokenizerFast,
    BertForQuestionAnswering,
    AdamW,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)
import time
from tqdm import tqdm
import re
import string

In [None]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

In [None]:
# device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
import json
# Load training data
with open('Spoken-SQuAD/spoken_train-v1.1.json', 'r') as f:
    training_data = json.load(f)
# Load validation data
with open('Spoken-SQuAD/spoken_test-v1.1.json', 'r') as f:
    validation_data = json.load(f)


In [None]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
def prepare_examples_with_stride(data, tokenizer, max_length=512, stride=128):
    examples = []
    for article in data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answers = qa.get('answers', [])
                
                # Skip examples without answers
                if not answers:
                    continue

                answer = answers[0]
                answer_text = answer['text']
                start_char = answer['answer_start']
                end_char = start_char + len(answer_text)
            
                tokenized_inputs = tokenizer(
                    question,
                    context,
                    max_length=max_length,
                    truncation='only_second',
                    stride=stride,
                    return_overflowing_tokens=True,
                    return_offsets_mapping=True,
                    padding='max_length'
                )

                # Map tokens
                overflow_mapping = tokenized_inputs.pop('overflow_to_sample_mapping')
                offset_mapping = tokenized_inputs.pop('offset_mapping')

                for i in range(len(tokenized_inputs['input_ids'])):
                    input_ids = tokenized_inputs['input_ids'][i]
                    attention_mask = tokenized_inputs['attention_mask'][i]
                    token_type_ids = tokenized_inputs['token_type_ids'][i]
                    sequence_ids = tokenized_inputs.sequence_ids(i)

                    context_start = sequence_ids.index(1)
                    context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

                    offsets = offset_mapping[i]
                    sample_index = overflow_mapping[i]

                    
                    if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
                       
                        start_positions = 0
                        end_positions = 0
                    else:
                        # Finding the start and end token indices
                        token_start_index = context_start
                        token_end_index = context_end

                        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                            token_start_index += 1
                        start_positions = token_start_index - 1

                        while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                            token_end_index -= 1
                        end_positions = token_end_index + 1

                    # example
                    example = {
                        'input_ids': input_ids,
                        'attention_mask': attention_mask,
                        'token_type_ids': token_type_ids,
                        'start_positions': start_positions,
                        'end_positions': end_positions,
                        'question': question,
                        'context': context,
                        'answers': [answer_text]
                    }
                    examples.append(example)
    return examples

# Preprocess the data
training_examples = prepare_examples_with_stride(training_data, tokenizer)
validation_examples = prepare_examples_with_stride(validation_data, tokenizer)

print(f"Total training examples: {len(training_examples)}")
print(f"Total validation examples: {len(validation_examples)}")


Total training examples: 37130
Total validation examples: 5376


In [7]:
class QADataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        return {
            'input_ids': torch.tensor(example['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(example['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(example['token_type_ids'], dtype=torch.long),
            'start_positions': torch.tensor(example['start_positions'], dtype=torch.long),
            'end_positions': torch.tensor(example['end_positions'], dtype=torch.long)
        }


In [None]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Creating datasets
train_dataset = QADataset(training_examples)
val_dataset = QADataset(validation_examples)

# Creating DataLoaders
batch_size = 8 

train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator
)


In [None]:
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
epochs = 3 
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

In [None]:
def train(model, train_dataloader, val_dataloader, optimizer, scheduler, epochs, device, output_dir):
    total_start_time = time.time()
    scaler = torch.cuda.amp.GradScaler()

    for epoch in range(epochs):
        print(f"\n======== Epoch {epoch + 1}/{epochs} ========")
        print("Training...")

        epoch_start_time = time.time()
        total_loss = 0.0
        model.train()

        for step, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1}", leave=False)):
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}

            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    token_type_ids=batch['token_type_ids'],
                    start_positions=batch['start_positions'],
                    end_positions=batch['end_positions']
                )
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            batch_loss = loss.item()
            total_loss += batch_loss

            # Print batch loss every 500 steps
            if (step + 1) % 500 == 0 or (step + 1) == len(train_dataloader):
                current_lr = scheduler.get_last_lr()[0]
                elapsed = time.time() - epoch_start_time
                print(f"  Step {step + 1}/{len(train_dataloader)} - "
                      f"Batch Loss: {batch_loss:.4f} - "
                      f"Avg Loss: {total_loss / (step + 1):.4f} - "
                      f"LR: {current_lr:.6f} - "
                      f"Elapsed: {elapsed:.2f}s")

        avg_train_loss = total_loss / len(train_dataloader)
        epoch_time = time.time() - epoch_start_time
        print(f"\nEpoch {epoch + 1} finished.")
        print(f"  Average Training Loss: {avg_train_loss:.4f}")
        print(f"  Epoch Training Time: {epoch_time:.2f}s")

        # Validation after each epoch
        print("\nRunning Validation...")
        val_start_time = time.time()
        avg_val_loss = evaluate(model, val_dataloader, device)
        val_time = time.time() - val_start_time
        print(f"  Validation Loss: {avg_val_loss:.4f}")
        print(f"  Validation Time: {val_time:.2f}s")

        # Computing evaluation metrics
        print("\nCalculating Metrics on Validation Set...")
        evaluate_metrics(model, validation_examples, device)

        # Saving model after each epoch
        epoch_output_dir = os.path.join(output_dir, f"epoch_{epoch + 1}")
        if not os.path.exists(epoch_output_dir):
            os.makedirs(epoch_output_dir)
        model.save_pretrained(epoch_output_dir)
        tokenizer.save_pretrained(epoch_output_dir)
        print(f"Model saved to {epoch_output_dir}")

    total_training_time = time.time() - total_start_time
    print(f"\nTraining complete! Total training time: {total_training_time:.2f}s")


In [None]:
def evaluate(model, dataloader, device):
    model.eval()
    total_eval_loss = 0.0

    for step, batch in enumerate(tqdm(dataloader, desc="Evaluating", leave=False)):
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.no_grad():
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                token_type_ids=batch['token_type_ids'],
                start_positions=batch['start_positions'],
                end_positions=batch['end_positions']
            )
            loss = outputs.loss

        total_eval_loss += loss.item()

        if (step + 1) % 100 == 0 or (step + 1) == len(dataloader):
            print(f"  Evaluation Step {step + 1}/{len(dataloader)} - "
                  f"Loss: {loss.item():.4f}")

    avg_eval_loss = total_eval_loss / len(dataloader)
    return avg_eval_loss


In [14]:
def normalize_answer(s):
    """Lowercase, remove punctuation, articles, and extra whitespace."""
    def lower(text):
        return text.lower()
    def remove_punctuation(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    return white_space_fix(remove_articles(remove_punctuation(lower(s))))

def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def compute_f1(a_gold, a_pred):
    gold_tokens = normalize_answer(a_gold).split()
    pred_tokens = normalize_answer(a_pred).split()
    common = set(gold_tokens) & set(pred_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gold_tokens)
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1


In [None]:
def evaluate_metrics(model, examples, device):
    model.eval()
    exact_scores = []
    f1_scores = []

    for idx, example in enumerate(tqdm(examples, desc='Calculating Metrics')):
        # Tokenize with offset mapping and overflow mapping
        tokenized_inputs = tokenizer(
            example['question'],
            example['context'],
            truncation='only_second',
            max_length=512,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding='max_length',
            return_tensors='pt'
        )

        offset_mapping = tokenized_inputs.pop('offset_mapping')
        overflow_mapping = tokenized_inputs.pop('overflow_to_sample_mapping')

        inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}

        with torch.no_grad():
           
            outputs = model(**inputs)

        # Postprocessing to get the best answer from multiple chunks
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        all_answers = []
        for i in range(start_logits.size(0)):
            start_logit = start_logits[i]
            end_logit = end_logits[i]
            input_ids = inputs['input_ids'][i]
            offsets = offset_mapping[i]

            start_indexes = torch.argsort(start_logit, descending=True).tolist()[:20]
            end_indexes = torch.argsort(end_logit, descending=True).tolist()[:20]

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Ensure that the end index is after the start index and the answer is not too long
                    if start_index <= end_index and end_index - start_index + 1 <= 30:
                        # Calculate the score for the answer span
                        answer_score = start_logit[start_index] + end_logit[end_index]
                        # Skip answers that are not in the context (sequence_ids=1)
                        if inputs['token_type_ids'][i][start_index] != 1 or inputs['token_type_ids'][i][end_index] != 1:
                            continue
                        # Extracting answers using offsets
                        start_char = offsets[start_index][0].item()
                        end_char = offsets[end_index][1].item()
                        predicted_answer = example['context'][start_char:end_char]
                        all_answers.append((predicted_answer, answer_score))

        if all_answers:
            best_answer = max(all_answers, key=lambda x: x[1])[0]
        else:
            best_answer = ''

        # Ground truth answers
        ground_truth_answers = example['answers']

        
        exact = max(compute_exact(gt, best_answer) for gt in ground_truth_answers)
        f1 = max(compute_f1(gt, best_answer) for gt in ground_truth_answers)

        exact_scores.append(exact)
        f1_scores.append(f1)

        if (idx + 1) % 500 == 0 or (idx + 1) == len(examples):
            avg_exact = sum(exact_scores) / len(exact_scores)
            avg_f1 = sum(f1_scores) / len(f1_scores)
            print(f"  Processed {idx + 1}/{len(examples)} examples - "
                  f"EM: {avg_exact:.4f} - F1: {avg_f1:.4f}")

    avg_exact = sum(exact_scores) / len(exact_scores)
    avg_f1 = sum(f1_scores) / len(f1_scores)

    print(f"\nFinal Exact Match (EM): {avg_exact:.4f}")
    print(f"Final F1 Score: {avg_f1:.4f}")


In [None]:
def predict_answers(model, questions, contexts, tokenizer, device):
    model.eval()
    answers = []

    for question, context in zip(questions, contexts):
        tokenized_inputs = tokenizer(
            question,
            context,
            truncation='only_second',
            max_length=512,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding='max_length',
            return_tensors='pt'
        )
        offset_mapping = tokenized_inputs.pop('offset_mapping')
        overflow_mapping = tokenized_inputs.pop('overflow_to_sample_mapping')

      
        inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        all_answers = []
        for i in range(start_logits.size(0)):
            start_logit = start_logits[i]
            end_logit = end_logits[i]
            input_ids = inputs['input_ids'][i]
            offsets = offset_mapping[i]
            token_type_ids = inputs['token_type_ids'][i]

            start_indexes = torch.argsort(start_logit, descending=True).tolist()[:20]
            end_indexes = torch.argsort(end_logit, descending=True).tolist()[:20]

            for start_index in start_indexes:
                for end_index in end_indexes:
                   
                    if start_index <= end_index and end_index - start_index + 1 <= 30:
                        
                        if token_type_ids[start_index] != 1 or token_type_ids[end_index] != 1:
                            continue
                        
                        answer_score = start_logit[start_index] + end_logit[end_index]
                        
                        start_char = offsets[start_index][0].item()
                        end_char = offsets[end_index][1].item()
                        predicted_answer = context[start_char:end_char]
                        all_answers.append((predicted_answer, answer_score))
                        
        if all_answers:
            best_answer = max(all_answers, key=lambda x: x[1])[0]
        else:
            best_answer = ''

        answers.append(best_answer)

    return answers

In [17]:
output_dir = './models/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


In [18]:
train(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=epochs,
    device=device,
    output_dir=output_dir
)


  scaler = torch.cuda.amp.GradScaler()



Training...


  with torch.cuda.amp.autocast():
Epoch 1:  11%|█         | 503/4642 [00:24<03:05, 22.30it/s]

  Step 500/4642 - Batch Loss: 2.2808 - Avg Loss: 4.3370 - LR: 0.000011 - Elapsed: 24.06s


Epoch 1:  22%|██▏       | 1004/4642 [00:47<02:43, 22.30it/s]

  Step 1000/4642 - Batch Loss: 1.6811 - Avg Loss: 3.3355 - LR: 0.000022 - Elapsed: 46.85s


Epoch 1:  32%|███▏      | 1502/4642 [01:09<02:24, 21.68it/s]

  Step 1500/4642 - Batch Loss: 1.9018 - Avg Loss: 2.8516 - LR: 0.000030 - Elapsed: 69.65s


Epoch 1:  43%|████▎     | 2003/4642 [01:32<01:58, 22.22it/s]

  Step 2000/4642 - Batch Loss: 1.2167 - Avg Loss: 2.5638 - LR: 0.000029 - Elapsed: 92.40s


Epoch 1:  54%|█████▍    | 2504/4642 [01:55<01:38, 21.68it/s]

  Step 2500/4642 - Batch Loss: 1.4181 - Avg Loss: 2.3677 - LR: 0.000027 - Elapsed: 115.43s


Epoch 1:  65%|██████▍   | 3002/4642 [02:18<01:13, 22.22it/s]

  Step 3000/4642 - Batch Loss: 1.3757 - Avg Loss: 2.2256 - LR: 0.000026 - Elapsed: 138.38s


Epoch 1:  75%|███████▌  | 3503/4642 [02:41<00:51, 22.17it/s]

  Step 3500/4642 - Batch Loss: 1.0670 - Avg Loss: 2.1165 - LR: 0.000025 - Elapsed: 160.91s


Epoch 1:  86%|████████▋ | 4004/4642 [03:03<00:29, 21.72it/s]

  Step 4000/4642 - Batch Loss: 1.0638 - Avg Loss: 2.0319 - LR: 0.000024 - Elapsed: 183.82s


Epoch 1:  97%|█████████▋| 4502/4642 [03:26<00:06, 22.20it/s]

  Step 4500/4642 - Batch Loss: 0.9653 - Avg Loss: 1.9592 - LR: 0.000023 - Elapsed: 206.39s


                                                            

  Step 4642/4642 - Batch Loss: 1.6729 - Avg Loss: 1.9411 - LR: 0.000022 - Elapsed: 212.85s

Epoch 1 finished.
  Average Training Loss: 1.9411
  Epoch Training Time: 212.85s

Running Validation...


Evaluating:  15%|█▌        | 102/672 [00:05<00:31, 17.97it/s]

  Evaluation Step 100/672 - Loss: 0.6900


Evaluating:  30%|███       | 202/672 [00:11<00:26, 17.98it/s]

  Evaluation Step 200/672 - Loss: 0.5966


Evaluating:  45%|████▍     | 302/672 [00:16<00:20, 17.92it/s]

  Evaluation Step 300/672 - Loss: 1.2751


Evaluating:  60%|█████▉    | 402/672 [00:22<00:15, 17.93it/s]

  Evaluation Step 400/672 - Loss: 2.1104


Evaluating:  75%|███████▍  | 502/672 [00:28<00:09, 17.92it/s]

  Evaluation Step 500/672 - Loss: 1.3017


Evaluating:  90%|████████▉ | 602/672 [00:33<00:03, 17.93it/s]

  Evaluation Step 600/672 - Loss: 1.0626


                                                             

  Evaluation Step 672/672 - Loss: 1.6201
  Validation Loss: 1.5099
  Validation Time: 37.50s

Calculating Metrics on Validation Set...


Calculating Metrics:   9%|▉         | 505/5376 [00:13<02:00, 40.53it/s]

  Processed 500/5376 examples - EM: 0.5680 - F1: 0.6654


Calculating Metrics:  19%|█▊        | 1006/5376 [00:26<01:59, 36.49it/s]

  Processed 1000/5376 examples - EM: 0.5670 - F1: 0.6775


Calculating Metrics:  28%|██▊       | 1504/5376 [00:40<01:48, 35.59it/s]

  Processed 1500/5376 examples - EM: 0.5427 - F1: 0.6754


Calculating Metrics:  37%|███▋      | 2005/5376 [00:53<01:21, 41.12it/s]

  Processed 2000/5376 examples - EM: 0.5375 - F1: 0.6807


Calculating Metrics:  47%|████▋     | 2505/5376 [01:05<00:58, 49.10it/s]

  Processed 2500/5376 examples - EM: 0.5296 - F1: 0.6775


Calculating Metrics:  56%|█████▌    | 3008/5376 [01:16<00:48, 48.48it/s]

  Processed 3000/5376 examples - EM: 0.5333 - F1: 0.6812


Calculating Metrics:  65%|██████▌   | 3507/5376 [01:26<00:37, 49.33it/s]

  Processed 3500/5376 examples - EM: 0.5280 - F1: 0.6746


Calculating Metrics:  75%|███████▍  | 4006/5376 [01:36<00:26, 51.11it/s]

  Processed 4000/5376 examples - EM: 0.5305 - F1: 0.6759


Calculating Metrics:  84%|████████▍ | 4508/5376 [01:48<00:22, 39.43it/s]

  Processed 4500/5376 examples - EM: 0.5298 - F1: 0.6737


Calculating Metrics:  93%|█████████▎| 5003/5376 [02:01<00:09, 39.10it/s]

  Processed 5000/5376 examples - EM: 0.5334 - F1: 0.6773


Calculating Metrics: 100%|██████████| 5376/5376 [02:10<00:00, 41.08it/s]


  Processed 5376/5376 examples - EM: 0.5320 - F1: 0.6763

Final Exact Match (EM): 0.5320
Final F1 Score: 0.6763
Model saved to ./models/epoch_1

Training...


Epoch 2:  11%|█         | 504/4642 [00:23<03:11, 21.57it/s]

  Step 500/4642 - Batch Loss: 0.5957 - Avg Loss: 0.8846 - LR: 0.000021 - Elapsed: 23.01s


Epoch 2:  22%|██▏       | 1002/4642 [00:46<02:47, 21.74it/s]

  Step 1000/4642 - Batch Loss: 1.3091 - Avg Loss: 0.9102 - LR: 0.000020 - Elapsed: 46.00s


Epoch 2:  32%|███▏      | 1503/4642 [01:08<02:21, 22.25it/s]

  Step 1500/4642 - Batch Loss: 1.3460 - Avg Loss: 0.9202 - LR: 0.000019 - Elapsed: 68.55s


Epoch 2:  43%|████▎     | 2004/4642 [01:31<01:58, 22.22it/s]

  Step 2000/4642 - Batch Loss: 0.3092 - Avg Loss: 0.9276 - LR: 0.000017 - Elapsed: 91.03s


Epoch 2:  54%|█████▍    | 2502/4642 [01:53<01:36, 22.21it/s]

  Step 2500/4642 - Batch Loss: 1.3931 - Avg Loss: 0.9288 - LR: 0.000016 - Elapsed: 113.79s


Epoch 2:  65%|██████▍   | 3003/4642 [02:16<01:15, 21.69it/s]

  Step 3000/4642 - Batch Loss: 2.1768 - Avg Loss: 0.9294 - LR: 0.000015 - Elapsed: 136.43s


Epoch 2:  75%|███████▌  | 3504/4642 [02:39<00:52, 21.73it/s]

  Step 3500/4642 - Batch Loss: 0.9917 - Avg Loss: 0.9327 - LR: 0.000014 - Elapsed: 159.43s


Epoch 2:  86%|████████▌ | 4002/4642 [03:02<00:29, 21.73it/s]

  Step 4000/4642 - Batch Loss: 1.0818 - Avg Loss: 0.9265 - LR: 0.000013 - Elapsed: 182.48s


Epoch 2:  97%|█████████▋| 4503/4642 [03:25<00:06, 22.23it/s]

  Step 4500/4642 - Batch Loss: 0.4745 - Avg Loss: 0.9215 - LR: 0.000011 - Elapsed: 205.25s


                                                            

  Step 4642/4642 - Batch Loss: 0.3068 - Avg Loss: 0.9211 - LR: 0.000011 - Elapsed: 211.64s

Epoch 2 finished.
  Average Training Loss: 0.9211
  Epoch Training Time: 211.64s

Running Validation...


Evaluating:  15%|█▌        | 102/672 [00:05<00:31, 17.95it/s]

  Evaluation Step 100/672 - Loss: 0.6542


Evaluating:  30%|███       | 202/672 [00:11<00:26, 17.97it/s]

  Evaluation Step 200/672 - Loss: 0.4800


Evaluating:  45%|████▍     | 302/672 [00:16<00:20, 17.92it/s]

  Evaluation Step 300/672 - Loss: 1.5272


Evaluating:  60%|█████▉    | 402/672 [00:22<00:15, 17.95it/s]

  Evaluation Step 400/672 - Loss: 1.5892


Evaluating:  75%|███████▍  | 502/672 [00:27<00:09, 17.93it/s]

  Evaluation Step 500/672 - Loss: 0.9548


Evaluating:  90%|████████▉ | 602/672 [00:33<00:03, 17.92it/s]

  Evaluation Step 600/672 - Loss: 1.5369


                                                             

  Evaluation Step 672/672 - Loss: 1.6122
  Validation Loss: 1.4853
  Validation Time: 37.47s

Calculating Metrics on Validation Set...


Calculating Metrics:   9%|▉         | 507/5376 [00:10<01:44, 46.39it/s]

  Processed 500/5376 examples - EM: 0.5980 - F1: 0.6950


Calculating Metrics:  19%|█▉        | 1008/5376 [00:22<01:37, 44.62it/s]

  Processed 1000/5376 examples - EM: 0.5910 - F1: 0.6999


Calculating Metrics:  28%|██▊       | 1508/5376 [00:33<01:28, 43.58it/s]

  Processed 1500/5376 examples - EM: 0.5613 - F1: 0.6922


Calculating Metrics:  37%|███▋      | 2007/5376 [00:44<01:08, 48.99it/s]

  Processed 2000/5376 examples - EM: 0.5570 - F1: 0.6974


Calculating Metrics:  47%|████▋     | 2507/5376 [00:55<00:58, 49.22it/s]

  Processed 2500/5376 examples - EM: 0.5532 - F1: 0.6937


Calculating Metrics:  56%|█████▌    | 3005/5376 [01:05<00:48, 48.83it/s]

  Processed 3000/5376 examples - EM: 0.5537 - F1: 0.6961


Calculating Metrics:  65%|██████▌   | 3509/5376 [01:16<00:37, 49.16it/s]

  Processed 3500/5376 examples - EM: 0.5474 - F1: 0.6885


Calculating Metrics:  74%|███████▍  | 4005/5376 [01:26<00:27, 49.76it/s]

  Processed 4000/5376 examples - EM: 0.5465 - F1: 0.6887


Calculating Metrics:  84%|████████▍ | 4506/5376 [01:36<00:17, 49.33it/s]

  Processed 4500/5376 examples - EM: 0.5458 - F1: 0.6861


Calculating Metrics:  93%|█████████▎| 5010/5376 [01:47<00:07, 48.84it/s]

  Processed 5000/5376 examples - EM: 0.5482 - F1: 0.6889


Calculating Metrics: 100%|██████████| 5376/5376 [01:55<00:00, 46.59it/s]


  Processed 5376/5376 examples - EM: 0.5452 - F1: 0.6867

Final Exact Match (EM): 0.5452
Final F1 Score: 0.6867
Model saved to ./models/epoch_2

Training...


Epoch 3:  11%|█         | 504/4642 [00:22<03:04, 22.41it/s]

  Step 500/4642 - Batch Loss: 0.1564 - Avg Loss: 0.5050 - LR: 0.000010 - Elapsed: 22.34s


Epoch 3:  22%|██▏       | 1002/4642 [00:44<02:42, 22.36it/s]

  Step 1000/4642 - Batch Loss: 0.1294 - Avg Loss: 0.4985 - LR: 0.000009 - Elapsed: 44.70s


Epoch 3:  32%|███▏      | 1503/4642 [01:07<02:20, 22.34it/s]

  Step 1500/4642 - Batch Loss: 1.8543 - Avg Loss: 0.4968 - LR: 0.000008 - Elapsed: 67.06s


Epoch 3:  43%|████▎     | 2004/4642 [01:29<01:57, 22.36it/s]

  Step 2000/4642 - Batch Loss: 0.4932 - Avg Loss: 0.4983 - LR: 0.000006 - Elapsed: 89.71s


Epoch 3:  54%|█████▍    | 2502/4642 [01:52<01:35, 22.30it/s]

  Step 2500/4642 - Batch Loss: 0.3011 - Avg Loss: 0.4967 - LR: 0.000005 - Elapsed: 112.10s


Epoch 3:  65%|██████▍   | 3003/4642 [02:14<01:15, 21.84it/s]

  Step 3000/4642 - Batch Loss: 0.4315 - Avg Loss: 0.4966 - LR: 0.000004 - Elapsed: 134.58s


Epoch 3:  75%|███████▌  | 3504/4642 [02:37<00:51, 21.96it/s]

  Step 3500/4642 - Batch Loss: 0.3354 - Avg Loss: 0.4949 - LR: 0.000003 - Elapsed: 157.43s


Epoch 3:  86%|████████▌ | 4002/4642 [03:00<00:28, 22.32it/s]

  Step 4000/4642 - Batch Loss: 0.1581 - Avg Loss: 0.4898 - LR: 0.000002 - Elapsed: 180.10s


Epoch 3:  97%|█████████▋| 4503/4642 [03:22<00:06, 22.32it/s]

  Step 4500/4642 - Batch Loss: 0.3077 - Avg Loss: 0.4900 - LR: 0.000000 - Elapsed: 202.59s


                                                            

  Step 4642/4642 - Batch Loss: 0.7115 - Avg Loss: 0.4880 - LR: 0.000000 - Elapsed: 209.00s

Epoch 3 finished.
  Average Training Loss: 0.4880
  Epoch Training Time: 209.00s

Running Validation...


Evaluating:  15%|█▌        | 102/672 [00:05<00:31, 17.96it/s]

  Evaluation Step 100/672 - Loss: 0.9461


Evaluating:  30%|███       | 202/672 [00:11<00:26, 17.94it/s]

  Evaluation Step 200/672 - Loss: 0.4653


Evaluating:  45%|████▍     | 302/672 [00:16<00:20, 17.92it/s]

  Evaluation Step 300/672 - Loss: 1.5343


Evaluating:  60%|█████▉    | 402/672 [00:22<00:15, 17.91it/s]

  Evaluation Step 400/672 - Loss: 1.5408


Evaluating:  75%|███████▍  | 502/672 [00:27<00:09, 17.97it/s]

  Evaluation Step 500/672 - Loss: 1.0079


Evaluating:  90%|████████▉ | 602/672 [00:33<00:03, 17.93it/s]

  Evaluation Step 600/672 - Loss: 1.7933


                                                             

  Evaluation Step 672/672 - Loss: 1.5950
  Validation Loss: 1.7295
  Validation Time: 37.49s

Calculating Metrics on Validation Set...


Calculating Metrics:   9%|▉         | 509/5376 [00:10<01:40, 48.26it/s]

  Processed 500/5376 examples - EM: 0.5840 - F1: 0.6828


Calculating Metrics:  19%|█▉        | 1009/5376 [00:21<01:37, 44.72it/s]

  Processed 1000/5376 examples - EM: 0.5910 - F1: 0.7004


Calculating Metrics:  28%|██▊       | 1505/5376 [00:33<01:28, 43.81it/s]

  Processed 1500/5376 examples - EM: 0.5607 - F1: 0.6933


Calculating Metrics:  37%|███▋      | 2009/5376 [00:44<01:09, 48.24it/s]

  Processed 2000/5376 examples - EM: 0.5570 - F1: 0.6979


Calculating Metrics:  47%|████▋     | 2509/5376 [00:55<00:58, 48.69it/s]

  Processed 2500/5376 examples - EM: 0.5544 - F1: 0.6969


Calculating Metrics:  56%|█████▌    | 3005/5376 [01:05<00:47, 49.80it/s]

  Processed 3000/5376 examples - EM: 0.5547 - F1: 0.7004


Calculating Metrics:  65%|██████▌   | 3505/5376 [01:16<00:36, 51.60it/s]

  Processed 3500/5376 examples - EM: 0.5506 - F1: 0.6952


Calculating Metrics:  74%|███████▍  | 4004/5376 [01:26<00:27, 49.27it/s]

  Processed 4000/5376 examples - EM: 0.5505 - F1: 0.6974


Calculating Metrics:  84%|████████▍ | 4509/5376 [01:37<00:18, 48.08it/s]

  Processed 4500/5376 examples - EM: 0.5502 - F1: 0.6957


Calculating Metrics:  93%|█████████▎| 5010/5376 [01:48<00:07, 47.67it/s]

  Processed 5000/5376 examples - EM: 0.5526 - F1: 0.6977


Calculating Metrics: 100%|██████████| 5376/5376 [01:56<00:00, 46.32it/s]


  Processed 5376/5376 examples - EM: 0.5506 - F1: 0.6962

Final Exact Match (EM): 0.5506
Final F1 Score: 0.6962
Model saved to ./models/epoch_3

Training complete! Total training time: 1110.87s


In [None]:
# Trial 
questions = ["What is the capital of France?"]
contexts = ["France, officially the French Republic, is a country whose territory consists of metropolitan France in Western Europe and several overseas regions and territories. The capital is Paris."]
predicted_answers = predict_answers(model, questions, contexts, tokenizer, device)

for question, answer in zip(questions, predicted_answers):
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    
    

Question: What is the capital of France?
Answer: Paris
