In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Brief Overview**

I followed the steps below to fine-tune a DistilBERT model on the NewsQA dataset:

1. Loading and exploring the dataset.

2. Initializing the DistilBERT model and tokenizer.

3. Preprocessing the dataset to prepare inputs for the model.

4. Configuring lightweight training parameters for efficient fine-tuning.

5. Training the model on the dataset and saving the fine-tuned version.

6. Loading the trained model and predicting answers for given questions and contexts.

7. Providing a practical example of using the fine-tuned model for real-world question-answering.

In [1]:
pip install -q transformers datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
bigframes 2.12.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.31.0, but you have google-cloud-bigquery 3.25.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=1

In [4]:
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding  
)
import torch

2025-10-22 04:01:47.961050: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761105708.186430      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761105708.253318      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## **Load the Model and Tokenizer**

In [5]:
# Hub identifier of the base checkpoint that gets fine-tuned below.
model_name = "distilbert/distilbert-base-uncased" 

# Tokenizer and question-answering model are loaded from the same checkpoint.
# The loading log reports `qa_outputs.*` as newly initialized, i.e. the QA head
# carries no pretrained weights until the model is trained.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **Loading the NewsQA dataset** 

In [6]:
from datasets import load_dataset

# Number of examples kept from each split; deliberately tiny so the demo trains quickly.
N_TRAIN_EXAMPLES = 200
N_EVAL_EXAMPLES = 40

# Load the dataset once and reuse it for both the split lookup and the subsets,
# instead of resolving the same repository three times.
ds_all = load_dataset("lucadiliello/newsqa")

# The name of the held-out split is not assumed: take the first one that exists.
eval_name = "validation" if "validation" in ds_all else ("dev" if "dev" in ds_all else "test")

# Keep the first N rows of each split; min() guards against splits shorter than N.
train_ds = ds_all["train"].select(range(min(N_TRAIN_EXAMPLES, len(ds_all["train"]))))
eval_ds = ds_all[eval_name].select(range(min(N_EVAL_EXAMPLES, len(ds_all[eval_name]))))

print(train_ds)
print(eval_ds)


Dataset({
    features: ['context', 'question', 'answers', 'key', 'labels'],
    num_rows: 200
})
Dataset({
    features: ['context', 'question', 'answers', 'key', 'labels'],
    num_rows: 40
})


## **Preprocessing**

In [12]:
def _first_int(x):
    
    if isinstance(x, (int, float)):
        return int(x)
    if isinstance(x, (list, tuple)):
        for v in x:
            if isinstance(v, (int, float)):
                return int(v)
            try:
                return int(v)
            except Exception:
                continue
        return None
    try:
        return int(x)
    except Exception:
        return None

def preprocess_newsqa(examples):
    """Tokenize a batch of NewsQA examples and attach answer-span token labels.

    Uses the module-level ``tokenizer`` and the ``_first_int`` helper.

    Args:
        examples: Batch dict as passed by ``Dataset.map(batched=True)``. Reads
            ``"question"`` and ``"context"`` and, when present, ``"labels"``
            (character offsets) and ``"answers"`` (answer strings).

    Returns:
        The tokenizer output, without ``offset_mapping``, extended with
        ``start_positions`` and ``end_positions`` (one token index each per
        example). Examples whose answer cannot be located inside the encoded
        (possibly truncated) context are labelled with the ``[CLS]`` index.
    """
    questions = [q.strip() for q in examples["question"]]
    # Contexts are passed through unmodified: the character offsets stored in
    # "labels" refer to the original text, so stripping leading whitespace
    # would shift every answer span.
    contexts = list(examples["context"])

    # NOTE(review): `stride` presumably only matters when overflowing windows
    # are requested, which is not the case here; each example yields a single
    # feature truncated to 256 tokens. Confirm before relying on it.
    enc = tokenizer(
        questions,
        contexts,
        max_length=256,
        truncation="only_second",
        stride=128,
        return_offsets_mapping=True,
        padding="max_length",
    )

    start_positions, end_positions = [], []

    for i, offsets in enumerate(enc["offset_mapping"]):
        ctx = contexts[i]
        start_char, end_char = None, None

        # Preferred source: explicit character offsets from the first label.
        if "labels" in examples:
            labs = examples["labels"][i]
            if labs and isinstance(labs, list):
                lab = labs[0]
                if isinstance(lab, dict):
                    s = _first_int(lab.get("start"))
                    e = _first_int(lab.get("end"))
                    ln = _first_int(lab.get("len"))
                    if s is not None and (e is not None or ln is not None):
                        start_char = s
                        # Without an explicit end, derive it from start + length.
                        end_char = e if e is not None else s + ln

        # Fallback: case-insensitive search for the first answer string.
        if (start_char is None or end_char is None) and "answers" in examples:
            ans_list = examples["answers"][i]
            if isinstance(ans_list, list) and len(ans_list) > 0 and isinstance(ans_list[0], str):
                ans_text = ans_list[0]
                idx = ctx.lower().find(ans_text.lower())
                if idx != -1:
                    start_char = idx
                    end_char = idx + len(ans_text)

        input_ids = enc["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id) if tokenizer.cls_token_id in input_ids else 0
        if start_char is None or end_char is None:
            # No usable answer: point both labels at [CLS].
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Offsets of question tokens are relative to the question string and
        # special/padding tokens carry (0, 0), so only context tokens
        # (sequence id 1) may be compared against context character offsets.
        seq_ids = enc.sequence_ids(i)

        start_tok, end_tok = None, None
        for t, (s, e) in enumerate(offsets):
            if seq_ids[t] != 1:
                continue
            # Both bounds are inclusive, so an offset sitting exactly on a
            # token boundary still matches; the latest start match wins.
            if s <= start_char <= e:
                start_tok = t
            if s <= end_char <= e:
                end_tok = t
                break

        if start_tok is None or end_tok is None or end_tok < start_tok:
            # Answer was truncated away or the offsets are inconsistent.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_positions.append(start_tok)
            end_positions.append(end_tok)

    enc["start_positions"] = start_positions
    enc["end_positions"] = end_positions
    # Offsets were only needed to compute the labels; the model does not accept them.
    enc.pop("offset_mapping")
    return enc


In [13]:
# Run the same preprocessing over both splits. The raw columns are dropped so
# that only what `preprocess_newsqa` returns is kept.
tokenized_train, tokenized_eval = (
    split.map(
        preprocess_newsqa,
        batched=True,
        batch_size=32,
        remove_columns=split.column_names,
    )
    for split in (train_ds, eval_ds)
)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [14]:
pip -q install transformers[torch]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## **Configuring Training Parameters for Fast and Lightweight Model Training**

In [15]:
from transformers import TrainingArguments
import transformers as tfm
from packaging import version

# `evaluation_strategy` was renamed to `eval_strategy`; choose the keyword that
# the installed transformers release expects.
eval_key = "eval_strategy" if version.parse(tfm.__version__) >= version.parse("4.46") else "evaluation_strategy"

# Set to True to force training on CPU even when a GPU is available.
USE_CPU = False 

# `no_cuda` is deprecated in favour of `use_cpu`. Use whichever field this
# release of TrainingArguments defines, falling back to the legacy name.
cpu_key = "use_cpu" if "use_cpu" in getattr(TrainingArguments, "__dataclass_fields__", {}) else "no_cuda"

# Lightweight configuration: a single epoch, small batches, no checkpoints,
# no external logging integration, and no evaluation during training.
training_args = TrainingArguments(
    output_dir="./newsqa-distilbert-results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="no",
    report_to="none",
    **{eval_key: "no", cpu_key: USE_CPU},
)


## **Initialize and Train**

In [16]:
from transformers import Trainer, DataCollatorWithPadding

# Collator that assembles individual features into batches. The features were
# already padded to a fixed length during preprocessing (padding="max_length").
data_collator = DataCollatorWithPadding(tokenizer)

# Wire model, arguments and tokenized splits together. The eval split is
# supplied, but the evaluation strategy configured above is "no", so it is not
# used while training.
trainer = Trainer(
    model=model,                         
    args=training_args,
    train_dataset=tokenized_train,      
    eval_dataset=tokenized_eval,        
    data_collator=data_collator
)


In [17]:
# Fine-tune the model on the tokenized training subset.
trainer.train()  

# Save the fine-tuned model to the directory the pipeline cells load from.
trainer.save_model("./newsqa-distilbert-model") 

# NOTE(review): this appears to write the same weights to the same directory as
# trainer.save_model above; confirm whether both calls are needed.
model.save_pretrained("./newsqa-distilbert-model")          
# The tokenizer is stored alongside the weights so that one path loads both.
tokenizer.save_pretrained("./newsqa-distilbert-model")    

Step,Training Loss
10,5.4047
20,5.1582
30,4.8821
40,4.6853
50,4.4334


('./newsqa-distilbert-model/tokenizer_config.json',
 './newsqa-distilbert-model/special_tokens_map.json',
 './newsqa-distilbert-model/vocab.txt',
 './newsqa-distilbert-model/added_tokens.json',
 './newsqa-distilbert-model/tokenizer.json')

## **Example usage**

In [18]:
from transformers import pipeline

# Build a question-answering pipeline from the directory written by the
# training cell (model and tokenizer share that path).
qa = pipeline(
    "question-answering",
    model="./newsqa-distilbert-model",
    tokenizer="./newsqa-distilbert-model",
)

# NOTE(review): the context is a truncated placeholder (it ends in "...") and
# does not mention Pandher or a sentence date, so the printed span cannot be
# the real answer.
context = "NEW DELHI, India (CNN) -- A high court in northern India..."
question = "When was Pandher sentenced to death?"
print(qa(question=question, context=context))


Device set to use cpu


{'score': 0.00982893817126751, 'start': 11, 'end': 50, 'answer': 'India (CNN) -- A high court in northern'}


In [19]:
from transformers import pipeline
# Rebuild the pipeline for evaluation; this replaces the `qa` object created in
# the previous cell. max_seq_len and doc_stride repeat the max_length=256 and
# stride=128 values passed to the tokenizer during preprocessing.
qa = pipeline(
    "question-answering",
    model="./newsqa-distilbert-model",
    tokenizer="./newsqa-distilbert-model",
    max_seq_len=256,
    doc_stride=128,
    handle_impossible_answer=False,
)


Device set to use cpu


In [20]:
# Predicted answer text for every evaluation example, in dataset order.
pred_texts = [
    qa(question=example["question"], context=example["context"])["answer"]
    for example in eval_ds
]

# Gold answers aligned with `pred_texts`: duplicates removed with order kept,
# and non-string or blank entries dropped.
gold_texts_list = [
    [
        gold
        for gold in dict.fromkeys(
            example["answers"] if isinstance(example.get("answers"), list) else []
        )
        if isinstance(gold, str) and gold.strip()
    ]
    for example in eval_ds
]


In [21]:
import re, string
def normalize_text(s):
    """Canonicalize an answer string before EM/F1 comparison.

    Steps, in order: lowercase, delete ASCII punctuation, replace the
    standalone words "a", "an" and "the" with a space, then collapse all
    whitespace runs into single spaces (also trimming both ends).
    """
    lowered = s.lower()
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct)
    return " ".join(without_articles.split())

def f1_single(prediction, ground_truth):
    """Token-level F1 between one prediction and one reference answer.

    Both strings are passed through ``normalize_text`` and split on whitespace.

    Returns:
        1.0 when both token lists are empty, 0.0 when exactly one is empty or
        no token is shared, otherwise the harmonic mean of precision and
        recall computed over the multiset overlap of the tokens.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    gold_tokens = normalize_text(ground_truth).split()
    if not pred_tokens and not gold_tokens:
        return 1.0
    if not pred_tokens or not gold_tokens:
        return 0.0

    # Counter intersection keeps min(count_in_pred, count_in_gold) per token,
    # i.e. the number of shared tokens with repetitions respected.
    num_same = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def exact_match_single(prediction, ground_truth):
    """Return 1.0 if both strings are equal after ``normalize_text``, else 0.0."""
    is_match = normalize_text(prediction) == normalize_text(ground_truth)
    return float(is_match)

def best_over_gold(metric_fn, prediction, gold_list):
    """Score ``prediction`` against every reference and keep the highest score.

    When there are no references, the prediction is scored against the empty
    string instead.
    """
    if gold_list:
        scores = [metric_fn(prediction, gold) for gold in gold_list]
        return max(scores)
    return metric_fn(prediction, "")

# Per-example scores: each prediction is compared with its own gold answers
# and the best-matching gold answer counts.
em_scores = [
    best_over_gold(exact_match_single, pred, golds)
    for pred, golds in zip(pred_texts, gold_texts_list)
]
f1_scores = [
    best_over_gold(f1_single, pred, golds)
    for pred, golds in zip(pred_texts, gold_texts_list)
]

# Averages over the evaluation subset; 0.0 when there is nothing to average.
em = 0.0 if not em_scores else sum(em_scores) / len(em_scores)
f1 = 0.0 if not f1_scores else sum(f1_scores) / len(f1_scores)
print({"EM": em, "F1": f1})


{'EM': 0.075, 'F1': 0.13916666666666666}
