In [34]:
!pip install transformers datasets evaluate -q


In [63]:
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer, AutoModelForQuestionAnswering,
                          TrainingArguments, Trainer, default_data_collator)
import evaluate
import torch
import numpy as np
from langdetect import detect
from tqdm import tqdm


In [64]:
raw_ds = load_dataset("ai4bharat/indicqa", split="train")

examples = []
for row in tqdm(raw_ds):
    paragraphs = row['data']['paragraphs']
    for para in paragraphs:
        context = para['context']
        for qa in para['qas']:
            if not qa['answers'] or qa['answers'][0]['text'] == "":
                continue
            ans = qa['answers'][0]
            if ans['answer_start'] is None:
                continue
            try:
                lang = detect(qa['question'])  # detect from question
            except:
                lang = "unknown"
            examples.append({
                'context': context,
                'question': qa['question'],
                'answer_text': ans['text'],
                'answer_start': ans['answer_start'],
                'id': qa['id'],
                'category': qa.get('category', 'SHORT'),
                'language': lang
            })

flat_ds = Dataset.from_list(examples)


100%|██████████| 2759/2759 [00:11<00:00, 248.84it/s]


In [65]:
def convert_to_squad(example):
    return {
        "context": example["context"],
        "question": example["question"],
        "answers": {
            "text": [example["answer_text"]],
            "answer_start": [example["answer_start"]]
        },
        "id": str(example["id"])
    }

def preprocess_qa(example, tokenizer):
    inputs = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True
    )
    offset_mapping = inputs.pop("offset_mapping")
    start_char = example["answers"]["answer_start"][0]
    end_char = start_char + len(example["answers"]["text"][0])
    start_token = end_token = 0
    for idx, (start, end) in enumerate(offset_mapping):
        if start <= start_char < end:
            start_token = idx
        if start < end_char <= end:
            end_token = idx
            break
    inputs["start_positions"] = start_token
    inputs["end_positions"] = end_token
    return inputs

squad_metric = evaluate.load("squad")

In [66]:
def compute_predictions(dataset, preds, tokenizer, tokenized_dataset):
    start_logits, end_logits = preds
    predictions = []
    references = []

    for i in range(len(start_logits)):
        try:
            start = torch.argmax(torch.tensor(start_logits[i])).item()
            end = torch.argmax(torch.tensor(end_logits[i])).item() + 1
            input_ids = tokenized_dataset[i]["input_ids"][start:end]
            pred_text = tokenizer.decode(input_ids, skip_special_tokens=True).strip()

            # Reference
            ref = {
                "id": str(dataset[i]["id"]),
                "answers": {
                    "text": dataset[i]["answers"]["text"],
                    "answer_start": dataset[i]["answers"]["answer_start"]
                }
            }

            # Prediction
            pred = {
                "id": str(dataset[i]["id"]),
                "prediction_text": pred_text
            }

            # Only add if prediction is non-empty
            if pred_text != "":
                predictions.append(pred)
                references.append(ref)

        except Exception as e:
            continue  # skip failed item

    # Now pass to metric evaluator
    if len(predictions) == 0:
        return {"exact_match": 0.0, "f1": 0.0}

    return squad_metric.compute(predictions=predictions, references=references)


In [68]:
model_language_list = [
    {"model": "bert-base-multilingual-cased", "lang": "hi"},
    {"model": "ai4bharat/indic-bert", "lang": "kn"},
    {"model": "ai4bharat/indic-bert", "lang": "ta"},
]

results = []


In [69]:
for entry in model_language_list:
    model_name = entry["model"]
    lang = entry["lang"]

    print(f"\n Training {model_name} on language: {lang}")

    try:
        # Filter language data
        lang_ds = flat_ds.filter(lambda x: x["language"] == lang).select(range(100))
        lang_ds_squad = lang_ds.map(convert_to_squad)

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        tokenized_dataset = lang_ds_squad.map(
            lambda x: preprocess_qa(x, tokenizer),
            remove_columns=lang_ds_squad.column_names
        )

        training_args = TrainingArguments(
            output_dir=f"./results/{model_name.replace('/', '_')}_{lang}",
            learning_rate=2e-5,
            per_device_train_batch_size=4,
            num_train_epochs=1,
            weight_decay=0.01,
            logging_steps=10,
            save_strategy="no",
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            tokenizer=tokenizer,
            data_collator=default_data_collator
        )

        trainer.train()
        preds = trainer.predict(tokenized_dataset)
        metrics = compute_predictions(lang_ds_squad, preds.predictions, tokenizer, tokenized_dataset)

        results.append({
            "Model": model_name,
            "Language": lang,
            "Exact Match": round(metrics["exact_match"], 2),
            "F1": round(metrics["f1"], 2)
        })

    except Exception as e:
        print(f" Error for {model_name} with language {lang}: {e}")


 Training bert-base-multilingual-cased on language: hi


Filter:   0%|          | 0/13295 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
10,5.5976
20,4.8649



 Training ai4bharat/indic-bert on language: kn


Filter:   0%|          | 0/13295 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,5.8812
20,5.7692



 Training ai4bharat/indic-bert on language: ta


Filter:   0%|          | 0/13295 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,5.7273
20,5.5705


In [70]:
 #Load dataset
en_ds = load_dataset("squad", split="train[:100]")

# Tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
# Preprocess function
def preprocess_qa(example):
    inputs = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_char = example["answers"]["answer_start"][0]
    end_char = start_char + len(example["answers"]["text"][0])
    start_token = end_token = 0

    for idx, (start, end) in enumerate(offset_mapping):
        if start <= start_char < end:
            start_token = idx
        if start < end_char <= end:
            end_token = idx
            break

    inputs["start_positions"] = start_token
    inputs["end_positions"] = end_token
    return inputs

In [72]:
# Tokenize
tokenized_en = en_ds.map(preprocess_qa, remove_columns=en_ds.column_names)

# Training setup
training_args = TrainingArguments(
    output_dir="./results_en",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_en,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

# Train
trainer.train()

# Evaluate
squad_metric = evaluate.load("squad")

  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
10,5.8581
20,5.6105


In [73]:
def compute_predictions_squad(dataset, preds, tokenizer, tokenized_dataset):
    start_logits, end_logits = preds
    predictions = []
    references = []

    for i in range(len(start_logits)):
        start = torch.argmax(torch.tensor(start_logits[i])).item()
        end = torch.argmax(torch.tensor(end_logits[i])).item() + 1
        input_ids = tokenized_dataset[i]["input_ids"][start:end]
        pred_text = tokenizer.decode(input_ids, skip_special_tokens=True).strip()

        if pred_text == "":
            continue

        predictions.append({
            "id": dataset[i]["id"],
            "prediction_text": pred_text
        })

        references.append({
            "id": dataset[i]["id"],
            "answers": dataset[i]["answers"]
        })

    if len(predictions) == 0:
        return {"exact_match": 0.0, "f1": 0.0}

    return squad_metric.compute(predictions=predictions, references=references)

In [74]:

preds = trainer.predict(tokenized_en)
metrics = compute_predictions_squad(en_ds, preds.predictions, tokenizer, tokenized_en)

# Append to your results
results.append({
    "Model": model_name,
    "Language": "en",
    "Exact Match": round(metrics["exact_match"], 2),
    "F1": round(metrics["f1"], 2)
})

In [75]:
# Display updated results
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Model,Language,Exact Match,F1
0,bert-base-multilingual-cased,hi,0.0,0.0
1,ai4bharat/indic-bert,kn,0.0,0.0
2,ai4bharat/indic-bert,ta,0.0,0.0
3,bert-base-uncased,en,3.7,9.04
