In [8]:
import pandas as pd
import torch
from transformers import AutoTokenizer, XLMRobertaForQuestionAnswering, AutoModelForQuestionAnswering
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import json
from sklearn.metrics import f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Preprocess the IndicQA Dataset
def preprocess_indicqa_for_extractive(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    
    data = []
    for entry in raw_data["data"]:
        for paragraph in entry["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                answers = qa["answers"]
                if answers and answers[0]["text"]:  # If there's a valid answer
                    answer = answers[0]["text"]
                    answer_start = answers[0]["answer_start"]
                    answer_end = answer_start + len(answer)
                    data.append({"question": question, "context": context, "answer": answer, 
                                 "answer_start": answer_start, "answer_end": answer_end})
    return pd.DataFrame(data)

# Load and preprocess the dataset
data_file = "mergedQuAD-hi-train.json"  # Replace with your actual JSON file path
df = preprocess_indicqa_for_extractive(data_file)

df

Unnamed: 0,question,context,answer,answer_start,answer_end
0,एक्सावेटा की एक विशेषता क्या है?\n,संघ के रूप में इलाज किया जा सकता है जो प्रोटिस...,मोनोफाईलेटिक,209,221
1,किस अम्ल में सबसे अधिक पशु चर्बी प्रतिशत होता है?,साँचा:Chembox ThermalConductivityसाँचा:Chembox...,स्टीयरिक अम्ल,132,145
2,चीन में 6वी जनसंख्या गणना कब आयोजित की गई?,चीन की जनसांख्यिकीय अपेक्षाकृत छोटे युवा घटक क...,1 नवंबर 2010,441,453
3,चीन की जनसंख्या वृद्धि दर कितनी है?,चीन की जनसांख्यिकीय अपेक्षाकृत छोटे युवा घटक क...,0.59%,389,394
4,ब्रिटनी क्या चाहती है?,अपने रिकॉर्डिंग की जीवन-वृत्ति के शुभारंभ के त...,दीवार की पोस्टर में दीखना,1381,1406
...,...,...,...,...,...
6019,संधात्मक गणतंत्र किस तरह की व्यवस्था है?\n,राष्ट्रपति प्रणाली के साथ सरकार का रूप लोकतांत...,राष्ट्रपति प्रणाली,0,18
6020,किस देश में विधायिका को विधि का स्रोत कहा जाता...,राष्ट्रपति प्रणाली के साथ सरकार का रूप लोकतांत...,ब्राजील,435,442
6021,उड़ने वाली वस्तु एक प्रमुख निर्यात क्या था?,आईएमएफ डेटा के अनुसार 2017 में ब्राजील को 77व...,विमान,910,915
6022,विभाजन के बाद शेष कौन से शासित प्रदेश केंद्र श...,भारत ने 1947 में अंग्रेजों से स्वतंत्रता प्राप...,दमन और दीव,377,387


In [2]:
!conda env list

# conda environments:
#
base                     /data2/home/dileeppatel/miniconda3
DataAnalytics            /data2/home/dileeppatel/miniconda3/envs/DataAnalytics
dlnlp                 *  /data2/home/dileeppatel/miniconda3/envs/dlnlp
trans_env                /data2/home/dileeppatel/miniconda3/envs/trans_env



In [9]:

#Step 2: Split into Train, Validation, and Test Sets
train_df = df.sample(frac=0.8, random_state=42).reset_index(drop=True)
remaining_df = df.drop(train_df.index).reset_index(drop=True)
valid_df = remaining_df.sample(frac=0.5, random_state=42).reset_index(drop=True)
test_df = remaining_df.drop(valid_df.index).reset_index(drop=True)


In [10]:

class QAExtractiveDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        question = row["question"]
        context = row["context"]
        answer = row["answer"]
        answer_start = row["answer_start"]
        answer_end = row["answer_end"]

        # Tokenize question and context together
        inputs = self.tokenizer(
            question, context, 
            max_length=self.max_len, 
            truncation=True, 
            padding="max_length", 
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        # Find start and end token positions
        offset_mapping = inputs["offset_mapping"].squeeze()
        start_token_idx = end_token_idx = 0
        for i, (start, end) in enumerate(offset_mapping):
            if start <= answer_start < end:
                start_token_idx = i
            if start < answer_end <= end:
                end_token_idx = i

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "start_positions": torch.tensor(start_token_idx),
            "end_positions": torch.tensor(end_token_idx),
            "answer": answer,  # Include answer for metrics
        }


# model_name = "xlm-roberta-base"  
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = XLMRobertaForQuestionAnswering.from_pretrained(model_name)

# model_name = "google/muril-base-cased"  

# model_name = "ai4bharat/IndicBERTv2-MLM-Sam-TLM"

#model_name = "ai4bharat/indicbart"

model_name = "google-bert/bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Prepare DataLoaders
train_dataset = QAExtractiveDataset(train_df, tokenizer)
valid_dataset = QAExtractiveDataset(valid_df, tokenizer)
test_dataset = QAExtractiveDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Step 5: Define Training and Validation Loops

model.to(device)
optimizer = AdamW(model.parameters(), lr=3e-5)

def train_epoch(model, dataloader, optimizer):
    model.train()
    epoch_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, 
                        start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def validate_epoch(model, dataloader):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_positions"].to(device)
            end_positions = batch["end_positions"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, 
                            start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)
 
# Step 7: Train and Evaluate the Model
for epoch in range(10):
    print(f"Epoch {epoch + 1}")
    train_loss = train_epoch(model, train_loader, optimizer)
    valid_loss = validate_epoch(model, valid_loader)
    print(f"Train Loss: {train_loss:.4f} | Validation Loss: {valid_loss:.4f}")


Epoch 1


Training: 100%|███████████████████████████████| 151/151 [02:27<00:00,  1.02it/s]
Validation: 100%|███████████████████████████████| 19/19 [00:09<00:00,  1.93it/s]


Train Loss: 3.3152 | Validation Loss: 2.0066
Epoch 2


Training:  21%|██████▌                         | 31/151 [00:30<01:57,  1.02it/s]

In [7]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sat Nov 30 14:53:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:07:00.0 Off |                    0 |
| N/A   28C    P0              59W / 400W |  12610MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-40GB          Off | 00000000:0A:00.0 Off |  

In [13]:
model.save_pretrained("indicbert-v2")
tokenizer.save_pretrained("indicbert-v2")

('indicbert-v2/tokenizer_config.json',
 'indicbert-v2/special_tokens_map.json',
 'indicbert-v2/tokenizer.json')

## Inference

In [74]:
import pandas as pd
import torch
from transformers import AutoTokenizer, XLMRobertaForQuestionAnswering, AutoModelForQuestionAnswering
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import json
from sklearn.metrics import f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("roberta")
model = XLMRobertaForQuestionAnswering.from_pretrained("roberta")
model.to(device)

print("model loaded successfully")


model loaded successfully


In [None]:
def batch_inference(model, tokenizer, dataloader):
   
    predictions = []
    references = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Running Inference"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            start_indices = torch.argmax(start_logits, dim=1)
            end_indices = torch.argmax(end_logits, dim=1)

            for i in range(len(input_ids)):
                pred_tokens = input_ids[i][start_indices[i]:end_indices[i] + 1]
                predicted_answer = tokenizer.decode(pred_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                predictions.append(predicted_answer)
                references.append(batch["answer"][i])  # Ground truth answers

    return predictions, references


def evaluate_with_jaccard(predictions, references):
   
    # Calculate Exact Match
    em = sum([1 if pred.strip() == ref.strip() else 0 for pred, ref in zip(predictions, references)]) / len(references)

    # Calculate F1 Score
    def compute_f1(pred, ref):
        pred_tokens = set(pred.split())
        ref_tokens = set(ref.split())
        common_tokens = pred_tokens & ref_tokens
        if not common_tokens:
            return 0
        precision = len(common_tokens) / len(pred_tokens)
        recall = len(common_tokens) / len(ref_tokens)
        return 2 * (precision * recall) / (precision + recall)

    f1 = sum([compute_f1(pred, ref) for pred, ref in zip(predictions, references)]) / len(references)

    # Calculate Jaccard Score
    def compute_jaccard(pred, ref):
        pred_tokens = set(pred.split())
        ref_tokens = set(ref.split())
        intersection = len(pred_tokens & ref_tokens)
        union = len(pred_tokens | ref_tokens)
        if union == 0:
            return 0
        return intersection / union

    jaccard = sum([compute_jaccard(pred, ref) for pred, ref in zip(predictions, references)]) / len(references)

    return {"Exact Match": em, "F1 Score": f1, "Jaccard Score": jaccard}


In [None]:
df_test1 = preprocess_indicqa_for_extractive("mergedQuAD-hi-test.json")
df_test2 = preprocess_indicqa_for_extractive("indicqa.hi.json")
df_test3 = pd.read_csv('chaii-hindi.csv')


test_dataset1 = QAExtractiveDataset(df_test1, tokenizer)
test_dataset2 = QAExtractiveDataset(df_test2, tokenizer)
test_dataset3 = QAExtractiveDataset(df_test3, tokenizer)
#test_dataset = QAExtractiveDataset(test_df, tokenizer)

test_loader1 = DataLoader(test_dataset1, batch_size=32)
test_loader2 = DataLoader(test_dataset2, batch_size=32)
test_loader3 = DataLoader(test_dataset3, batch_size=32)


for test_l in [test_loader, test_loader1, test_loader2, test_loader3]:
    predictions, references = batch_inference(model, tokenizer, test_l)
    metrics = evaluate_with_jaccard(predictions, references)
    print(f"\nExact Match: {metrics['Exact Match'] * 100:.2f}%")
    print(f"F1 Score: {metrics['F1 Score']:.4f}")
    print(f"Jaccard Score: {metrics['Jaccard Score']:.4f}")


Running Inference: 100%|██████████| 38/38 [00:09<00:00,  3.84it/s]



Exact Match: 78.28%
F1 Score: 0.8541
Jaccard Score: 0.8385


Running Inference: 100%|██████████| 19/19 [00:09<00:00,  2.04it/s]



Exact Match: 37.39%
F1 Score: 0.5528
Jaccard Score: 0.5093


Running Inference: 100%|██████████| 33/33 [00:17<00:00,  1.92it/s]



Exact Match: 47.34%
F1 Score: 0.5947
Jaccard Score: 0.5644


Running Inference: 100%|██████████| 24/24 [00:19<00:00,  1.25it/s]


Exact Match: 34.05%
F1 Score: 0.4702
Jaccard Score: 0.4368





In [54]:
def run_inference(model, tokenizer, question, context, max_len=512):
    """
    Perform inference on a single question-context pair.

    Args:
        model: Fine-tuned model.
        tokenizer: Tokenizer for the model.
        question (str): The question string.
        context (str): The context string.
        max_len (int): Maximum length for tokenized inputs.

    Returns:
        predicted_answer (str): The extracted answer.
    """
    # Tokenize the inputs
    inputs = tokenizer(
        question, 
        context, 
        max_length=max_len, 
        truncation=True, 
        padding="max_length", 
        return_tensors="pt"
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        start_logits, end_logits = outputs.start_logits, outputs.end_logits

    # Get start and end token positions
    start_idx = torch.argmax(start_logits, dim=1).item()
    end_idx = torch.argmax(end_logits, dim=1).item()

    # Decode the predicted answer tokens
    predicted_tokens = input_ids[0][start_idx:end_idx + 1]
    predicted_answer = tokenizer.decode(predicted_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return predicted_answer


In [55]:
question = "कलाम की मृत्यु पर शोक व्यक्त करते हुए किस सरकार ने श्रद्धांजलि में 1000 मक्खन के दीपक भेंट किए थे?"
context = "\"भूटान सरकार ने कलाम की मौत के शोक के लिए देश के झंडे को आधी ऊंचाई पर फहराने के लिए आदेश दिया, और श्रद्धांजलि में 1000 मक्खन के दीपक की भेंट किए।\""

# Predict the answer
predicted_answer = run_inference(model, tokenizer, question, context)
print(f"Question: {question}")
print(f"Context: {context}")
print(f"Predicted Answer: {predicted_answer}")


Question: कलाम की मृत्यु पर शोक व्यक्त करते हुए किस सरकार ने श्रद्धांजलि में 1000 मक्खन के दीपक भेंट किए थे?
Context: "भूटान सरकार ने कलाम की मौत के शोक के लिए देश के झंडे को आधी ऊंचाई पर फहराने के लिए आदेश दिया, और श्रद्धांजलि में 1000 मक्खन के दीपक की भेंट किए।"
Predicted Answer: भूटान सरकार
