In [None]:
!pip install datasets



In [None]:
# Fetch the Google Drive file that the recorded run saved as insert_chars_1.json
# (the input of the "Format Fix" cell below).
!gdown 1GnUYDYrpc3H3EVpem8sCBOZ1ZvcjI4oG

Downloading...
From: https://drive.google.com/uc?id=1GnUYDYrpc3H3EVpem8sCBOZ1ZvcjI4oG
To: /content/insert_chars_1.json
100% 55.8M/55.8M [00:00<00:00, 55.8MB/s]


In [None]:
# Fetch the Google Drive file that the recorded run saved as train.json.
# NOTE(review): no visible cell reads train.json; presumably it is the
# reference for the target format — TODO confirm.
!gdown 1rGRTXIr-FKFVmrxA7fgk8Y_EhOFnJ4jk

Downloading...
From: https://drive.google.com/uc?id=1rGRTXIr-FKFVmrxA7fgk8Y_EhOFnJ4jk
To: /content/train.json
100% 42.1M/42.1M [00:01<00:00, 34.9MB/s]


# Format Fix (merge the 442 top-level JSON entries into 1 object)

In [None]:
import json

def reformat_json(input_file, output_file):
    """Merge a list of ``{"version", "data"}`` records into one JSON object.

    Reads ``input_file`` as UTF-8 JSON whose top level is a list of objects,
    each carrying a "version" and a "data" key, and writes to ``output_file``
    a single object ``{"version": ..., "data": [...]}`` in which "data" holds
    every record's "data" value in input order.

    The version is taken from the first record only; the other records are
    assumed to share it and are not checked.

    Raises:
        IndexError: if the input list is empty.
        KeyError: if a record lacks "version" (first record) or "data".
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        records = json.load(src)

    merged = {
        # The first record's version stands in for all of them.
        "version": records[0]["version"],
        "data": [],
    }
    for record in records:
        merged["data"].append(record["data"])

    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(merged, dst, ensure_ascii=False, indent=4)

# Input and output paths for the reformatting step.
# The input is the file downloaded above as insert_chars_1.json; reformat_json
# expects it to be a list of {"version", "data"} entries.
input_file = 'insert_chars_1.json'  # downloaded by the first gdown cell
output_file = 'your_output_file.json'  # single-object file (similar to train.json); read back by the conversion cell

# Merge all entries into one {"version", "data": [...]} object on disk
reformat_json(input_file, output_file)

print(f'Reformatted JSON data has been saved to {output_file}')

Reformatted JSON data has been saved to your_output_file.json


# Datasets Format Conversion

In [None]:
import json
from datasets import Dataset
import pandas as pd

# Step 1: Load the reformatted SQuAD-style JSON file.
# The file was written as UTF-8 with ensure_ascii=False, so read it back as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open('your_output_file.json', 'r', encoding='utf-8') as f:
    squad_data = json.load(f)

# Step 2: Flatten article -> paragraph -> question into one record per question
data = []

# for article in squad_data['data'][:10]: # partial

for article in squad_data['data']: # whole
    for paragraph in article['paragraphs']:
        # The context is read from 'new_context', not the usual SQuAD 'context' key.
        context = paragraph['new_context']
        for qa in paragraph['qas']:
            question = qa['question']
            id_ = qa['id']
            answers = qa['answers']
            if answers:
                # Every answer list is repeated three times.
                # NOTE(review): presumably to mimic multi-reference SQuAD
                # evaluation data — TODO confirm the intent of `* 3`.
                answer_texts = [answer['text'] for answer in answers] * 3
                answer_starts = [answer['answer_start'] for answer in answers] * 3
            else: # think about is_impossible = true [Nipun]
                # Questions without answers keep empty lists.
                answer_texts = []
                answer_starts = []

            data.append({
                'id': id_,
                'title': article['title'],
                'context': context,
                'question': question,
                'answers': {
                    'text': answer_texts,
                    'answer_start': answer_starts,
                }
            })

# Step 3: Create a Dataset object
dataset = Dataset.from_pandas(pd.DataFrame(data))

# Persist the flat records; this is the file the inference cell loads.
dataset.to_json('squad_train_dataset_2.json')

Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

4861445

# Inference (pre-trained model, no training)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Read the flattened question/answer records from the local JSON file
# (loading the stock 'squad' dataset is kept below for reference)
# squad_dataset = load_dataset('squad')
squad_dataset = load_dataset('json', data_files='squad_train_dataset_2.json')


# Define a custom dataset class for DataLoader
class SquadDataset(Dataset):
    """Map-style dataset that tokenizes one question/context pair per index.

    Args:
        dataset: indexable collection whose items expose the keys
            'question', 'context', 'id' and 'answers'.
        tokenizer: callable invoked as ``tokenizer(question, context, ...)``
            that returns a mapping of name -> tensor.
        max_length: length every encoded pair is truncated/padded to.
    """

    def __init__(self, dataset, tokenizer, max_length=512):
        self.dataset, self.tokenizer, self.max_length = dataset, tokenizer, max_length

    def __len__(self):
        """Number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenizer's tensors for example ``idx`` plus its 'id' and 'answers'."""
        example = self.dataset[idx]
        tokenized = self.tokenizer(
            example['question'],
            example['context'],
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            # Explicitly request token_type_ids so collate_fn can rely on the key
            return_token_type_ids=True,
        )

        # Drop the leading axis of every returned tensor so the DataLoader can
        # stack per-example tensors into a batch.
        sample = {}
        for name, tensor in tokenized.items():
            sample[name] = tensor.squeeze(0)

        # Carry the non-tensor fields through unchanged.
        sample['id'] = example['id']
        sample['answers'] = example['answers']
        return sample

def collate_fn(batch):
    """Combine per-example dicts into one batch dict.

    The tensor fields 'input_ids', 'attention_mask' and 'token_type_ids' are
    stacked along a new leading axis; 'id' and 'answers' are gathered into
    plain Python lists in batch order.
    """
    collated = {
        name: torch.stack([sample[name] for sample in batch])
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    # Non-tensor fields cannot be stacked, so they stay as lists.
    collated['id'] = [sample['id'] for sample in batch]
    collated['answers'] = [sample['answers'] for sample in batch]
    return collated

# Load tokenizer (use FastTokenizer for better performance)
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2", use_fast=True)

# Create DataLoader with batch size
batch_size = 256  # Increase batch size
# NOTE(review): 256 sequences padded to 512 tokens each is a large batch for
# a CPU-only runtime — consider lowering it there; TODO confirm.

# Load data in the main process. The recorded run with 4 worker processes
# aborted with "DataLoader worker ... exited unexpectedly", and that error
# itself recommends num_workers=0 to get a usable trace. Raise this again only
# once the pipeline runs cleanly.
num_workers = 0

squad_data_loader = DataLoader(
    SquadDataset(squad_dataset['train'], tokenizer),
    batch_size=batch_size,
    shuffle=False,  # keep dataset order so results line up with the input file
    # Pinned host memory only speeds up host-to-GPU copies; skip it on CPU.
    pin_memory=(device.type == "cuda"),
    collate_fn=collate_fn,
    num_workers=num_workers,
)

# Load the pre-trained model
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
model.to(device)
# Inference only: switch off dropout and other training-time behaviour.
model.eval()

# One dict per example: {'id', 'answer', 'score'}
results = []

# Mixed precision is only enabled on CUDA. torch.cuda.amp.autocast() is
# CUDA-specific and warns/disables itself on a CPU runtime, so use the
# device-aware torch.autocast and switch it off explicitly on CPU.
use_amp = device.type == "cuda"

with torch.no_grad():
    for batch in tqdm(squad_data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)

        # Use mixed precision for faster inference (GPU only)
        with torch.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # One argmax per batch instead of one device round-trip per example.
        start_indices = start_logits.argmax(dim=-1).tolist()
        last_indices = end_logits.argmax(dim=-1).tolist()

        for i, (start_idx, last_idx) in enumerate(zip(start_indices, last_indices)):
            # Slice end is exclusive, so step one past the best end token.
            end_idx = last_idx + 1
            # NOTE: start and end are chosen independently; when the end falls
            # before the start the slice is empty and the answer is ''.
            answer = tokenizer.decode(input_ids[i][start_idx:end_idx], skip_special_tokens=True)

            results.append({
                'id': batch['id'][i],
                'answer': answer,
                'score': (start_logits[i][start_idx] + end_logits[i][end_idx - 1]).item()
            })

# Dump "results" in a json file "pred_insert_chars_1.json" [Nipun]

Using device: cpu


Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Process Process-1:
Process Process-3:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 317, in _bootstrap
    util._exit_function()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 317, in _bootstrap
    util._exit_function()
  File "/usr/lib/python3.10/multiprocessing/util.py", line 360, in _exit_function
    _run_finalizers()
  File "/usr/lib/python3.10/multiprocessing/util.py", line 360, in _exit_function
    _run_finalizers()
  File "/usr/lib/python3.10/multiprocessing/util.py", line 300, in _run_finalizers
    finalizer()
  File "/usr/lib/python3.10/multiprocessing/util.py", line 300, in _run_finalizers
    finalizer()
  File "/usr/lib/python3.10/multiprocessing/util.py", line 224, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/util.py", line 224, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/usr

RuntimeError: DataLoader worker (pid 13421) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.

In [None]:
# NOTE(review): this cell is disabled. The ValueError recorded below it
# ("max() arg is an empty sequence") presumably comes from the 'squad' metric
# taking a max over the reference answers, which are empty lists for
# questions without answers; the 'squad_v2' metric may be what is needed
# here — TODO confirm before re-enabling.
# from datasets import load_dataset, load_metric

# # Load SQuAD evaluation metric
# metric = load_metric('squad')

# # Assuming formatted_results and references are correctly prepared
# formatted_results = [{'id': res['id'], 'prediction_text': res['answer']} for res in results]
# references = [{'id': example['id'], 'answers': example['answers']} for example in squad_dataset['train']]

# # Compute the metric
# metric_result = metric.compute(predictions=formatted_results, references=references)

# print(f"Exact Match: {metric_result['exact_match']}")
# print(f"F1 Score: {metric_result['f1']}")


ValueError: max() arg is an empty sequence

In [None]:
# from datasets import load_dataset, load_metric

# # Load SQuAD evaluation metric
# metric = load_metric('squad')

# # Assuming formatted_results and references are correctly prepared
# formatted_results = [{'id': '5a8d7bf7df8bba001a0f9ab1', 'prediction_text': ''}]

# references = [{'id': '5a8d7bf7df8bba001a0f9ab1',
#   'answers': {'answer_start': [], 'text': []}}]

# # Compute the metric
# metric_result = metric.compute(predictions=formatted_results, references=references)

# print(f"Exact Match: {metric_result['exact_match']}")
# print(f"F1 Score: {metric_result['f1']}")


Exact Match: 50.0
F1 Score: 0.0
