In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets



# Imports

In [3]:
import json
from datasets import Dataset
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm import tqdm

# Convert to Squad Dataset

In [4]:
# Define a custom dataset class for DataLoader
class SquadDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.tokenizer(
            item['question'],
            item['context'],
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            # Add this line if token_type_ids are needed
            return_token_type_ids=True
        )
        encoding = {key: tensor.squeeze(0) for key, tensor in encoding.items()}
        encoding['id'] = item['id']
        encoding['answers'] = item['answers']
        return encoding

def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    token_type_ids = torch.stack([item['token_type_ids'] for item in batch])
    ids = [item['id'] for item in batch]
    answers = [item['answers'] for item in batch]
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
        'id': ids,
        'answers': answers
    }

## Reformed Data

In [None]:
!gdown 1SN-IbvhlslldE8qllc-hJlW9g0jrd58u

Downloading...
From: https://drive.google.com/uc?id=1SN-IbvhlslldE8qllc-hJlW9g0jrd58u
To: /content/insert_chars_1.json
100% 104M/104M [00:00<00:00, 114MB/s] 


In [5]:
def reformed_data(file, level):

  file_location = "/content/drive/MyDrive/Research/Robustness of QA/Reformed Dataset 30k/" + file + "/" + file + "_" + str(level) + ".json"
  # file_location = f"{file}_{level}.json"

  with open(file_location, 'r') as f:
    squad_data = json.load(f)

  data = []

  for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['new_context']
        for qa in paragraph['qas']:
            question = qa['question']
            id_ = qa['id']
            answers = qa['answers']
            if len(answers) > 0:
                answer_texts = [answer['text'] for answer in answers] * 3
                answer_starts = [answer['answer_start'] for answer in answers] * 3
                is_impossible = False
            else:
                answer_texts = ["No answer available"]
                answer_starts = [0]
                is_impossible = True

            data.append({
                'id': id_,
                'title': article['title'],
                'context': context,
                'question': question,
                'answers': {
                    'text': answer_texts,
                    'answer_start': answer_starts,
                },
                'is_impossible': is_impossible
            })

  dataset = Dataset.from_pandas(pd.DataFrame(data))

  return dataset

## Train

In [6]:
def train(squad_data_loader):

  results = []

  with torch.no_grad():
      for batch in tqdm(squad_data_loader):
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          token_type_ids = batch['token_type_ids'].to(device)

          # Use mixed precision for faster inference
          with torch.cuda.amp.autocast():
              outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

          start_logits = outputs.start_logits
          end_logits = outputs.end_logits

          for i in range(len(input_ids)):
              start_idx = torch.argmax(start_logits[i]).item()
              end_idx = torch.argmax(end_logits[i]).item() + 1
              answer = tokenizer.decode(input_ids[i][start_idx:end_idx], skip_special_tokens=True)

              results.append({
                  'id': batch['id'][i],
                  'answer': answer,
                  'score': (start_logits[i][start_idx] + end_logits[i][end_idx - 1]).item()
              })

  return results

In [13]:
# Noise List
noise_list = ['delete_character','grammatical_mistakes_sentence', 'grammatical_mistakes_word', 'insert_chars', 'random_insertion']

# Level List
level = 5

BATCH_SIZE = 256

NUM_WORKERS = 2


# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "deepset/roberta-base-squad2"
model_name_json = "roberta"

# Load tokenizer (use FastTokenizer for better performance)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load the pre-trained model
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model.to(device)
model.eval()

final_input_filename = 'final_input_dataset.json'

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [14]:
for i in range(len(noise_list)):
    for j in range(1, level + 1):
        result_json_filename = f'{model_name_json}_pred_{noise_list[i]}_{j}.json'
        dataset = reformed_data(noise_list[i], j)


        with open(final_input_filename, 'w', encoding='utf-8') as f:
          json.dump(dataset, f, ensure_ascii=False, indent=4)

        # # Load the dataset from the saved JSON file
        squad_dataset = load_dataset('json', data_files=final_input_filename)

        squad_data_loader = DataLoader(
            SquadDataset(squad_dataset['train'], tokenizer),
            batch_size=BATCH_SIZE,
            shuffle=False,
            pin_memory=True,
            collate_fn=collate_fn,
            num_workers=NUM_WORKERS
        )

        results = train(squad_data_loader)

        with open(result_json_filename, 'w', encoding='utf-8') as f:
          json.dump(results, f, ensure_ascii=False, indent=4)

TypeError: Object of type Dataset is not JSON serializable