In [5]:
import json
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import *

### Read the training data, validation data, and evidence set

In [2]:
# Load the claim and evidence JSON files.
# `with` guarantees each handle is closed even if json.load raises, and the
# explicit UTF-8 encoding avoids depending on the platform's default codec.
with open('../data/train-claims.json', encoding='utf-8') as file:
    raw_train_data = json.load(file)

with open('../data/dev-claims.json', encoding='utf-8') as file:
    raw_val_data = json.load(file)

with open('../data/evidence.json', encoding='utf-8') as file:
    evidence_data = json.load(file)

In [3]:
# Normalise every evidence passage to lowercase, overwriting the entries of
# evidence_data in place (keys are untouched, so iterating the dict is safe).
for evidence_id in evidence_data:
    evidence = evidence_data[evidence_id]
    evidence_data[evidence_id] = evidence.lower()

def preprocess_data(raw_data, evidence_set, max_evidences=5):
    """Flatten claims into one (claim_text, evidence_text, label_id) tuple per evidence.

    Args:
        raw_data: Mapping of claim id -> dict with the keys 'claim_text',
            'claim_label' and 'evidences' (a list of evidence ids).
        evidence_set: Mapping of evidence id -> evidence text.
        max_evidences: Maximum number of evidences kept per claim (the first
            ones in list order). ``None`` keeps all of them. Defaults to 5.

    Returns:
        A list of ``(claim_text, evidence_text, label)`` tuples where both texts
        are lowercased and ``label`` is the integer id of the claim label.

    Raises:
        KeyError: If a claim references an evidence id missing from
            ``evidence_set`` or carries a label that is not in the label map.
    """
    # Map labels into numbers
    label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}
    processed_data = []
    for claim_info in raw_data.values():
        claim_text = claim_info['claim_text'].lower()
        # Lowercase here as well so the function does not rely on the caller
        # having normalised evidence_set beforehand (a no-op if it already is).
        evidences = [
            evidence_set[evidence_id].lower()
            for evidence_id in claim_info['evidences'][:max_evidences]
        ]
        label = label_map[claim_info['claim_label']]
        # One training example per (claim, evidence) pair, all sharing the claim label
        processed_data.extend((claim_text, evidence, label) for evidence in evidences)
    return processed_data

# Build the (claim, evidence, label) examples for the training and validation splits
train_data = preprocess_data(raw_data=raw_train_data, evidence_set=evidence_data)
val_data = preprocess_data(raw_data=raw_val_data, evidence_set=evidence_data)

In [None]:
class DPRTrainDataset(Dataset):
    """Dataset of (claim, evidence) pairs tokenized for the DPR encoders.

    Claims are tokenized with the DPR question-encoder tokenizer and evidences
    with the DPR context-encoder tokenizer. Every sequence is wrapped in the
    tokenizer's CLS/SEP tokens and padded or truncated to a fixed length.

    Args:
        train_data: Indexable collection whose items hold the claim text at
            position 0 and the evidence text at position 1. Any further
            elements (e.g. a label) are ignored by this dataset.
        claim_max_len: Fixed token length of every encoded claim.
        evidence_max_len: Fixed token length of every encoded evidence.

    Raises:
        ValueError: If either max length is smaller than 2, which cannot hold
            both the CLS and the SEP token.
    """

    def __init__(self, train_data, claim_max_len, evidence_max_len):
        # Fail fast, before the tokenizers are loaded.
        for name, value in (("claim_max_len", claim_max_len), ("evidence_max_len", evidence_max_len)):
            if value < 2:
                raise ValueError(f"{name} must be at least 2 to fit [CLS] and [SEP], got {value}")

        self.data = train_data
        self.claim_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        self.evidence_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        self.claim_max_len = claim_max_len
        self.evidence_max_len = evidence_max_len

    def __len__(self):
        """Return the number of (claim, evidence) pairs."""
        return len(self.data)

    @staticmethod
    def _encode(tokenizer, text, max_len):
        """Tokenize ``text`` and return ``(token_ids, attention_mask)`` of length ``max_len``."""
        tokens = [tokenizer.cls_token] + tokenizer.tokenize(text) + [tokenizer.sep_token]

        if len(tokens) < max_len:
            # Pad short sequences up to the fixed length
            tokens = tokens + [tokenizer.pad_token] * (max_len - len(tokens))
        else:
            # Prune long sequences, keeping the final slot for the SEP token
            tokens = tokens[:max_len - 1] + [tokenizer.sep_token]

        # Indices of the tokens in the tokenizer's vocabulary, as a tensor
        token_ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokens))
        # 1 for real tokens, 0 for padding; use the tokenizer's own pad id
        # instead of assuming it is 0
        attn_mask = (token_ids != tokenizer.pad_token_id).long()
        return token_ids, attn_mask

    def __getitem__(self, index):
        """Return ``(claim_ids, evidence_ids, claim_mask, evidence_mask)`` for one pair."""
        sample = self.data[index]
        claim, evidence = sample[0], sample[1]

        claim_tokens_ids_tensor, claim_attn_mask = self._encode(
            self.claim_tokenizer, claim, self.claim_max_len
        )
        evidence_tokens_ids_tensor, evidence_attn_mask = self._encode(
            self.evidence_tokenizer, evidence, self.evidence_max_len
        )

        return claim_tokens_ids_tensor, evidence_tokens_ids_tensor, claim_attn_mask, evidence_attn_mask