In [14]:
import json
import torch
import random
from torch.utils.data import DataLoader, Dataset
from transformers import *

### Read the training, validation data, and evidence set

In [2]:
# Read each JSON file inside a context manager so the handle is always closed,
# even when json.load raises on malformed input. The encoding is pinned to
# UTF-8 so parsing does not depend on the platform's default locale.
with open('../data/train-claims.json', encoding='utf-8') as file:
    raw_train_data = json.load(file)

with open('../data/dev-claims.json', encoding='utf-8') as file:
    raw_val_data = json.load(file)

with open('../data/evidence.json', encoding='utf-8') as file:
    evidence_data = json.load(file)

In [11]:
# Normalise the evidence corpus: lowercase every passage, updating the
# existing dictionary in place (keys and their order are unchanged).
evidence_data.update(
    {evidence_id: passage.lower() for evidence_id, passage in evidence_data.items()}
)

def preprocess_data(raw_data, evidence_set):
    """Turn raw claim records into flat training tuples.

    Args:
        raw_data: Mapping of claim id -> record with the keys 'claim_text',
            'claim_label' and 'evidences' (a list of evidence ids).
        evidence_set: Mapping of evidence id -> evidence text. The text is
            used as given; it is not lowercased here.

    Returns:
        A list with one tuple per claim, in the iteration order of
        ``raw_data``:
        ``(lowercased claim text, evidence text, label index, evidence ids)``.
        The evidence text joins at most the first five passages with
        " [SEP] "; the evidence ids element is the record's full, untruncated
        'evidences' list.

    Raises:
        KeyError: If a record lacks a required key, has an unknown label, or
            references an evidence id missing from ``evidence_set``.
    """
    # Numeric encoding of the claim labels
    label_to_index = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}

    def build_example(record):
        claim_text = record['claim_text'].lower()
        gold_ids = record['evidences']
        # Only the first five evidence passages contribute to the joined text
        passages = " [SEP] ".join(evidence_set[gold_id] for gold_id in gold_ids[:5])
        label = label_to_index[record['claim_label']]
        return claim_text, passages, label, gold_ids

    return [build_example(record) for record in raw_data.values()]

# Convert the raw training and validation claims into preprocessed tuples,
# resolving evidence ids against the shared evidence dictionary.
train_data = preprocess_data(raw_data=raw_train_data, evidence_set=evidence_data)
val_data = preprocess_data(raw_data=raw_val_data, evidence_set=evidence_data)

In [5]:
# Load the pretrained DPR tokenizers: the question-encoder tokenizer is bound
# to `claim_tokenizer` and the context-encoder tokenizer to `evidence_tokenizer`.

claim_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
evidence_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)

loading file vocab.txt from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/vocab.txt
loading file tokenizer.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/tokenizer_config.json
loading configuration file config.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/config.json
Model config DPRConfig {
  "_name_or_path": "facebook/dpr-ques

In [None]:
class DPRTrainDataset(Dataset):
    """Map-style dataset over preprocessed training examples.

    Every element of ``train_data`` is read at positions 0, 1 and 3; in the
    tuples built by ``preprocess_data`` these hold the claim text, the joined
    evidence text and the list of evidence ids.
    """

    def __init__(self, train_data, claim_max_len, evidence_max_len):
        # NOTE: claim_max_len and evidence_max_len are accepted but are not
        # stored or used anywhere in this class.
        self.data = train_data

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(claim, evidence text, evidence ids)`` for one example."""
        example = self.data[index]
        claim_text, evidence_text, evidence_ids = example[0], example[1], example[3]
        return claim_text, evidence_text, evidence_ids

In [None]:
def negative_sampling(claims, evidence_ids, evidence_lookup=None):
    """Sample in-batch negative evidence text for every claim in a batch.

    For claim ``i`` the negative candidates are all evidence ids that occur
    anywhere in the batch, minus the ids listed in ``evidence_ids[i]``.

    Args:
        claims: Sequence of the claims in the batch; only its length is used.
        evidence_ids: Sequence aligned with ``claims``; element ``i`` is the
            collection of evidence ids attached to claim ``i``.
        evidence_lookup: Optional mapping of evidence id -> evidence text.
            When omitted, the module-level ``evidence_data`` is used.

    Returns:
        A list with one string per claim: the sampled negative passages
        joined with " [SEP] ". The number of passages equals the claim's own
        evidence count, capped at the number of available candidates, so the
        string is empty when the batch offers no candidates.

    Raises:
        KeyError: If a sampled id is missing from the evidence mapping.
    """
    if evidence_lookup is None:
        evidence_lookup = evidence_data

    # Flatten the per-claim id lists into one set of ids. Building a set of
    # the lists themselves fails because lists are unhashable.
    batch_evidence_set = {eid for ids in evidence_ids for eid in ids}
    negative_evidences = []

    for i in range(len(claims)):
        # random.sample requires a sequence (sets are rejected on Python
        # 3.11+); sorting also makes the draw reproducible for a fixed seed.
        negative_candidates = sorted(batch_evidence_set - set(evidence_ids[i]))
        # Never request more ids than exist, which would raise ValueError
        # (e.g. a batch with a single claim has no candidates at all).
        sample_size = min(len(evidence_ids[i]), len(negative_candidates))
        sampled_ids = random.sample(negative_candidates, sample_size)

        # Retrieve the actual evidence text for the sampled ids
        sampled_evidence = " [SEP] ".join(
            evidence_lookup[evidence_id] for evidence_id in sampled_ids
        )
        negative_evidences.append(sampled_evidence)

    return negative_evidences

In [None]:
# Defining collate function to process and combine samples from the dataset into a single batch

def collate_fn(batch):
    """Assemble a list of dataset samples into one tokenized batch.

    Args:
        batch: Iterable of ``(claim, evidence text, evidence ids)`` samples.

    Returns:
        A dict with the keys "claim_inputs", "positive_evidence_inputs" and
        "negative_evidence_inputs", holding the tokenizer outputs for the
        claims, their own evidence text and the sampled negative evidence
        text respectively.
    """
    # Transpose the list of samples into three parallel tuples
    claims, evidences, evidence_ids = zip(*batch)

    # Draw negative evidence text for every claim in the batch
    negative_evidences = negative_sampling(claims, evidence_ids)

    # Identical tokenizer settings for all three inputs
    tokenizer_kwargs = dict(return_tensors="pt", padding=True, truncation=True)

    return {
        "claim_inputs": claim_tokenizer(claims, **tokenizer_kwargs),
        "positive_evidence_inputs": evidence_tokenizer(evidences, **tokenizer_kwargs),
        "negative_evidence_inputs": evidence_tokenizer(negative_evidences, **tokenizer_kwargs),
    }