In [14]:
import json
import torch
import random
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import *

### Read the training data, validation data, and evidence set

In [2]:
# Load the three JSON inputs. Each file is opened in a `with` block so the
# handle is closed even if json.load raises, and UTF-8 is requested explicitly
# so decoding does not depend on the platform's default encoding.
with open('../data/train-claims.json', encoding='utf-8') as file:
    raw_train_data = json.load(file)

with open('../data/dev-claims.json', encoding='utf-8') as file:
    raw_val_data = json.load(file)

with open('../data/evidence.json', encoding='utf-8') as file:
    evidence_data = json.load(file)

In [11]:
# Preprocessing the dataset
# Lower-case every evidence passage, writing the result back into the same
# dictionary object (keys and their order are unchanged).
evidence_data.update(
    {doc_id: passage.lower() for doc_id, passage in evidence_data.items()}
)

def preprocess_data(raw_data, evidence_set, max_evidences=5):
    """Flatten a raw claims dictionary into a list of example tuples.

    Args:
        raw_data: Mapping of claim id -> dict containing the keys
            'claim_text', 'claim_label' and 'evidences' (a list of evidence ids).
        evidence_set: Mapping of evidence id -> evidence text. The text is used
            as stored; this function does not change its case.
        max_evidences: Maximum number of evidence passages joined into the
            evidence string for one claim (default 5). ``None`` means no limit.

    Returns:
        A list with one ``(claim_text, evidences, label, evidence_ids)`` tuple
        per claim, where ``claim_text`` is lower-cased, ``evidences`` is the
        selected passages joined with " [SEP] ", ``label`` is the integer class
        id, and ``evidence_ids`` is the claim's full (untruncated) id list.

    Raises:
        ValueError: If ``max_evidences`` is negative.
        KeyError: If a claim has an unknown label or references an evidence id
            that is missing from ``evidence_set``.
    """
    if max_evidences is not None and max_evidences < 0:
        raise ValueError("max_evidences must be non-negative or None")

    # Map labels into numbers
    label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}
    processed_data = []
    for claim_id, claim_info in raw_data.items():
        # Only the claim is lower-cased here; evidence text comes straight
        # from `evidence_set`.
        claim_text = claim_info['claim_text'].lower()

        # Cap the number of passages that go into the joined evidence string.
        selected_ids = claim_info['evidences'][:max_evidences]
        evidences = " [SEP] ".join(evidence_set[evidence_id] for evidence_id in selected_ids)

        label = label_map[claim_info['claim_label']]
        # The untruncated id list is kept alongside the joined text.
        processed_data.append((claim_text, evidences, label, claim_info['evidences']))
    return processed_data

# Turn the raw training and validation claims into lists of example tuples
train_data = preprocess_data(raw_data=raw_train_data, evidence_set=evidence_data)
val_data = preprocess_data(raw_data=raw_val_data, evidence_set=evidence_data)

In [5]:
# Load the pretrained DPR tokenizers: the question-encoder tokenizer is used
# for claims and the context-encoder tokenizer for evidence passages.

claim_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
evidence_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)

loading file vocab.txt from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/vocab.txt
loading file tokenizer.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/tokenizer_config.json
loading configuration file config.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/config.json
Model config DPRConfig {
  "_name_or_path": "facebook/dpr-ques

In [15]:
class DPRTrainDataset(Dataset):
    """Dataset wrapper over the preprocessed example tuples.

    Each record in ``train_data`` is indexed positionally: element 0 is the
    claim text, element 1 the joined evidence text and element 3 the list of
    evidence ids (element 2 is not returned by this dataset).
    """

    def __init__(self, train_data, claim_max_len=None, evidence_max_len=None):
        """Store the examples and the requested maximum lengths.

        Args:
            train_data: Indexable collection of preprocessed records.
            claim_max_len: Maximum claim length requested by the caller.
            evidence_max_len: Maximum evidence length requested by the caller.

        The two length arguments are kept as attributes instead of being
        discarded; this class itself performs no tokenisation or truncation.
        """
        self.data = train_data
        self.claim_max_len = claim_max_len
        self.evidence_max_len = evidence_max_len

    def __len__(self):
        """Return the number of stored records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(claim, evidences, evidence_ids)`` for the record at ``index``."""
        record = self.data[index]  # single lookup instead of three
        return record[0], record[1], record[3]

In [16]:
def negative_sampling(claims, evidence_ids, evidence_lookup=None):
    '''
    Sample in-batch negative evidence for every claim in a batch.

    For claim i, the candidates are all evidence ids that appear anywhere in
    the batch except the ids listed for claim i itself. As many negatives are
    drawn as the claim has positive ids, capped at the number of candidates.

    Args:
        claims: Sequence of claims in the batch (only its length is used).
        evidence_ids: Sequence aligned with ``claims``; element i is the
            collection of evidence ids belonging to claim i.
        evidence_lookup: Optional mapping of evidence id -> evidence text.
            Defaults to the module-level ``evidence_data``.

    Returns:
        A list with one string per claim: the sampled evidence texts joined
        with " [SEP] ". The string is empty when the batch offers no
        candidate for that claim (e.g. a batch of one claim).
    '''
    if evidence_lookup is None:
        evidence_lookup = evidence_data

    # Flatten the per-claim id lists; a set of the lists themselves would fail
    # because lists are unhashable.
    batch_evidence_set = {
        evidence_id for ids in evidence_ids for evidence_id in ids
    }
    negative_evidences = []

    for i in range(len(claims)):
        # random.sample needs a sequence (sets are rejected on Python 3.11+);
        # sorting also makes draws reproducible for a fixed random seed.
        negative_candidates = sorted(batch_evidence_set - set(evidence_ids[i]))

        # Never ask for more samples than there are candidates.
        sample_size = min(len(evidence_ids[i]), len(negative_candidates))
        sampled_ids = random.sample(negative_candidates, sample_size)

        # Retrieve the actual evidence text for the sampled ids
        sampled_evidence = [evidence_lookup[evidence_id] for evidence_id in sampled_ids]
        negative_evidences.append(" [SEP] ".join(sampled_evidence))

    return negative_evidences

In [17]:
# Defining collate function to process and combine samples from the dataset into a single batch

def collate_fn(batch):
    """Combine dataset samples into one tokenized training batch.

    ``batch`` is a sequence of ``(claim, evidences, evidence_ids)`` triples.
    Negative evidence is sampled for the batch first, then claims, positive
    evidence and negative evidence are each tokenized into padded, truncated
    PyTorch tensors.

    Returns a dict with the keys "claim_inputs", "positive_evidence_inputs"
    and "negative_evidence_inputs".
    """
    claims, evidences, evidence_ids = zip(*batch)
    negatives = negative_sampling(claims, evidence_ids)

    # Shared tokenizer settings for all three inputs
    tokenizer_options = {"return_tensors": "pt", "padding": True, "truncation": True}

    encoded = {}
    encoded["claim_inputs"] = claim_tokenizer(claims, **tokenizer_options)
    encoded["positive_evidence_inputs"] = evidence_tokenizer(evidences, **tokenizer_options)
    encoded["negative_evidence_inputs"] = evidence_tokenizer(negatives, **tokenizer_options)
    return encoded

In [18]:
# Load the pretrained DPR encoders: one for the claim (question) side and one
# for the evidence (context) side.

question_encoder = DPRQuestionEncoder.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
context_encoder = DPRContextEncoder.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)

loading configuration file config.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/config.json
Model config DPRConfig {
  "architectures": [
    "DPRQuestionEncoder"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "dpr",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "projection_dim": 0,
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
LEARNING_RATE = 1e-5

# The training loop moves its inputs to `device`, so both encoders must live
# there too; move them before the optimizer collects their parameters.
question_encoder.to(device)
context_encoder.to(device)

# One optimizer updates both encoders. `parameters` is a method and has to be
# called on each encoder before the two lists are concatenated.
optimizer = torch.optim.AdamW(
    list(question_encoder.parameters()) + list(context_encoder.parameters()),
    lr=LEARNING_RATE,
)

In [None]:
def train(batch_size, data_loader, num_epochs, learning_rate):
    """Fine-tune both encoders with a contrastive objective.

    Uses the module-level ``question_encoder``, ``context_encoder``,
    ``optimizer`` and ``device``.

    Args:
        batch_size: Not used by this function; batching is whatever
            ``data_loader`` yields. Kept for interface compatibility.
        data_loader: Iterable of batches, each a dict with the keys
            "claim_inputs", "positive_evidence_inputs" and
            "negative_evidence_inputs" mapping to tokenizer outputs.
        num_epochs: Number of passes over ``data_loader``.
        learning_rate: Not used by this function; the learning rate is the
            one the module-level optimizer was built with.

    After each epoch the loss of the last processed batch is printed
    (``nan`` if the loader produced no batches).
    """
    for epoch in range(num_epochs):
        question_encoder.train()
        context_encoder.train()

        # Stays None when the loader is empty so the epoch summary below
        # cannot hit an undefined variable.
        loss = None

        for batch in data_loader:
            optimizer.zero_grad()

            claim_inputs = {k: v.to(device) for k, v in batch["claim_inputs"].items()}
            positive_evidence_inputs = {k: v.to(device) for k, v in batch["positive_evidence_inputs"].items()}
            negative_evidence_inputs = {k: v.to(device) for k, v in batch["negative_evidence_inputs"].items()}

            # Encode the inputs using the encoders
            claim_outputs = question_encoder(**claim_inputs)["pooler_output"]
            positive_evidence_outputs = context_encoder(**positive_evidence_inputs)["pooler_output"]
            negative_evidence_outputs = context_encoder(**negative_evidence_inputs)["pooler_output"]

            # Dot-product similarity: row i holds claim i scored against every
            # positive / negative evidence embedding in the batch.
            positive_similarity_scores = torch.matmul(claim_outputs, positive_evidence_outputs.T)
            negative_similarity_scores = torch.matmul(claim_outputs, negative_evidence_outputs.T)

            # Positive columns come first, followed by the negative columns
            all_similarity_scores = torch.cat([positive_similarity_scores, negative_similarity_scores], dim=1)

            # The matching evidence for claim i sits in column i.
            positive_indices = torch.arange(all_similarity_scores.size(0), dtype=torch.long, device=device)

            # cross_entropy fuses log-softmax and NLL, avoiding the log(0)
            # underflow that an explicit softmax followed by log can produce.
            loss = F.cross_entropy(all_similarity_scores, positive_indices)

            loss.backward()
            optimizer.step()

        last_loss = loss.item() if loss is not None else float("nan")
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {last_loss}")

    print("Fine-tuning complete.")