In [2]:
import json
import torch
from tqdm import tqdm
import random
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import *

### Read the training, validation data, and evidence set

In [3]:
file = open('../data/train-claims.json')
raw_train_data = json.load(file)
file.close()

file = open('../data/dev-claims.json')
raw_val_data = json.load(file)
file.close()

file = open('../data/evidence.json')
evidence_data = json.load(file)
file.close()

In [4]:
# Preprocessing the dataset
for evidence_id, evidence in evidence_data.items():
    evidence_data[evidence_id] = evidence.lower()

def preprocess_data(raw_data, evidence_set):
    # Map labels into numbers
    label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}
    processed_data = []
    for claim_id, claim_info in raw_data.items():
        
        # Lower all the words in both claim and evidences
        claim_text = claim_info['claim_text'].lower()
        evidences = [evidence_set[evidence_id] for evidence_id in claim_info['evidences'][:5]]  # Limit to max 5 evidence
        evidences = " [SEP] ".join(evidences)
        label = label_map[claim_info['claim_label']]
        processed_data.append((claim_text, evidences, label, claim_info['evidences']))
    return processed_data

# Load and preprocess your training and validation datasets
train_data = preprocess_data(raw_train_data, evidence_data)
val_data = preprocess_data(raw_val_data, evidence_data)

In [5]:
# Initialise the tokenizers

claim_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
evidence_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

loading file vocab.txt from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/vocab.txt
loading file tokenizer.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/tokenizer_config.json
loading configuration file config.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/config.json
Model config DPRConfig {
  "_name_or_path": "facebook/dpr-ques

In [6]:
class DPRTrainDataset(Dataset):
    def __init__(self, train_data):
        self.data = train_data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        claims = self.data[index][0]
        evidences = self.data[index][1]
        evidence_ids = self.data[index][3]

        return claims, evidences, evidence_ids

In [7]:
def negative_sampling(claims, evidence_ids):
    '''
    This function performs negative sampling for a batch of claims
    '''
    batch_evidence_set = set([ids for sublist in evidence_ids for ids in sublist])
    negative_evidences = []
    
    for i in range(len(claims)):
        negative_candidates = list(batch_evidence_set - set(evidence_ids[i]))
        sample_size = len(evidence_ids[i])
        sample_size = min(sample_size, len(negative_candidates))  # Add this line
        sampled_ids = random.sample(negative_candidates, sample_size)

        # Retrieve the actual evidence base on evidence ids
        sampled_evidence = [evidence_data[evidence_id] for evidence_id in sampled_ids]
        sampled_evidence = " [SEP] ".join(sampled_evidence)
        negative_evidences.append(sampled_evidence)

    return negative_evidences

In [8]:
# Defining collate function to process and combine samples from the dataset into a single batch

def collate_fn(batch):
    claims, evidences, evidence_ids = zip(*batch)

    negative_evidences = negative_sampling(claims, evidence_ids)
    
    # Tokenize the claim and evidence
    claim_tokens = claim_tokenizer(claims, return_tensors="pt", padding=True, truncation=True)
    positive_evidence_tokens = evidence_tokenizer(evidences, return_tensors="pt", padding=True, truncation=True)
    negative_evidence_tokens = evidence_tokenizer(negative_evidences, return_tensors="pt", padding=True, truncation=True)

    return {
        "claim_inputs": claim_tokens,
        "positive_evidence_inputs": positive_evidence_tokens,
        "negative_evidence_inputs": negative_evidence_tokens,
    }    

In [9]:
# Instantiate the encoders

question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

loading configuration file config.json from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043a75f265/config.json
Model config DPRConfig {
  "architectures": [
    "DPRQuestionEncoder"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "dpr",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "projection_dim": 0,
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /Users/greysonchung/.cache/huggingface/hub/models--facebook--dpr-question_encoder-single-nq-base/snapshots/d04a52f6d2f96c60117a925e8c24c4043

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
LEARNING_RATE = 1e-5
optimizer = torch.optim.AdamW(list(question_encoder.parameters()) + list(context_encoder.parameters()), lr=LEARNING_RATE)

NameError: name 'question_encoder' is not defined

In [12]:
def train(data_loader, num_epochs):
    for epoch in range(num_epochs):
        question_encoder.train()
        context_encoder.train()

        progress_bar = tqdm(data_loader, desc="Training", unit="batch")

        for batch in progress_bar:
            optimizer.zero_grad()
            
            claim_inputs = {k: v.to(device) for k, v in batch["claim_inputs"].items()}
            positive_evidence_inputs = {k: v.to(device) for k, v in batch["positive_evidence_inputs"].items()}
            negative_evidence_inputs = {k: v.to(device) for k, v in batch["negative_evidence_inputs"].items()}

            # Encodes the inputs using encoders
            claim_outputs = question_encoder(**claim_inputs)["pooler_output"]
            positive_evidence_outputs = context_encoder(**positive_evidence_inputs)["pooler_output"]
            negative_evidence_outputs = context_encoder(**negative_evidence_inputs)["pooler_output"]

            # Compute similarity scores
            positive_similarity_scores = torch.matmul(claim_outputs, positive_evidence_outputs.T)
            negative_similarity_scores = torch.matmul(claim_outputs, negative_evidence_outputs.T)

            # Concatenate positive and negative similarity scores
            all_similarity_scores = torch.cat([positive_similarity_scores, negative_similarity_scores], dim=1)

            # Compute probabilities using softmax
            probabilities = F.softmax(all_similarity_scores, dim=1)

            # Compute the negative log likelihood loss using the positive similarity scores' probabilities
            positive_indices = torch.arange(probabilities.size(0), dtype=torch.long, device=device)
            loss = F.nll_loss(torch.log(probabilities), positive_indices)

            loss.backward()
            optimizer.step()
        
        print("\n")
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")
        
    print("Fine-tuning complete.")

In [13]:
train_ds = DPRTrainDataset(train_data)
train_loader = DataLoader(train_ds, batch_size=6, shuffle=True, collate_fn=collate_fn)

In [14]:
train(train_loader, num_epochs=3)

Training: 100%|██████████| 205/205 [11:24<00:00,  3.34s/batch]


Epoch 1/3, Loss: 0.29675430059432983


Training: 100%|██████████| 205/205 [11:12<00:00,  3.28s/batch]


Epoch 2/3, Loss: 0.14043737947940826


Training: 100%|██████████| 205/205 [11:28<00:00,  3.36s/batch]

Epoch 3/3, Loss: 0.008969031274318695
Fine-tuning complete.





In [9]:
# Save the tuned model locally
question_encoder_save_path = "tuned/question_encoder"
context_encoder_save_path = "tuned/context_encoder"
# question_encoder.save_pretrained(question_encoder_save_path)
# context_encoder.save_pretrained(context_encoder_save_path)

### Encoding the evidence from the evidence dataset using the tuned ContextEncoder

In [10]:
# Load the fine-tuned models
fine_tuned_question_encoder = DPRQuestionEncoder.from_pretrained(question_encoder_save_path)
fine_tuned_context_encoder = DPRContextEncoder.from_pretrained(context_encoder_save_path)

loading configuration file tuned/question_encoder/config.json
Model config DPRConfig {
  "_name_or_path": "facebook/dpr-question_encoder-single-nq-base",
  "architectures": [
    "DPRQuestionEncoder"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "dpr",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "projection_dim": 0,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

loading weights file tuned/question_encoder/pytorch_model.bin
All model checkpoint weights were used when initializing DPRQuestionEncoder.

All the weights of DPRQuestionEncoder were initialized from the model checkpoint at tuned/question_encoder.
I

In [8]:
# Extract the all text into a single list
evidence_texts = list(evidence_data.values())
tokenized_evidence = evidence_tokenizer(evidence_texts, return_tensors="pt", padding=True, truncation=True, max_length=250)

In [13]:
# Encode evidence by batch to prevent memory overflow issue
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def encode_and_save_evidence_batch(batch, filename):
    encoded_batch = {}
    for evidence_id, evidence_text in batch:
        inputs = evidence_tokenizer(evidence_text, return_tensors="pt", padding=True, truncation=True)
        outputs = fine_tuned_context_encoder(**inputs.to(device))["pooler_output"]
        encoded_batch[evidence_id] = outputs.cpu().detach().numpy().tolist()

    with open(filename, "a") as f:
        for evidence_id, embedding in encoded_batch.items():
            f.write(json.dumps({evidence_id: embedding}))
            f.write("\n")

batch_size = 100
evidence_items = list(evidence_data.items())
num_batches = len(evidence_items) // batch_size + int(len(evidence_items) % batch_size > 0)

output_filename = "encoded_evidence.jsonl"

# Clear the file content before writing the encoded evidence
with open(output_filename, "w") as f:
    pass

for i in range(num_batches):
    start_index = i * batch_size
    end_index = min((i + 1) * batch_size, len(evidence_items))
    batch = evidence_items[start_index:end_index]
    encode_and_save_evidence_batch(batch, output_filename)


NameError: name 'device' is not defined

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenized_evidence = {k: v.to(device) for k, v in tokenized_evidence.items()}
with torch.no_grad():
    encoded_evidence = fine_tuned_context_encoder(**tokenized_evidence)["pooler_output"]

: 

: 