# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
# prompt: mount drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [16]:
import pandas as pd

import torch
import math
import json
import nltk
import numpy as np


import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter, OrderedDict

In [2]:
import pandas as pd

evidences = pd.read_json('/content/drive/MyDrive/nlp/data/evidence.json', orient='index')
train_claims = pd.read_json('/content/drive/MyDrive/nlp/data/train-claims.json', orient='index')
dev_claims = pd.read_json('/content/drive/MyDrive/nlp/data/dev-claims.json', orient='index')

#update column names
evidences.reset_index(inplace=True)
evidences.columns = ['evidence_id', 'evidence_text']

train_claims.reset_index(inplace=True)
train_claims.rename(columns={'index': 'claim_id'}, inplace=True)

dev_claims.reset_index(inplace=True)
dev_claims.rename(columns={'index': 'claim_id'}, inplace=True)

evidence_id = evidences['evidence_id']
evidence_text = evidences['evidence_text']
evidence_idx = evidences.index.tolist()

evidence_id_dict = dict(zip(evidence_id, evidence_idx))

train_claims_text = train_claims['claim_text']
train_evidence_ids = train_claims['evidences']
train_claim_labels = train_claims['claim_label']
#map evidence_id to their corrosponding index for faster processing
train_evidence_idxs = train_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

dev_claims_text = dev_claims['claim_text']
dev_claim_labels = dev_claims['claim_label']
dev_evidence_ids = dev_claims['evidences']
dev_evidence_idxs = dev_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

In [None]:
test_claims = pd.read_json('/content/drive/MyDrive/nlp/data/test-claims-unlabelled.json', orient='index')
test_claims.reset_index(inplace=True)
test_claims.columns = ['claim_id', 'claim_text']
test_claims_text = test_claims['claim_text']
test_claims_id = test_claims['claim_id']

In [None]:
dev_claim_ids = dev_claims['claim_id']

In [3]:
import json
dev_evidence_indices = json.load(open("/content/drive/MyDrive/nlp/data/reranked_indices.json", "r"))
test_evidence_indices = json.load(open("/content/drive/MyDrive/nlp/data/test_reranked_indices.json", "r"))

In [4]:
dev_k_indices = [sublist[:5] if len(sublist) >= 5 else sublist + [None] * (5 - len(sublist)) for sublist in dev_evidence_indices]
test_k_indices = [sublist[:5] if len(sublist) >= 5 else sublist + [None] * (5 - len(sublist)) for sublist in test_evidence_indices]

In [5]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

tt = TweetTokenizer()
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_data(text):
    # Tokenize the text
    tokens = tt.tokenize(text.lower())

    # Remove unwanted characters but keep alphanumeric, question marks, and important punctuation
    tokens = [re.sub(f"[^a-zA-Z0-9?]", "", token) for token in tokens]

    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

train_claims_text_processed = train_claims_text.apply(preprocess_data)
dev_claims_text_processed = dev_claims_text.apply(preprocess_data)
evidence_text_processed = evidence_text.apply(preprocess_data)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
from collections import Counter, OrderedDict
def build_vocab(texts, min_freq=3):
    # Count all the words
    word_freq = Counter()
    for text in texts:
        word_freq.update(text)

    # Start vocab from special tokens
    vocab = OrderedDict({
        "<pad>": 0,
        "<unk>": 1,
        "<sos>": 2,
        "<eos>": 3
    })
    index = 4  # Start indexing from 4 because 0-3 are reserved for special tokens
    for word, freq in word_freq.items():
        if freq >= min_freq:  # Only include words that meet the frequency threshold
            vocab[word] = index
            index += 1

    return vocab

# Build vocabulary using only evidence texts and applying the frequency threshold
vocab = build_vocab(evidence_text_processed, min_freq=3)

In [7]:
label_map = {
    "REFUTES": 0,
    "SUPPORTS": 1,
    "NOT_ENOUGH_INFO": 2,
    "DISPUTED": 3
}
train_claim_labels = train_claims['claim_label'].map(label_map)
dev_claim_labels = dev_claims['claim_label'].map(label_map)

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [42]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

from torch.utils.data import Dataset, DataLoader
import torch

def text_to_indices(text, vocab):
    indices = [vocab.get(word, vocab["<unk>"]) for word in text]
    return [vocab["<sos>"]] + indices + [vocab["<eos>"]]

class ClaimEvidenceDataset(Dataset):
    def __init__(self, claims, claim_labels, evidence_indices, evidences):
        self.claims = claims
        self.claim_labels = claim_labels
        self.evidence_indices = evidence_indices
        self.evidences = evidences

    def __len__(self):
        return len(self.claims)

    def __getitem__(self, idx):
        claim = self.claims[idx]
        evidence_idxs = self.evidence_indices[idx]
        evidences = [self.evidences[i] for i in evidence_idxs]
        if self.claim_labels is not None:
            label = self.claim_labels[idx]
            return claim, evidences, label
        else:
            return claim, evidences, -1

def custom_collate_fn(batch, vocab=vocab):
    claims, evidence_lists, labels = zip(*batch)

    # Convert claims to indices and pad them
    claims_indices = [torch.tensor(text_to_indices(claim, vocab)) for claim in claims]
    claims_padded = pad_sequence(claims_indices, batch_first=True, padding_value=vocab['<pad>'])

    # Convert evidences to indices and pad them independently for each claim
    max_evidence_length = max([len(evidence) for evidences in evidence_lists for evidence in evidences])
    evidence_padded = []
    for evidences in evidence_lists:
        evidence_indices = [torch.tensor(text_to_indices(evidence, vocab)) for evidence in evidences]
        padded_evidences = pad_sequence(evidence_indices, batch_first=True, padding_value=vocab['<pad>'])
        # Pad each list of evidences to the maximum evidence length using F.pad
        if padded_evidences.size(1) < max_evidence_length:
            padded_evidences = F.pad(padded_evidences, (0, max_evidence_length - padded_evidences.size(1)), value=vocab['<pad>'])
        evidence_padded.append(padded_evidences)

    labels_tensor = torch.tensor(labels, dtype=torch.long)

    return claims_padded, evidence_padded, labels_tensor

## 2.1 Bi-Directional GRU with Attention

In [54]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ClaimEvidenceModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx, dropout=0.1):
        super(ClaimEvidenceModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.attention = nn.Linear(hidden_dim * 2, 1, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.pad_idx = pad_idx

    def forward(self, claims, evidence_lists):
        # Embed and encode claims
        embedded_claims = self.embedding(claims)
        claims_mask = (claims != self.pad_idx).unsqueeze(2).float()
        embedded_claims *= claims_mask

        embedded_claims = self.dropout(embedded_claims)
        _, hidden_claims = self.gru(embedded_claims)
        hidden_claims = torch.cat((hidden_claims[-2,:,:], hidden_claims[-1,:,:]), dim=1)
        hidden_claims = self.dropout(hidden_claims)

        batch_size = claims.size(0)
        max_logits = torch.full((batch_size, self.fc.out_features), float('-inf'), device=claims.device)

        for i in range(batch_size):
            evidences = evidence_lists[i]
            embedded_evidences = self.embedding(evidences)
            evidences_mask = (evidences != self.pad_idx).unsqueeze(2).float()
            embedded_evidences *= evidences_mask

            embedded_evidences = self.dropout(embedded_evidences)
            _, hidden_evidences = self.gru(embedded_evidences)
            hidden_evidences = torch.cat((hidden_evidences[-2,:,:], hidden_evidences[-1,:,:]), dim=1)
            hidden_evidences = self.dropout(hidden_evidences)

            logits = self.fc(hidden_evidences)
            attention_weights = F.softmax(self.attention(hidden_evidences), dim=0)
            evidence_representation = torch.sum(attention_weights * logits, dim=0, keepdim=True)

            max_logits[i, :] = torch.max(logits, dim=0).values

        return F.log_softmax(max_logits, dim=1)




# Model instantiation
model = ClaimEvidenceModel(vocab_size=len(vocab), embedding_dim=100, hidden_dim=256, output_dim=len(label_map), pad_idx=vocab['<pad>'])


## 2.2 Transformer

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from torch import Tensor
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(0)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:, :token_embedding.size(1), :])

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

class TransformerEncoderLayer(nn.Module):
    def __init__(self, emb_size, num_heads, dim_feedforward, dropout):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(emb_size, num_heads, dropout=dropout, batch_first=True)
        self.linear1 = nn.Linear(emb_size, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, emb_size)
        self.norm1 = nn.LayerNorm(emb_size)
        self.norm2 = nn.LayerNorm(emb_size)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = F.relu

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor = None, src_key_padding_mask: torch.Tensor = None):
        src2, _ = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

class ClaimVerificationTransformer(nn.Module):
    def __init__(self, num_layers: int, emb_size: int, nhead: int, vocab_size: int, dim_feedforward: int, num_classes: int, dropout: float = 0.1, maxlen: int = 5000, pad_idx: int = 0):
        super(ClaimVerificationTransformer, self).__init__()
        self.embedding = TokenEmbedding(vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout, maxlen)
        encoder_layers = nn.TransformerEncoderLayer(emb_size, nhead, dim_feedforward, dropout, batch_first=True)
        self.transformer_claim = nn.TransformerEncoder(encoder_layers, num_layers)
        self.transformer_evidence = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(emb_size, num_classes)
        self.attention = nn.Linear(emb_size, 1)
        self.pad_idx = pad_idx

    def forward(self, claims: Tensor, evidence_lists: Tensor):
        # Embed and encode claims
        embedded_claims = self.embedding(claims)
        embedded_claims = self.positional_encoding(embedded_claims)
        claims_mask = (claims == self.pad_idx)
        claims_encoded = self.transformer_claim(embedded_claims, src_key_padding_mask=claims_mask)
        #claims_encoded = claims_encoded.mean(dim=1)  # Average pooling
        claims_encoded, _ = torch.max(claims_encoded, dim=1)  # Max pooling

        batch_size = claims.size(0)
        evidence_representations = []

        for i in range(batch_size):
            evidences = evidence_lists[i]
            embedded_evidences = self.embedding(evidences)
            embedded_evidences = self.positional_encoding(embedded_evidences)
            evidences_mask = (evidences == self.pad_idx)
            evidences_encoded = self.transformer_evidence(embedded_evidences, src_key_padding_mask=evidences_mask)
            evidences_encoded = evidences_encoded.mean(dim=1)  # Average pooling
            #evidences_encoded, _ = torch.max(evidences_encoded, dim=1)  # Max pooling

            # Attention mechanism
            attention_weights = F.softmax(self.attention(evidences_encoded), dim=0)
            evidence_representation = torch.sum(attention_weights * evidences_encoded, dim=0)
            evidence_representations.append(evidence_representation)

        evidence_representations = torch.stack(evidence_representations)
        combined_representation = evidence_representations + claims_encoded
        logits = self.fc(combined_representation)
        return F.log_softmax(logits, dim=1)




## 2.3 Model Training

In [55]:
def train_model(model, train_loader, dev_loader, criterion, optimizer, device, vocab, num_epochs=10, grad_clip=1.0):
    model = model.to(device)
    criterion = criterion.to(device)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for claims, evidence_lists, labels in train_loader:
            claims = claims.to(device)
            evidence_lists = [e.to(device) for e in evidence_lists]
            labels = labels.to(device).long()

            optimizer.zero_grad()
            logits = model(claims, evidence_lists)
            loss = criterion(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{num_epochs} - Training Loss: {avg_train_loss}')

        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for claims, evidence_lists, labels in dev_loader:
                claims = claims.to(device)
                evidence_lists = [e.to(device) for e in evidence_lists]
                labels = labels.to(device).long()

                logits = model(claims, evidence_lists)
                val_loss = criterion(logits, labels)
                total_val_loss += val_loss.item()

                preds = torch.argmax(logits, dim=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(dev_loader)
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = classification_report(all_labels, all_preds, target_names=['REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO', 'DISPUTED'], zero_division=0, output_dict=True)['macro avg']['f1-score']

        print(f'Epoch {epoch + 1}/{num_epochs} - Validation Loss: {avg_val_loss}')
        print(f'Accuracy: {accuracy}')
        print(f'F1 Score: {f1}')
        print(classification_report(all_labels, all_preds, target_names=['REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO', 'DISPUTED'], zero_division=0))

        scheduler.step(avg_val_loss)



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_label_counts = train_claim_labels.value_counts()
# Convert label counts to a Tensor
train_label_counts_tensor = torch.tensor(train_label_counts.sort_index().values, dtype=torch.float)

# Calculate weights: inversely proportional to the class frequencies
class_weights = 1.0 / train_label_counts_tensor

# Normalize weights so that the smallest weight is 1.0
#class_weights = class_weights / class_weights.min()

# Move weights to the correct device (GPU or CPU)
class_weights = class_weights.to(device)





#### 2.3.1 Bi-GRU Model

In [56]:
vocab_size = len(vocab)
embedding_dim = 128
hidden_dim = 512
output_dim = 4  # Four classes now
model = ClaimEvidenceModel(vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx=0)
criterion = nn.CrossEntropyLoss(weight = class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = ClaimEvidenceDataset(train_claims_text_processed, train_claim_labels, train_evidence_idxs, evidence_text_processed)
train_loader = DataLoader(train_dataset, batch_size=36, shuffle=True, collate_fn=custom_collate_fn)
dev_dataset = ClaimEvidenceDataset(dev_claims_text_processed, dev_claim_labels, dev_evidence_idxs, evidence_text_processed)
dev_loader = DataLoader(dev_dataset, batch_size=36, shuffle=False, collate_fn=custom_collate_fn)

train_model(model, train_loader, dev_loader, criterion, optimizer, device=device, vocab=vocab, num_epochs=10)



Epoch 1/10 - Training Loss: 1.38687732560294
Epoch 1/10 - Validation Loss: 1.3888340711593627
Accuracy: 0.15584415584415584
F1 Score: 0.12366452991452992
                 precision    recall  f1-score   support

        REFUTES       0.11      0.04      0.06        27
       SUPPORTS       0.00      0.00      0.00        68
NOT ENOUGH INFO       0.71      0.12      0.21        41
       DISPUTED       0.13      1.00      0.23        18

       accuracy                           0.16       154
      macro avg       0.24      0.29      0.12       154
   weighted avg       0.22      0.16      0.09       154

Epoch 2/10 - Training Loss: 1.2939297982624598
Epoch 2/10 - Validation Loss: 1.3640209913253785
Accuracy: 0.2532467532467532
F1 Score: 0.23304084564860428
                 precision    recall  f1-score   support

        REFUTES       0.19      0.48      0.27        27
       SUPPORTS       0.00      0.00      0.00        68
NOT ENOUGH INFO       0.46      0.51      0.48        41
   

In [59]:
vocab_size = len(vocab)
embedding_dim = 128  # Increased embedding dimension for better representation
hidden_dim = 512  # Increased hidden dimension for better representation
output_dim = 4  # Four classes now
num_layers = 2  # Reduced number of transformer layers
nhead = 4  # Number of heads in multi-head attention
dim_feedforward = 56  # Feedforward network dimension
dropout = 0.4  # Dropout rate
pad_idx = vocab['<pad>']

model = ClaimVerificationTransformer(num_layers, embedding_dim, nhead, vocab_size, dim_feedforward, output_dim, dropout, pad_idx=pad_idx)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = ClaimEvidenceDataset(train_claims_text_processed, train_claim_labels, train_evidence_idxs, evidence_text_processed)
train_loader = DataLoader(train_dataset, batch_size=36, shuffle=True, collate_fn=custom_collate_fn)
dev_dataset = ClaimEvidenceDataset(dev_claims_text_processed, dev_claim_labels, dev_evidence_idxs, evidence_text_processed)
dev_loader = DataLoader(dev_dataset, batch_size=36, shuffle=False, collate_fn=custom_collate_fn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, dev_loader, criterion, optimizer, device=device, vocab=vocab, num_epochs=10)



Epoch 1/10 - Training Loss: 1.3470669439860752
Epoch 1/10 - Validation Loss: 1.2572819948196412
Accuracy: 0.45454545454545453
F1 Score: 0.1778012684989429
                 precision    recall  f1-score   support

        REFUTES       0.00      0.00      0.00        27
       SUPPORTS       0.45      1.00      0.62        68
NOT ENOUGH INFO       1.00      0.05      0.09        41
       DISPUTED       0.00      0.00      0.00        18

       accuracy                           0.45       154
      macro avg       0.36      0.26      0.18       154
   weighted avg       0.46      0.45      0.30       154

Epoch 2/10 - Training Loss: 1.2543191977909633
Epoch 2/10 - Validation Loss: 1.2629724502563477
Accuracy: 0.45454545454545453
F1 Score: 0.17940930037704234
                 precision    recall  f1-score   support

        REFUTES       0.00      0.00      0.00        27
       SUPPORTS       0.46      1.00      0.63        68
NOT ENOUGH INFO       0.67      0.05      0.09        41
 

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
from sklearn.metrics import accuracy_score
dev_dataset = ClaimEvidenceDataset(dev_claims_text_processed, dev_claim_labels, dev_evidence_idxs, evidence_text_processed)

dev_loader = DataLoader(dev_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)

model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for claims, evidences, labels in dev_loader:
        claims = claims.to(device)
        evidences = [e.to(device) for e in evidences]  # List comprehension to move each batch of evidences to GPU
        labels = labels.to(device)
        outputs = model(claims, evidences)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)

In [None]:
evidence_idx_to_id_dict = {idx: id for id, idx in evidence_id_dict.items()}

label_map_inverse = {v: k for k, v in label_map.items()}



dev_label_predictions = [label_map_inverse[pred] for pred in all_preds]
dev_converted_evidence_ids = [[evidence_idx_to_id_dict[idx] for idx in indices] for indices in dev_k_indices]

results = {}
for i, claim_id in enumerate(dev_claim_ids):
    results[claim_id] = {
        "claim_text": dev_claims_text[i],
        "claim_label": dev_label_predictions[i],
        "evidences": dev_converted_evidence_ids[i]
    }


In [None]:
with open('/content/drive/MyDrive/nlp/data/dev_predictions.json', 'w') as file:
    json.dump(results, file)

In [None]:
test_dataset = ClaimEvidenceDataset(test_claims, None, test_k_indices, evidence_text_processed)

test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)

model.eval()
all_preds = []
with torch.no_grad():
    for claims, evidences,_ in test_loader:  # Note that labels are not loaded
        claims = claims.to(device)
        evidences = [e.to(device) for e in evidences]  # Ensure evidences are moved to GPU
        outputs = model(claims, evidences)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())

# `all_preds` now contains the predicted labels for your test dataset


In [None]:
# Inverse mapping for label_map
label_map_inverse = {v: k for k, v in label_map.items()}
# Convert numerical predictions to label strings
label_predictions = [label_map_inverse[pred] for pred in all_preds]


In [None]:

evidence_idx_to_id_dict = {idx: id for id, idx in evidence_id_dict.items()}

converted_evidence_ids = [[evidence_idx_to_id_dict[idx] for idx in indices] for indices in test_k_indices]

In [None]:
converted_evidence_ids[0]

['evidence-55562',
 'evidence-1032935',
 'evidence-60163',
 'evidence-225665',
 'evidence-377026']

In [None]:
test_claims_id
test_claims_text
label_predictions
converted_evidence_ids


In [None]:
json.dump(label_predictions, open("/content/drive/MyDrive/nlp/data/test_label_predictions.json", "w"))

In [None]:
results = {}
for i, claim_id in enumerate(test_claims_id):
    results[claim_id] = {
        "claim_text": test_claims_text[i],
        "claim_label": label_predictions[i],
        "evidences": converted_evidence_ids[i]
    }

In [None]:
import json

# Convert to JSON string
json_output = json.dumps(results, indent=4)
print(json_output)

# Save to a JSON file
with open('/content/drive/MyDrive/nlp/data/test_predictions.json', 'w') as file:
    json.dump(results, file)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*