# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [1]:
import pandas as pd

import torch
import math
import json
import nltk
import numpy as np


import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter, OrderedDict


from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

evidences = pd.read_json('/content/drive/MyDrive/nlp/data/evidence.json', orient='index')
train_claims = pd.read_json('/content/drive/MyDrive/nlp/data/train-claims.json', orient='index')
dev_claims = pd.read_json('/content/drive/MyDrive/nlp/data/dev-claims.json', orient='index')

#update column names
evidences.reset_index(inplace=True)
evidences.columns = ['evidence_id', 'evidence_text']

train_claims.reset_index(inplace=True)
train_claims.rename(columns={'index': 'claim_id'}, inplace=True)

dev_claims.reset_index(inplace=True)
dev_claims.rename(columns={'index': 'claim_id'}, inplace=True)

evidence_id = evidences['evidence_id']
evidence_text = evidences['evidence_text']
evidence_idx = evidences.index.tolist()

evidence_id_dict = dict(zip(evidence_id, evidence_idx))

train_claims_text = train_claims['claim_text']
train_evidence_ids = train_claims['evidences']
train_claim_labels = train_claims['claim_label']
#map evidence_id to their corrosponding index for faster processing
train_evidence_idxs = train_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

dev_claims_text = dev_claims['claim_text']
dev_claim_labels = dev_claims['claim_label']
dev_evidence_ids = dev_claims['evidences']
dev_evidence_idxs = dev_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

Mounted at /content/drive


In [None]:
test_claims = pd.read_json('/content/drive/MyDrive/nlp/data/test-claims-unlabelled.json', orient='index')
test_claims.reset_index(inplace=True)
test_claims.columns = ['claim_id', 'claim_text']
test_claims_text = test_claims['claim_text']
test_claims_id = test_claims['claim_id']

In [None]:
dev_claim_ids = dev_claims['claim_id']

In [3]:
import json
dev_evidence_indices = json.load(open("/content/drive/MyDrive/nlp/data/reranked_indices.json", "r"))
test_evidence_indices = json.load(open("/content/drive/MyDrive/nlp/data/test_reranked_indices.json", "r"))

In [4]:
dev_k_indices = [sublist[:5] if len(sublist) >= 5 else sublist + [None] * (5 - len(sublist)) for sublist in dev_evidence_indices]
test_k_indices = [sublist[:5] if len(sublist) >= 5 else sublist + [None] * (5 - len(sublist)) for sublist in test_evidence_indices]

In [5]:
tt = TweetTokenizer()
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_data(text):
    tokens = tt.tokenize(text.lower())
    return tokens

train_claims_text_processed = train_claims_text.apply(preprocess_data)
dev_claims_text_processed = dev_claims_text.apply(preprocess_data)
evidence_text_processed = evidence_text.apply(preprocess_data)


In [6]:
def build_vocab(texts, min_freq=3):
    # Count all the words
    word_freq = Counter()
    for text in texts:
        word_freq.update(text)

    # Start vocab from special tokens
    vocab = OrderedDict({
        "<pad>": 0,
        "<unk>": 1,
        "<sos>": 2,
        "<eos>": 3
    })
    index = 4  # Start indexing from 4 because 0-3 are reserved for special tokens
    for word, freq in word_freq.items():
        if freq >= min_freq:  # Only include words that meet the frequency threshold
            vocab[word] = index
            index += 1

    return vocab

# Build vocabulary using only evidence texts and applying the frequency threshold
vocab = build_vocab(evidence_text_processed, min_freq=3)

In [7]:
label_map = {
    "REFUTES": 0,
    "SUPPORTS": 1,
    "NOT_ENOUGH_INFO": 2,
    "DISPUTED": 3
}
train_claim_labels = train_claims['claim_label'].map(label_map)
dev_claim_labels = dev_claims['claim_label'].map(label_map)

In [8]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def text_to_indices(text, vocab):
    return [vocab.get(word, vocab["<unk>"]) for word in text]

class BinaryClaimEvidenceDataset(Dataset):
    def __init__(self, claims, evidence_indices, evidences, claim_labels, vocab):
        self.claims = claims
        self.evidence_indices = evidence_indices
        self.evidences = evidences
        self.claim_labels = claim_labels
        self.vocab = vocab
        self.pairs = self.create_pairs()

    def create_pairs(self):
        pairs = []
        for idx, claim in enumerate(self.claims):
            label = self.claim_labels[idx]
            if label in [0, 1]:  # Only consider REFUTES (0) and SUPPORTS (1)
                candidate_pos_indices = self.evidence_indices[idx]
                positive_evidences = [self.evidences[i] for i in candidate_pos_indices]
                for evidence in positive_evidences:
                    pairs.append((claim, evidence, label))
        return pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        claim, evidence, label = self.pairs[idx]
        claim_indices = text_to_indices(claim, self.vocab)
        evidence_indices = text_to_indices(evidence, self.vocab)
        claim_indices = [self.vocab["<sos>"]] + claim_indices + [self.vocab["<eos>"]]
        evidence_indices = [self.vocab["<sos>"]] + evidence_indices + [self.vocab["<eos>"]]
        return claim_indices, evidence_indices, label

def custom_collate_fn(batch):
    claims, evidences, labels = zip(*batch)
    claims_tensor = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in claims], batch_first=True, padding_value=vocab["<pad>"]).to(device)
    evidences_tensor = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in evidences], batch_first=True, padding_value=vocab["<pad>"]).to(device)
    labels_tensor = torch.tensor(labels, dtype=torch.float).to(device)
    return claims_tensor, evidences_tensor, labels_tensor




# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 2.1 Bi-Directional GRU (Baseline)

In [46]:
class ClaimEvidenceBaseModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx, dropout=0.5):
        super(ClaimEvidenceBaseModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 4, 1)  # *4 because we will concatenate hidden states of claims and evidences
        self.dropout = nn.Dropout(dropout)
        self.pad_idx = pad_idx

    def forward(self, claims, evidences):
        # Embed and encode claims
        embedded_claims = self.embedding(claims)
        embedded_claims = self.dropout(embedded_claims)
        claims_mask = (claims != self.pad_idx).unsqueeze(2).float()
        embedded_claims *= claims_mask

        _, hidden_claims = self.gru(embedded_claims)
        hidden_claims = torch.cat((hidden_claims[-2,:,:], hidden_claims[-1,:,:]), dim=1)

        # Embed and encode evidences
        embedded_evidences = self.embedding(evidences)
        embedded_evidences = self.dropout(embedded_evidences)
        evidences_mask = (evidences != self.pad_idx).unsqueeze(2).float()
        embedded_evidences *= evidences_mask

        _, hidden_evidences = self.gru(embedded_evidences)
        hidden_evidences = torch.cat((hidden_evidences[-2,:,:], hidden_evidences[-1,:,:]), dim=1)

        # Combine claim and evidence representations by concatenation
        combined_representation = torch.cat((hidden_claims, hidden_evidences), dim=1)
        combined_representation = self.dropout(combined_representation)
        logits = self.fc(combined_representation).squeeze(-1)  # Ensure the output is of shape (batch_size)

        return logits


# Model instantiation
model = ClaimEvidenceBaseModel(vocab_size=len(vocab), embedding_dim=100, hidden_dim=256, pad_idx=vocab['<pad>'])


## Bi-Directional GRU + Attention

In [51]:
class ClaimEvidenceAttnModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx, dropout=0.5):
        super(ClaimEvidenceAttnModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 4, 1)  # *4 because we will concatenate hidden states of claims and evidences
        self.dropout = nn.Dropout(dropout)
        self.pad_idx = pad_idx
        self.attn_claims = nn.Linear(hidden_dim * 2, 1, bias=False)
        self.attn_evidences = nn.Linear(hidden_dim * 2, 1, bias=False)

    def attention(self, gru_output, attn_layer, mask):
        attn_energies = attn_layer(gru_output).squeeze(2)  # (batch_size, seq_len)
        attn_energies = attn_energies.masked_fill(mask.squeeze(2) == 0, -1e10)  # Apply mask
        attn_weights = F.softmax(attn_energies, dim=1)  # (batch_size, seq_len)
        context = torch.bmm(attn_weights.unsqueeze(1), gru_output).squeeze(1)  # (batch_size, hidden_dim * 2)
        return context

    def forward(self, claims, evidences):
        # Embed and encode claims
        embedded_claims = self.embedding(claims)
        embedded_claims = self.dropout(embedded_claims)
        claims_mask = (claims != self.pad_idx).unsqueeze(2).float()
        embedded_claims *= claims_mask

        gru_output_claims, _ = self.gru(embedded_claims)  # (batch_size, seq_len, hidden_dim * 2)
        claims_context = self.attention(gru_output_claims, self.attn_claims, claims_mask)

        # Embed and encode evidences
        embedded_evidences = self.embedding(evidences)
        embedded_evidences = self.dropout(embedded_evidences)
        evidences_mask = (evidences != self.pad_idx).unsqueeze(2).float()
        embedded_evidences *= evidences_mask

        gru_output_evidences, _ = self.gru(embedded_evidences)  # (batch_size, seq_len, hidden_dim * 2)
        evidences_context = self.attention(gru_output_evidences, self.attn_evidences, evidences_mask)

        # Combine claim and evidence representations by concatenation
        combined_representation = torch.cat((claims_context, evidences_context), dim=1)
        combined_representation = self.dropout(combined_representation)
        logits = self.fc(combined_representation).squeeze(-1)  # Ensure the output is of shape (batch_size)

        return logits


## 2.2 Model Training

In [48]:
def train_model(model, train_loader, dev_loader, criterion, optimizer, device, num_epochs=10, grad_clip=1.0, threshold=0.8):
    model = model.to(device)  # Ensure the model is on the right device
    criterion = criterion.to(device)  # Also move the criterion to the GPU if available

    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for claims, evidences, labels in train_loader:
            claims = claims.to(device)
            evidences = evidences.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(claims, evidences).squeeze()
            loss = criterion(logits, labels)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{num_epochs} - Training Loss: {avg_train_loss}')

        # Evaluate on the development set
        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for claims, evidences, labels in dev_loader:
                claims = claims.to(device)
                evidences = evidences.to(device)
                labels = labels.to(device)

                logits = model(claims, evidences).squeeze()
                val_loss = criterion(logits, labels)
                total_val_loss += val_loss.item()

                probs = torch.sigmoid(logits)  # Convert logits to probabilities
                preds = torch.where(probs > threshold, 1, 0).cpu().numpy()  # Convert probabilities to 0 and 1

                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(dev_loader)

        # Calculate evaluation metrics
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = classification_report(all_labels, all_preds, target_names=['REFUTES', 'SUPPORTS'], zero_division=0, output_dict=True)['macro avg']['f1-score']

        print(f'Epoch {epoch + 1}/{num_epochs} - Validation Loss: {avg_val_loss}')
        print(f'Accuracy: {accuracy}')
        print(f'F1 Score: {f1}')
        print(classification_report(all_labels, all_preds, target_names=['REFUTES', 'SUPPORTS'], zero_division=0))

        # Step the learning rate scheduler
        scheduler.step(avg_val_loss)

### Baseline Training

In [49]:
num_refutes = (train_claim_labels == 0).sum()
num_supports = (train_claim_labels == 1).sum()
pos_weight = num_refutes / (num_supports*2)

train_dataset = BinaryClaimEvidenceDataset(train_claims_text_processed, train_evidence_idxs, evidence_text_processed, train_claim_labels, vocab)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
dev_dataset = BinaryClaimEvidenceDataset(dev_claims_text_processed, dev_evidence_idxs, evidence_text_processed, dev_claim_labels, vocab)
dev_loader = DataLoader(dev_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)
# Training the model
model = ClaimEvidenceBaseModel(vocab_size=len(vocab), embedding_dim=300, hidden_dim=512, pad_idx=vocab['<pad>'])


criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight).to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

train_model(model, train_loader, dev_loader, criterion, optimizer, device=device, num_epochs=15)

Epoch 1/15 - Training Loss: 0.2611047856311883
Epoch 1/15 - Validation Loss: 0.2504979223012924
Accuracy: 0.25
F1 Score: 0.2
              precision    recall  f1-score   support

     REFUTES       0.25      1.00      0.40        57
    SUPPORTS       0.00      0.00      0.00       171

    accuracy                           0.25       228
   macro avg       0.12      0.50      0.20       228
weighted avg       0.06      0.25      0.10       228

Epoch 2/15 - Training Loss: 0.24952434201156143
Epoch 2/15 - Validation Loss: 0.241336427628994
Accuracy: 0.25
F1 Score: 0.2
              precision    recall  f1-score   support

     REFUTES       0.25      1.00      0.40        57
    SUPPORTS       0.00      0.00      0.00       171

    accuracy                           0.25       228
   macro avg       0.12      0.50      0.20       228
weighted avg       0.06      0.25      0.10       228

Epoch 3/15 - Training Loss: 0.23664563511852668
Epoch 3/15 - Validation Loss: 0.2325946912169456

### Attention Model

In [53]:
num_refutes = (train_claim_labels == 0).sum()
num_supports = (train_claim_labels == 1).sum()
pos_weight = num_refutes / (num_supports*2)

train_dataset = BinaryClaimEvidenceDataset(train_claims_text_processed, train_evidence_idxs, evidence_text_processed, train_claim_labels, vocab)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
dev_dataset = BinaryClaimEvidenceDataset(dev_claims_text_processed, dev_evidence_idxs, evidence_text_processed, dev_claim_labels, vocab)
dev_loader = DataLoader(dev_dataset, batch_size=128, shuffle=False, collate_fn=custom_collate_fn)
# Training the model
model = ClaimEvidenceAttnModel(vocab_size=len(vocab), embedding_dim=300, hidden_dim=512, pad_idx=vocab['<pad>'])
#model = ConcatenatedClaimEvidenceModel(vocab_size=len(vocab), embedding_dim=300, hidden_dim=512, pad_idx=vocab['<pad>'])
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight).to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

train_model(model, train_loader, dev_loader, criterion, optimizer, device=device, num_epochs=15, threshold=0.8)







Epoch 1/15 - Training Loss: 0.25629022546046604
Epoch 1/15 - Validation Loss: 0.2503948360681534
Accuracy: 0.25
F1 Score: 0.2
              precision    recall  f1-score   support

     REFUTES       0.25      1.00      0.40        57
    SUPPORTS       0.00      0.00      0.00       171

    accuracy                           0.25       228
   macro avg       0.12      0.50      0.20       228
weighted avg       0.06      0.25      0.10       228

Epoch 2/15 - Training Loss: 0.23562038142596725
Epoch 2/15 - Validation Loss: 0.23958025872707367
Accuracy: 0.25
F1 Score: 0.2
              precision    recall  f1-score   support

     REFUTES       0.25      1.00      0.40        57
    SUPPORTS       0.00      0.00      0.00       171

    accuracy                           0.25       228
   macro avg       0.12      0.50      0.20       228
weighted avg       0.06      0.25      0.10       228

Epoch 3/15 - Training Loss: 0.20636684355218854
Epoch 3/15 - Validation Loss: 0.2352152839303

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [36]:
def evaluate_model(model, claims, evidence_idxs, evidences, claim_labels, vocab, pad_idx, device, support_threshold=0.95, refute_threshold=0.05):
    model.eval()
    all_preds = []
    all_labels = []
    all_evidence_predictions = []
    all_evidence_probs = []

    for claim_tokens, evidence_idx_list, true_label in zip(claims, evidence_idxs, claim_labels):
        # Numericalize the claim tokens
        claim_indices = text_to_indices(claim_tokens, vocab)
        claim_indices = [vocab["<sos>"]] + claim_indices + [vocab["<eos>"]]
        claim_tensor = torch.tensor(claim_indices, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension

        evidence_tensors = []
        for idx in evidence_idx_list:
            evidence_tokens = evidences[idx]
            evidence_indices = text_to_indices(evidence_tokens, vocab)
            evidence_indices = [vocab["<sos>"]] + evidence_indices + [vocab["<eos>"]]
            evidence_tensor = torch.tensor(evidence_indices, dtype=torch.long).to(device)
            evidence_tensors.append(evidence_tensor)

        # Pad evidence tensors to the same length
        evidence_tensors_padded = pad_sequence(evidence_tensors, batch_first=True, padding_value=pad_idx).to(device)

        evidence_predictions = []
        evidence_probs = []
        with torch.no_grad():
            for evidence_tensor in evidence_tensors_padded:
                evidence_tensor = evidence_tensor.unsqueeze(0)  # Add batch dimension
                logits = model(claim_tensor, evidence_tensor)
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                evidence_predictions.extend(preds)
                #prob = torch.sigmoid(logits).item()
                #evidence_probs.append(prob)


        aggregated_prediction = aggregate_predictions(evidence_predictions)
        all_preds.append(aggregated_prediction)
        all_labels.append(true_label)
        all_evidence_predictions.append(evidence_predictions)
        all_evidence_probs.append(evidence_probs)

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=['REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO', 'DISPUTED'], zero_division=0)
    return accuracy, report, all_preds, all_labels, all_evidence_predictions

def aggregate_predictions(evidence_predictions):
    counter = Counter(evidence_predictions)
    num_supports = counter[1]
    num_refutes = counter[0]

    # Handle conflicts: if both SUPPORTS and REFUTES are present
    if num_supports > 0 and num_refutes > 0:
        return 3  # DISPUTED if there are both SUPPORTS and REFUTES

    # Determine the class if all predictions are either SUPPORTS or REFUTES (with or without NOT ENOUGH INFO)
    if num_supports > 0 and num_refutes == 0:
        return 1  # SUPPORTS

    if num_refutes > 0 and num_supports == 0:
        return 0  # REFUTES

    # Default to NOT ENOUGH INFO if there are no SUPPORTS or REFUTES
    return 2


# Evaluate the model on the training set
accuracy, report, all_preds, all_labels, all_evidence_predictions = evaluate_model(model, dev_claims_text_processed, dev_k_indices, evidence_text_processed, dev_claim_labels, vocab, vocab["<pad>"], device)
print(f'Accuracy: {accuracy}')
print(report)

Accuracy: 0.4155844155844156
                 precision    recall  f1-score   support

        REFUTES       0.24      0.19      0.21        27
       SUPPORTS       0.50      0.63      0.56        68
NOT ENOUGH INFO       0.44      0.34      0.38        41
       DISPUTED       0.13      0.11      0.12        18

       accuracy                           0.42       154
      macro avg       0.33      0.32      0.32       154
   weighted avg       0.39      0.42      0.40       154



In [82]:
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

def evaluate_model(model, claims, evidence_idxs, evidences, claim_labels, vocab, pad_idx, device, support_threshold=0.95, refute_threshold=0.05):
    model.eval()
    all_preds = []
    all_labels = []
    all_evidence_predictions = []
    all_evidence_probs = []

    for claim_tokens, evidence_idx_list, true_label in zip(claims, evidence_idxs, claim_labels):
        # Numericalize the claim tokens
        claim_indices = text_to_indices(claim_tokens, vocab)
        claim_indices = [vocab["<sos>"]] + claim_indices + [vocab["<eos>"]]
        claim_tensor = torch.tensor(claim_indices, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension

        evidence_tensors = []
        for idx in evidence_idx_list:
            evidence_tokens = evidences[idx]
            evidence_indices = text_to_indices(evidence_tokens, vocab)
            evidence_indices = [vocab["<sos>"]] + evidence_indices + [vocab["<eos>"]]
            evidence_tensor = torch.tensor(evidence_indices, dtype=torch.long).to(device)
            evidence_tensors.append(evidence_tensor)

        # Pad evidence tensors to the same length
        evidence_tensors_padded = pad_sequence(evidence_tensors, batch_first=True, padding_value=pad_idx).to(device)

        evidence_predictions = []
        evidence_probs = []
        with torch.no_grad():
            for evidence_tensor in evidence_tensors_padded:
                evidence_tensor = evidence_tensor.unsqueeze(0)  # Add batch dimension
                logits = model(claim_tensor, evidence_tensor).squeeze()
                prob = torch.sigmoid(logits).item()
                evidence_probs.append(prob)
                if prob > support_threshold:
                    evidence_predictions.append(1)  # SUPPORTS
                elif prob < refute_threshold:
                    evidence_predictions.append(0)  # REFUTES
                else:
                    evidence_predictions.append(2)  # NOT ENOUGH INFO

        aggregated_prediction = aggregate_predictions(evidence_predictions)
        all_preds.append(aggregated_prediction)
        all_labels.append(true_label)
        all_evidence_predictions.append(evidence_predictions)
        all_evidence_probs.append(evidence_probs)

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=['REFUTES', 'SUPPORTS', 'NOT ENOUGH INFO', 'DISPUTED'], zero_division=0)
    return accuracy, report, all_preds, all_labels, all_evidence_predictions

def aggregate_predictions(evidence_predictions):
    counter = Counter(evidence_predictions)
    num_supports = counter[1]
    num_refutes = counter[0]

    # Handle conflicts: if both SUPPORTS and REFUTES are present
    if num_supports > 0 and num_refutes > 0:
        return 3  # DISPUTED if there are both SUPPORTS and REFUTES

    # Determine the class if all predictions are either SUPPORTS or REFUTES (with or without NOT ENOUGH INFO)
    if num_supports > 0 and num_refutes == 0:
        return 1  # SUPPORTS

    if num_refutes > 0 and num_supports == 0:
        return 0  # REFUTES

    # Default to NOT ENOUGH INFO if there are no SUPPORTS or REFUTES
    return 2


# Evaluate the model on the training set
accuracy, report, all_preds, all_labels, all_evidence_predictions = evaluate_model(model, dev_claims_text_processed, dev_k_indices, evidence_text_processed, dev_claim_labels, vocab, vocab["<pad>"], device)
print(f'Accuracy: {accuracy}')
print(report)


Accuracy: 0.4155844155844156
                 precision    recall  f1-score   support

        REFUTES       0.31      0.15      0.20        27
       SUPPORTS       0.52      0.72      0.60        68
NOT ENOUGH INFO       0.23      0.27      0.25        41
       DISPUTED       0.00      0.00      0.00        18

       accuracy                           0.42       154
      macro avg       0.27      0.28      0.26       154
   weighted avg       0.35      0.42      0.37       154



In [86]:
def greedy_search_best_thresholds(model, claims, evidence_idxs, evidences, claim_labels, vocab, pad_idx, device):
    best_accuracy = 0.1
    best_support_threshold = 0.85
    best_refute_threshold = 0.3

    # Define the ranges for the thresholds
    support_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    refute_thresholds = [0.01, 0.05, 0.1, 0.2]

    for support_threshold in support_thresholds:
        for refute_threshold in refute_thresholds:
            accuracy, report, all_preds, all_labels,_ = evaluate_model(model, dev_claims_text_processed, dev_k_indices, evidence_text_processed, dev_claim_labels, vocab, vocab["<pad>"], device, support_threshold=support_threshold, refute_threshold=refute_threshold)
            print(accuracy)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_support_threshold = support_threshold
                best_refute_threshold = refute_threshold

    return best_support_threshold, best_refute_threshold, best_accuracy

# Usage
best_support_threshold, best_refute_threshold, best_accuracy = greedy_search_best_thresholds(
    model, dev_claims_text_processed, dev_evidence_idxs, evidence_text_processed, dev_claim_labels, vocab, vocab["<pad>"], device
)

print(f"Best Support Threshold: {best_support_threshold}")
print(f"Best Refute Threshold: {best_refute_threshold}")
print(f"Best Accuracy: {best_accuracy}")


0.461038961038961
0.44155844155844154
0.45454545454545453
0.45454545454545453
0.4675324675324675
0.44805194805194803
0.461038961038961
0.45454545454545453
0.461038961038961
0.44155844155844154
0.45454545454545453
0.44805194805194803
0.44805194805194803
0.42857142857142855
0.44155844155844154
0.43506493506493504
0.42857142857142855
0.4090909090909091
0.42207792207792205
0.4155844155844156
0.44805194805194803
0.42857142857142855
0.44155844155844154
0.43506493506493504
Best Support Threshold: 0.4
Best Refute Threshold: 0.01
Best Accuracy: 0.4675324675324675


## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*