# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 1.1 Data Loading

In [1]:
#Import packages
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math
import random
from collections import defaultdict, Counter, OrderedDict
import numpy as np
from scipy.sparse import csr_matrix, diags
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

evidences = pd.read_json('/content/drive/MyDrive/nlp/data/evidence.json', orient='index')
train_claims = pd.read_json('/content/drive/MyDrive/nlp/data/train-claims.json', orient='index')
dev_claims = pd.read_json('/content/drive/MyDrive/nlp/data/dev-claims.json', orient='index')

#update column names
evidences.reset_index(inplace=True)
evidences.columns = ['evidence_id', 'evidence_text']

train_claims.reset_index(inplace=True)
train_claims.rename(columns={'index': 'claim_id'}, inplace=True)

dev_claims.reset_index(inplace=True)
dev_claims.rename(columns={'index': 'claim_id'}, inplace=True)

evidence_id = evidences['evidence_id']
evidence_text = evidences['evidence_text']
evidence_idx = evidences.index.tolist()

evidence_id_dict = dict(zip(evidence_id, evidence_idx))

train_claims_text = train_claims['claim_text']
train_evidence_ids = train_claims['evidences']
#map evidence_id to their corrosponding index for faster processing
train_evidence_idxs = train_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

dev_claims_text = dev_claims['claim_text']
dev_evidence_ids = dev_claims['evidences']
dev_evidence_idxs = dev_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

Mounted at /content/drive


## 1.2 Text Preprocessing

In [None]:
#text preprocessing
tt = TweetTokenizer()
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_data(text):
    tokens = tt.tokenize(text.lower())

    processed_tokens = []

    for token in tokens:
        token = token.lower()
        if token not in stopwords and token.isalpha():
            stemmed_token = stemmer.stem(token)
            processed_tokens.append(stemmed_token)

    return processed_tokens

train_claims_text_processed = train_claims_text.apply(preprocess_data)
dev_claims_text_precessed = dev_claims_text.apply(preprocess_data)
evidence_text_processed = evidence_text.apply(preprocess_data)

## 1.3 BM25 Scores (For Negative sampling)

In [4]:
# Build inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    for i, doc in enumerate(documents):
        term_freq = defaultdict(int)
        for term in doc:
            term_freq[term] += 1
        for term, freq in term_freq.items():
            inverted_index[term].append((i, freq))
    return inverted_index

# Compute IDF values
def compute_idf_values(inverted_index, total_documents):
    idf_values = {}
    for term, postings in inverted_index.items():
        idf_values[term] = np.log((total_documents + 1) / (len(postings) + 1))
    return idf_values

# Calculate average document length
def calculate_avg_doc_length(documents):
    total_length = sum(len(doc) for doc in documents)
    return total_length / len(documents)
#k.1.0, b = 0.78 recall=0.14
# Compute BM25 scores
def bm25_scores(query, inverted_index, idf_values, avg_doc_length, k1=0.4, b=0.9):
    scores = defaultdict(float)
    for term in query:
        if term not in inverted_index:
            continue
        doc_list = inverted_index[term]
        idf = idf_values[term]
        for doc_id, tf in doc_list:
            # Compute BM25 score for this document
            doc_length = len(evidence_text_processed[doc_id])
            numerator = idf * tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * (doc_length / avg_doc_length))
            scores[doc_id] += numerator / denominator
    return scores

# Build inverted index for evidence text
inverted_index = build_inverted_index(evidence_text_processed)
total_documents = len(evidence_text_processed)
idf_values = compute_idf_values(inverted_index, total_documents)
avg_doc_length = calculate_avg_doc_length(evidence_text_processed)

# Example usage
train_bm25_results = []
for query in train_claims_text_processed:
    scores = bm25_scores(query, inverted_index, idf_values, avg_doc_length)
    train_bm25_results.append(scores)

dev_bm25_results = []
for query in dev_claims_text_precessed:
    scores = bm25_scores(query, inverted_index, idf_values, avg_doc_length)
    dev_bm25_results.append(scores)

In [5]:
train_reranked_indices = [[doc_id for doc_id, _ in sorted(scores.items(), key=lambda x: x[1], reverse=True)] for scores in train_bm25_results]
train_reranked_scores = [[score for _, score in sorted(scores.items(), key=lambda x: x[1], reverse=True)] for scores in train_bm25_results]

dev_reranked_indices = [[doc_id for doc_id, _ in sorted(scores.items(), key=lambda x: x[1], reverse=True)] for scores in dev_bm25_results]
dev_reranked_scores = [[score for _, score in sorted(scores.items(), key=lambda x: x[1], reverse=True)] for scores in dev_bm25_results]

def topk_indices(indices, k=100):
    return [indices[i][:min(k, len(indices[i]))] for i in range(len(indices))]

train_top_indices = topk_indices(train_reranked_indices, k=50)
dev_top_indices = topk_indices(dev_reranked_indices, k=50)
dev_top_scores = topk_indices(dev_reranked_scores, k=50)

In [None]:
def calculate_average_recall(top_k_indices, true_indices):

    recall_values = []

    # Iterate over each pair of top_k_indices and corresponding true indices
    for top_indices, true_inds in zip(top_k_indices, true_indices):
        # Calculate the number of true positives
        true_positives = len(set(top_indices) & set(true_inds))

        # Calculate recall for this claim
        recall = true_positives / len(true_inds) if true_inds else 0  # Avoid division by zero

        # Append the recall for this claim to the list
        recall_values.append(recall)

    # Compute average recall over all claims
    avg_recall = sum(recall_values) / len(recall_values) if recall_values else 0  # Avoid division by zero if list is empty

    return avg_recall

avg_recall = calculate_average_recall(dev_top_indices, dev_evidence_idxs)
print("Average Recall:", avg_recall)

In [28]:
def build_vocab(texts, min_freq=3):
    # Count all the words
    word_freq = Counter()
    for text in texts:
        word_freq.update(text)

    # Start vocab from special tokens
    vocab = OrderedDict({
        "<pad>": 0,
        "<unk>": 1,
        "<sos>": 2,
        "<eos>": 3
    })
    index = 4  # Start indexing from 4 because 0-3 are reserved for special tokens
    for word, freq in word_freq.items():
        if freq >= min_freq:  # Only include words that meet the frequency threshold
            vocab[word] = index
            index += 1

    return vocab

# Build vocabulary using only evidence texts and applying the frequency threshold
vocab = build_vocab(evidence_text_processed, min_freq=3)

## 1.4 Dataset Loading

### 1.4.1 In-Batch Negatives

In [49]:
def text_to_indices(text, vocab):
    return [vocab["<sos>"]] + [vocab.get(token, vocab['<unk>']) for token in text] + [vocab["<eos>"]]


class RankingDatasetInBatch(Dataset):
    def __init__(self, claims, evidences, true_indices):
        self.claims = claims
        self.evidences = evidences
        self.true_indices = true_indices

    def __len__(self):
        return len(self.claims)

    def __getitem__(self, idx):
        pos_idx = random.choice(self.true_indices[idx])  # Randomly sample one positive evidence index
        claim = self.claims[idx]
        pos_evidence = self.evidences[pos_idx]
        return claim, pos_evidence

def inbatch_collate_fn(batch):
    claims, pos_evidences = zip(*batch)

    # Prepare claims and positive evidences
    claims_indices = [text_to_indices(claim, vocab) for claim in claims]
    pos_indices = [text_to_indices(evidence, vocab) for evidence in pos_evidences]

    # Pad sequences for claims and positive evidences
    claims_padded = pad_sequence([torch.tensor(ci, dtype=torch.long) for ci in claims_indices], batch_first=True, padding_value=vocab['<pad>'])
    pos_padded = pad_sequence([torch.tensor(pi, dtype=torch.long) for pi in pos_indices], batch_first=True, padding_value=vocab['<pad>'])

    # Generate in-batch negatives: Each claim gets the positive samples of all other claims as its negatives.
    neg_padded_list = []
    max_length = max([len(pi) for pi in pos_indices])  # Find the maximum length of positive evidences in the batch

    for i in range(len(batch)):
        neg_samples = [pos_indices[j] for j in range(len(batch)) if i != j]

        # Pad each negative sample to the maximum length
        neg_padded = [F.pad(torch.tensor(ni, dtype=torch.long), (0, max_length - len(ni)), value=vocab['<pad>']) for ni in neg_samples]
        neg_padded_stack = torch.stack(neg_padded, dim=0)
        neg_padded_list.append(neg_padded_stack)

    # Stack the list of negative batches to form a single tensor
    neg_padded_stack = torch.stack(neg_padded_list, dim=0)

    return claims_padded, pos_padded, neg_padded_stack


### 1.4.2 In-Batch + Top Negative

In [50]:
class RankingDatasetInBatchGold(Dataset):
    def __init__(self, claims, evidences, true_indices, top_indices):
        self.claims = claims
        self.evidences = evidences
        self.true_indices = true_indices
        self.top_indices = top_indices

    def __len__(self):
        return len(self.claims)

    def __getitem__(self, idx):
        pos_idx = random.choice(self.true_indices[idx])  # Randomly sample one positive evidence index
        claim = self.claims[idx]
        pos_evidence = self.evidences[pos_idx]

        # Sample a negative from the top indices
        top_neg_indices = [i for i in self.top_indices[idx] if i not in self.true_indices[idx]]
        neg_idx = random.choice(top_neg_indices[:20])  # Choose one from the top 20 indices that are not true indices
        neg_evidence = self.evidences[neg_idx]

        return claim, pos_evidence, neg_evidence

def inbatch_gold_collate_fn(batch):
    claims, pos_evidences, neg_evidences = zip(*batch)

    # Prepare claims, positive evidences, and sampled negative evidences
    claims_indices = [text_to_indices(claim, vocab) for claim in claims]
    pos_indices = [text_to_indices(evidence, vocab) for evidence in pos_evidences]
    neg_indices = [text_to_indices(evidence, vocab) for evidence in neg_evidences]

    # Pad sequences for claims, positive evidences, and sampled negative evidences
    claims_padded = pad_sequence([torch.tensor(ci, dtype=torch.long) for ci in claims_indices], batch_first=True, padding_value=vocab['<pad>'])
    pos_padded = pad_sequence([torch.tensor(pi, dtype=torch.long) for pi in pos_indices], batch_first=True, padding_value=vocab['<pad>'])
    neg_padded = pad_sequence([torch.tensor(ni, dtype=torch.long) for ni in neg_indices], batch_first=True, padding_value=vocab['<pad>'])

    # Generate in-batch negatives: Each claim gets the positive samples of all other claims as its negatives.
    neg_padded_list = []
    for i in range(len(batch)):
        neg_samples = [pos_indices[j] for j in range(len(batch)) if i != j]
        neg_samples.append(neg_indices[i])  # Add the sampled negative evidence for this claim
        neg_padded_stack = pad_sequence([torch.tensor(ni, dtype=torch.long) for ni in neg_samples], batch_first=True, padding_value=vocab['<pad>'])
        neg_padded_list.append(neg_padded_stack)

    # Find the maximum length of the negative sequences in the neg_padded_list
    max_length = max([neg.size(1) for neg in neg_padded_list])

    # Pad each sequence in the neg_padded_list to the maximum length
    neg_padded_list = [F.pad(neg, (0, max_length - neg.size(1)), value=vocab['<pad>']) for neg in neg_padded_list]

    # Stack the list of negative batches to form a single tensor
    combined_neg_padded_stack = torch.stack(neg_padded_list, dim=0)

    return claims_padded, pos_padded, combined_neg_padded_stack

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 2.1 Transformer Encoder (SiameseNetwork)


In [45]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(0)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        token_embedding = token_embedding + self.pos_embedding[:, :token_embedding.size(1), :]
        return self.dropout(token_embedding)

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size: int):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        embeddings = self.embedding(tokens.long()) * math.sqrt(self.emb_size)
        return embeddings

class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, nhead=8, num_encoder_layers=3, dim_feedforward=512, dropout=0.5):
        super(TransformerEncoder, self).__init__()
        self.token_embedding = TokenEmbedding(vocab_size, embed_dim)
        self.pos_encoder = PositionalEncoding(embed_dim, dropout)
        self.layer_norm = nn.LayerNorm(embed_dim)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_encoder_layers)
        self.attention = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, src, src_key_padding_mask):
        src = self.token_embedding(src)
        src = self.pos_encoder(src)
        src = self.layer_norm(src)
        output = self.transformer_encoder(src, src_key_padding_mask=src_key_padding_mask)

        # Apply attention mechanism
        attn_output, _ = self.attention(output, output, output, key_padding_mask=src_key_padding_mask)

        # Compute attention weights
        attn_weights = torch.softmax(self.fc(attn_output), dim=1)

        # Compute context vector as a weighted sum of the encoder outputs
        context_vector = torch.sum(attn_weights * attn_output, dim=1)

        return context_vector

class SiameseTransformer(nn.Module):
    def __init__(self, vocab_size: int, embed_dim: int, nhead: int = 8, num_encoder_layers: int = 3,
                 dim_feedforward: int = 512, dropout: float = 0.5):
        super(SiameseTransformer, self).__init__()
        self.encoder = TransformerEncoder(vocab_size, embed_dim, nhead, num_encoder_layers, dim_feedforward, dropout)

    def forward(self, claims: torch.Tensor, evidences: torch.Tensor):
        # Create mask on the fly
        claims_mask = (claims == 0)
        evidences_mask = (evidences == 0)

        claims_enc = self.encoder(claims, claims_mask)
        evidences_enc = self.encoder(evidences, evidences_mask)

        # Ensure the encodings have the correct dimensions for the operations
        claims_enc = claims_enc.unsqueeze(1)
        evidences_enc = evidences_enc.unsqueeze(2)

        # Compute dot product
        scores_dot = torch.bmm(claims_enc, evidences_enc).squeeze()

        return scores_dot


## 2.2 Loss Function

In [30]:
def listwise_loss(model, claims_emb, pos_evidences_emb, neg_evidences_emb):
    pos_scores = model(claims_emb, pos_evidences_emb).unsqueeze(1)
    neg_scores = torch.stack([model(claims_emb, neg) for neg in neg_evidences_emb.transpose(0, 1)], dim=1)

    scores = torch.cat((pos_scores, neg_scores), dim=1)
    scores = scores.squeeze(-1)
    scores = F.log_softmax(scores, dim=1)

    # Create target tensor where the index of positive examples is always 0
    target = torch.zeros(scores.size(0), dtype=torch.long, device=scores.device)

    return F.nll_loss(scores, target)

In [11]:
def margin_ranking_loss(model, claims, pos_evidences, neg_evidences, margin=1.5):
    batch_size = claims.size(0)

    # Get the scores for positive evidence
    pos_scores = model(claims, pos_evidences).unsqueeze(1)

    # Get the scores for negative evidence
    neg_scores_list = [model(claims, neg_evidences[:, i, :]).unsqueeze(1) for i in range(neg_evidences.shape[1])]
    neg_scores = torch.cat(neg_scores_list, dim=1)

    # Calculate the margin ranking loss
    target = torch.ones_like(neg_scores, device=claims.device)
    pos_scores = pos_scores.expand_as(neg_scores)  # Expand pos_scores to match neg_scores shape
    loss = F.margin_ranking_loss(pos_scores, neg_scores, target, margin=margin)

    return loss

## 2.3 Model Training

In [21]:
def topk_indices(indices, k=100):
    return [indices[i][:min(k, len(indices[i]))] for i in range(len(indices))]

def evaluate_model(model, claims, evidence_texts, dev_top_indices, top_k, vocab, pad_idx):
    # Convert tensors to lists of indices and get the top k indices for evaluation
    dev_top_indices = topk_indices(dev_top_indices, k=top_k)

    dev_scores = []
    for idx in range(len(claims)):
        top_k_evidence_idxs = dev_top_indices[idx]
        top_k_evidences = [evidence_texts[i] for i in top_k_evidence_idxs]
        scores = score_query(model, claims[idx], top_k_evidences, vocab, pad_idx)
        dev_scores.append(scores)

    reranked_indices = []
    for indices, scores in zip(dev_top_indices, dev_scores):
        indexed_scores = list(zip(indices, scores))
        sorted_by_score = sorted(indexed_scores, key=lambda x: x[1], reverse=True)
        sorted_indices = [idx for idx, _ in sorted_by_score]
        reranked_indices.append(sorted_indices)

    return reranked_indices

def train_model(model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, eval_fn, eval_data, vocab, pad_idx, topk=20):
    for epoch in range(num_epochs):
        print("\n" + "#" * 50)  # Print separation line
        print(f"Starting Epoch {epoch+1}/{num_epochs}")
        model.train()
        epoch_loss = 0
        for batch in dataloader:
            if batch is None:
                continue

            claim, pos_evidences, neg_evidences = batch
            claim = claim.to(device)
            pos_evidences = pos_evidences.to(device)
            neg_evidences = neg_evidences.to(device)

            optimizer.zero_grad()
            loss = criterion(model, claim, pos_evidences, neg_evidences)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            epoch_loss += loss.item()

        avg_epoch_loss = epoch_loss / len(dataloader)
        print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_epoch_loss}')

        scheduler.step(avg_epoch_loss)
        current_lr = scheduler.optimizer.param_groups[0]['lr']
        print(f"Current Learning Rate: {current_lr}")

        # Evaluation at the end of each epoch
        model.eval()
        with torch.no_grad():
            reranked_indices = eval_fn(model, eval_data['claims'], eval_data['evidences'], eval_data['top_indices'], top_k=topk, vocab=vocab, pad_idx=pad_idx)
            results = evaluate_evidence_retrieval(reranked_indices, eval_data['ground_truth'], k=5)
            print(f"Epoch {epoch+1} Evaluation - Recall: {results['average_recall']}, Precision: {results['average_precision']}, F1 Score: {results['average_fscore']}")


### 2.3.1 Transformer + Listwise

#### In Batch Negatives

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#data loading
dataset = RankingDatasetInBatch(train_claims_text_processed, evidence_text_processed, train_evidence_idxs)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_collate_fn)

# Initialize model
embedding_dim = 320  # Dimension of the embeddings
hidden_dim = 256  # Hidden dimension size
dropout_rate = 0.5  # Dropout rate
nhead = 4  # Number of attention heads
num_encoder_layers = 3  # Number of encoder layers
dim_feedforward = 512  # Dimension of the feedforward network

transformer_model = SiameseTransformer(
    vocab_size=len(vocab),
    embed_dim=embedding_dim,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout_rate
).to(device)

criterion = listwise_loss
optimizer = optim.Adam(transformer_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

clip_value = 1.0

num_epochs = 10

eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(transformer_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 3.9917064874600143
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.07380952380952381, Precision: 0.04155844155844158, F1 Score: 0.04927849927849929

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 3.523068825403849
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.09761904761904758, Precision: 0.05454545454545456, F1 Score: 0.06489383632240775

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 3.435410499572754
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.09686147186147183, Precision: 0.05324675324675328, F1 Score: 0.06377551020408163

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 3.332676820265941
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.09480519480519478, Precision: 0.049350649350649374, 

#### In Batch + Gold Negatives

In [51]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#data loading
dataset = RankingDatasetInBatchGold(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_gold_collate_fn)

# Initialize model
embedding_dim = 320  # Dimension of the embeddings
hidden_dim = 512  # Hidden dimension size
dropout_rate = 0.7  # Dropout rate
nhead = 4  # Number of attention heads
num_encoder_layers = 3  # Number of encoder layers
dim_feedforward = 512  # Dimension of the feedforward network

transformer_model = SiameseTransformer(
    vocab_size=len(vocab),
    embed_dim=embedding_dim,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout_rate
).to(device)




criterion = listwise_loss
optimizer = optim.Adam(transformer_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

clip_value = 1.0

num_epochs = 10

eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(transformer_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 3.825396770086044
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.12153679653679648, Precision: 0.06233766233766236, F1 Score: 0.07546382189239334

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 3.764690276903984
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.07597402597402597, Precision: 0.04545454545454548, F1 Score: 0.053607503607503604

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 3.7049001180208645
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.08906926406926406, Precision: 0.049350649350649374, F1 Score: 0.05984848484848485

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 3.67350295262459
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.09480519480519481, Precision: 0.049350649350649374,

### 2.3.2 Transformer + MarginRanking

#### In batch

In [52]:
dataset = RankingDatasetInBatch(train_claims_text_processed, evidence_text_processed, train_evidence_idxs)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_collate_fn)

# Initialize model
embedding_dim = 320  # Dimension of the embeddings
hidden_dim = 128  # Hidden dimension size
dropout_rate = 0.5  # Dropout rate
nhead = 8  # Number of attention heads
num_encoder_layers = 3  # Number of encoder layers
dim_feedforward = 512  # Dimension of the feedforward network

transformer_model = SiameseTransformer(
    vocab_size=len(vocab),
    embed_dim=embedding_dim,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout_rate
).to(device)

criterion = margin_ranking_loss
optimizer = optim.Adam(transformer_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

clip_value = 1.0

num_epochs = 5

eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(transformer_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 1.716033208064544
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.09112554112554112, Precision: 0.055844155844155856, F1 Score: 0.0650484436198722

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 1.4950196498479598
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.10703463203463204, Precision: 0.058441558441558454, F1 Score: 0.07126881055452484

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 1.3984164977684999
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.09426406926406927, Precision: 0.049350649350649374, F1 Score: 0.05950319521748095

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 1.330437080982404
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.12012987012987009, Precision: 0.0662337662337662, F1 Scor

#### In Batch + Gold

In [53]:
dataset = RankingDatasetInBatchGold(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_gold_collate_fn)
# Initialize model
# Initialize model
embedding_dim = 320  # Dimension of the embeddings
hidden_dim = 128  # Hidden dimension size
dropout_rate = 0.5  # Dropout rate
nhead = 8  # Number of attention heads
num_encoder_layers = 3  # Number of encoder layers
dim_feedforward = 512  # Dimension of the feedforward network

transformer_model = SiameseTransformer(
    vocab_size=len(vocab),
    embed_dim=embedding_dim,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout_rate
).to(device)

criterion = margin_ranking_loss
optimizer = optim.Adam(transformer_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

clip_value = 1.0

num_epochs = 5

eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(transformer_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 1.6735933866256323
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.10064935064935064, Precision: 0.06103896103896103, F1 Score: 0.07212430426716142

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 1.5162645028187678
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.08419913419913419, Precision: 0.048051948051948075, F1 Score: 0.05796742939600085

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 1.3628566754169953
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.08116883116883115, Precision: 0.04545454545454548, F1 Score: 0.054473304473304486

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 1.216034737917093
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.09274891774891772, Precision: 0.05194805194805197, F1 Sc

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [18]:
def score_query(model, query, evidences, vocab, pad_idx):
    # Convert query and evidences to indices using the same function as during training
    query_indices = text_to_indices(query, vocab)  # Query to indices
    evidence_indices = [text_to_indices(evidence, vocab) for evidence in evidences]  # Evidences to indices

    # Convert lists to tensors and pad
    query_tensor = pad_sequence([torch.tensor(query_indices)], batch_first=True, padding_value=pad_idx)
    evidence_tensors = pad_sequence([torch.tensor(ei) for ei in evidence_indices], batch_first=True, padding_value=pad_idx)

    query_tensor = query_tensor.to(device)
    evidence_tensors = evidence_tensors.to(device)

    # Set the model to evaluation mode and disable gradient computation
    model.eval()
    scores = []
    with torch.no_grad():
        # Process all evidences in one batch for efficiency
        for i in range(evidence_tensors.shape[0]):
            score = model(query_tensor, evidence_tensors[i].unsqueeze(0))
            scores.append(score.item())

    return scores

In [19]:
def evaluate_evidence_retrieval(predicted_indices_list, actual_indices_list, k=5):
    assert len(predicted_indices_list) == len(actual_indices_list), "Both inputs must have the same length."

    total_recall = 0.0
    total_precision = 0.0
    total_fscore = 0.0
    num_claims = len(predicted_indices_list)

    for predicted_indices, actual_indices in zip(predicted_indices_list, actual_indices_list):
        # Convert tensors in predicted_indices to integers if they are not already
        predicted_indices = [index.item() if isinstance(index, torch.Tensor) else index for index in predicted_indices]

        # Retrieve the top k predictions
        top_k_predicted = set(predicted_indices[:k])
        actual_indices_set = set(actual_indices)

        # Calculate the number of correct predictions
        correct_predictions = len(top_k_predicted.intersection(actual_indices_set))

        # Calculate metrics
        if correct_predictions > 0:
            recall = float(correct_predictions) / len(actual_indices_set)
            precision = float(correct_predictions) / k
            if (precision + recall) != 0:
                fscore = 2 * (precision * recall) / (precision + recall)
            else:
                fscore = 0.0
        else:
            recall = 0.0
            precision = 0.0
            fscore = 0.0

        # Accumulate the metrics to calculate averages later
        total_recall += recall
        total_precision += precision
        total_fscore += fscore

    # Calculate average metrics
    average_recall = total_recall / num_claims
    average_precision = total_precision / num_claims
    average_fscore = total_fscore / num_claims

    return {
        "average_recall": average_recall,
        "average_precision": average_precision,
        "average_fscore": average_fscore
    }

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*