# 2024 COMP90042 Project
*Make sure you rename this file to include your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you plan to implement your program in an object-oriented style, please put the class definitions at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 1.1 Data Loading

In [1]:
#Import packages
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random
from collections import defaultdict, Counter, OrderedDict
import numpy as np
from scipy.sparse import csr_matrix, diags
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Mount Google Drive so the project data files under MyDrive/nlp/data are readable.
# NOTE(review): this cell only runs on Google Colab and the paths are hard-coded.
from google.colab import drive
drive.mount('/content/drive')

# orient='index' treats the top-level JSON keys as row labels, so the evidence/claim ids
# start out in the DataFrame index.
evidences = pd.read_json('/content/drive/MyDrive/nlp/data/evidence.json', orient='index')
train_claims = pd.read_json('/content/drive/MyDrive/nlp/data/train-claims.json', orient='index')
dev_claims = pd.read_json('/content/drive/MyDrive/nlp/data/dev-claims.json', orient='index')

# Move the ids out of the index into a regular column and give the columns explicit names.
evidences.reset_index(inplace=True)
evidences.columns = ['evidence_id', 'evidence_text']

train_claims.reset_index(inplace=True)
train_claims.rename(columns={'index': 'claim_id'}, inplace=True)

dev_claims.reset_index(inplace=True)
dev_claims.rename(columns={'index': 'claim_id'}, inplace=True)

evidence_id = evidences['evidence_id']
evidence_text = evidences['evidence_text']
# After reset_index the index is the default 0..n-1 range, so these are row positions.
evidence_idx = evidences.index.tolist()

# Lookup table: evidence id -> row position in `evidences`.
evidence_id_dict = dict(zip(evidence_id, evidence_idx))

train_claims_text = train_claims['claim_text']
train_evidence_ids = train_claims['evidences']
# Map each evidence id to its corresponding row index for faster processing.
# A KeyError here means a claim references an evidence id missing from evidence.json.
train_evidence_idxs = train_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

dev_claims_text = dev_claims['claim_text']
dev_evidence_ids = dev_claims['evidences']
# Same id -> row index mapping for the development claims.
dev_evidence_idxs = dev_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

Mounted at /content/drive


## 1.2 Text Preprocessing

In [3]:
# Shared text-preprocessing resources used by preprocess_data.
tt = TweetTokenizer()
# Read the corpus through `nltk.corpus` instead of the imported `stopwords` name.
# This assignment rebinds `stopwords` to a set, so `stopwords.words(...)` would raise
# AttributeError the second time this cell is run in the same kernel.
stopwords = set(nltk.corpus.stopwords.words('english'))
# NOTE(review): `lemmatizer` is created but preprocess_data only uses the stemmer.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_data(text):
    """Turn a raw text string into a list of stemmed content tokens.

    The text is lower-cased and tokenized with the module-level TweetTokenizer `tt`.
    Tokens that are stop words or contain non-alphabetic characters are dropped,
    and the remaining tokens are stemmed with the module-level `stemmer`.

    Args:
        text: Raw input string.

    Returns:
        List of stemmed tokens, in their original order.
    """
    lowered_tokens = (raw_token.lower() for raw_token in tt.tokenize(text.lower()))
    return [
        stemmer.stem(word)
        for word in lowered_tokens
        if word not in stopwords and word.isalpha()
    ]

# Tokenize/stem every training claim, development claim and evidence passage once up front.
# NOTE: the `precessed` spelling of the dev variable is kept because later cells use this exact name.
train_claims_text_processed = train_claims_text.apply(preprocess_data)
dev_claims_text_precessed = dev_claims_text.apply(preprocess_data)
evidence_text_processed = evidence_text.apply(preprocess_data)

## 1.3 BM25 Scores (For Negative sampling)

In [4]:
# Build inverted index
def build_inverted_index(documents):
    """Build a term -> postings index over tokenized documents.

    Args:
        documents: Iterable of token lists; a document's id is its position in the iterable.

    Returns:
        defaultdict(list) mapping each term to a list of (doc_id, term_frequency)
        tuples, ordered by doc_id.
    """
    postings_by_term = defaultdict(list)
    for doc_id, tokens in enumerate(documents):
        # Counter keeps first-occurrence order, so terms are registered in reading order.
        for term, count in Counter(tokens).items():
            postings_by_term[term].append((doc_id, count))
    return postings_by_term

# Compute IDF values
def compute_idf_values(inverted_index, total_documents):
    """Compute a smoothed inverse document frequency for every indexed term.

    idf(term) = log((N + 1) / (df + 1)), where N is `total_documents` and df is the
    number of postings (documents) recorded for the term.

    Args:
        inverted_index: Mapping term -> list of postings.
        total_documents: Number of documents in the collection.

    Returns:
        dict mapping each term to its IDF value.
    """
    smoothed_total = total_documents + 1
    return {
        term: np.log(smoothed_total / (len(postings) + 1))
        for term, postings in inverted_index.items()
    }

# Calculate average document length
def calculate_avg_doc_length(documents):
    """Return the mean number of tokens per document.

    Args:
        documents: Sized collection of token lists (list or pandas Series).

    Returns:
        Average document length as a float; 0.0 for an empty collection
        (instead of raising ZeroDivisionError).
    """
    # Use len() rather than truthiness: a pandas Series has no unambiguous truth value.
    num_documents = len(documents)
    if num_documents == 0:
        return 0.0
    total_length = sum(len(doc) for doc in documents)
    return total_length / num_documents
# Tuning note: k1=1.0, b=0.78 -> recall=0.14
# Compute BM25 scores
def bm25_scores(query, inverted_index, idf_values, avg_doc_length, k1=0.4, b=0.9, doc_lengths=None):
    """Score every document that shares at least one term with the query using BM25.

    Args:
        query: List of (preprocessed) query tokens. A token that appears several times
            in the query contributes once per occurrence.
        inverted_index: Mapping term -> list of (doc_id, term_frequency) postings.
        idf_values: Mapping term -> IDF weight.
        avg_doc_length: Average document length of the collection.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation strength (0 = none, 1 = full).
        doc_lengths: Optional indexable doc_id -> document length. When omitted, the
            length is read from the module-level `evidence_text_processed`, which
            keeps existing call sites working.

    Returns:
        defaultdict(float) mapping doc_id -> accumulated BM25 score. Documents with
        no matching term are absent.
    """
    scores = defaultdict(float)
    tf_gain = k1 + 1  # loop-invariant part of the numerator
    for term in query:
        # .get() does not insert missing keys into a defaultdict index.
        postings = inverted_index.get(term)
        if not postings:
            continue
        idf = idf_values[term]
        for doc_id, tf in postings:
            if doc_lengths is None:
                doc_length = len(evidence_text_processed[doc_id])
            else:
                doc_length = doc_lengths[doc_id]
            numerator = idf * tf * tf_gain
            denominator = tf + k1 * (1 - b + b * (doc_length / avg_doc_length))
            scores[doc_id] += numerator / denominator
    return scores

# Build the inverted index and collection statistics over the preprocessed evidence passages.
inverted_index = build_inverted_index(evidence_text_processed)
total_documents = len(evidence_text_processed)
idf_values = compute_idf_values(inverted_index, total_documents)
avg_doc_length = calculate_avg_doc_length(evidence_text_processed)

# Score every training claim against the evidence index.
# Each entry is a dict {doc_id: BM25 score} holding only documents that share a term with the claim.
train_bm25_results = []
for query in train_claims_text_processed:
    scores = bm25_scores(query, inverted_index, idf_values, avg_doc_length)
    train_bm25_results.append(scores)

# Same scoring for the development claims.
dev_bm25_results = []
for query in dev_claims_text_precessed:
    scores = bm25_scores(query, inverted_index, idf_values, avg_doc_length)
    dev_bm25_results.append(scores)

In [23]:
def _rank_by_score(scores):
    """Sort one claim's {doc_id: score} mapping by descending score.

    Returns:
        (doc_ids, doc_scores): two parallel lists in ranked order. Ties keep the
        mapping's insertion order because sorted() is stable.
    """
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [doc_id for doc_id, _ in ranked], [score for _, score in ranked]


# Sort each claim's BM25 scores once and split the result into ids and scores
# (sorting separately for ids and for scores doubled the work).
_train_ranked = [_rank_by_score(scores) for scores in train_bm25_results]
train_reranked_indices = [doc_ids for doc_ids, _ in _train_ranked]
train_reranked_scores = [doc_scores for _, doc_scores in _train_ranked]

_dev_ranked = [_rank_by_score(scores) for scores in dev_bm25_results]
dev_reranked_indices = [doc_ids for doc_ids, _ in _dev_ranked]
dev_reranked_scores = [doc_scores for _, doc_scores in _dev_ranked]

In [25]:
def topk_indices(indices, k=100):
    """Truncate every ranked list to at most its first `k` entries.

    Args:
        indices: Sequence of ranked lists (one per claim).
        k: Maximum number of entries to keep per list.

    Returns:
        New list of truncated lists; lists shorter than `k` are kept whole.
    """
    truncated = []
    for position in range(len(indices)):
        ranking = indices[position]
        truncated.append(ranking[:min(k, len(ranking))])
    return truncated

# Keep the 500 best BM25 candidates per training claim and the 50 best per development claim.
train_top_indices = topk_indices(train_reranked_indices, k=500)
dev_top_indices = topk_indices(dev_reranked_indices, k=50)
# The same helper truncates the parallel score lists, so dev_top_scores[i][j] is the
# BM25 score of dev_top_indices[i][j].
dev_top_scores = topk_indices(dev_reranked_scores, k=50)

In [6]:
def calculate_average_recall(top_k_indices, true_indices):
    """Macro-average recall of retrieved evidence over all claims.

    Args:
        top_k_indices: Per-claim lists of retrieved evidence indices.
        true_indices: Per-claim lists of ground-truth evidence indices.

    Returns:
        Mean of the per-claim recalls. A claim without ground truth counts as
        recall 0, and the result is 0 when there are no claims at all.
    """
    per_claim_recall = []
    for retrieved, relevant in zip(top_k_indices, true_indices):
        hits = set(retrieved).intersection(set(relevant))
        if relevant:
            per_claim_recall.append(len(hits) / len(relevant))
        else:
            # No ground truth for this claim: avoid dividing by zero.
            per_claim_recall.append(0)

    if not per_claim_recall:
        return 0
    return sum(per_claim_recall) / len(per_claim_recall)

# Recall of the BM25 top-50 candidates on the development set.
# The neural models below only re-rank BM25 candidates, so this caps the recall they can reach.
avg_recall = calculate_average_recall(dev_top_indices, dev_evidence_idxs)
print("Average Recall:", avg_recall)

Average Recall: 0.4452380952380952


In [7]:
def build_vocab(texts, min_freq=3):
    """Build a token -> integer id vocabulary from tokenized texts.

    Ids 0-3 are reserved for the special tokens <pad>, <unk>, <sos> and <eos>.
    The remaining ids are assigned, in order of first appearance, to every token
    that occurs at least `min_freq` times across all texts.

    Args:
        texts: Iterable of token lists.
        min_freq: Minimum corpus frequency for a token to get its own id.

    Returns:
        OrderedDict mapping token -> id.
    """
    frequencies = Counter()
    for tokens in texts:
        frequencies.update(tokens)

    vocab = OrderedDict([("<pad>", 0), ("<unk>", 1), ("<sos>", 2), ("<eos>", 3)])

    # Regular tokens start at 4, right after the reserved special tokens.
    frequent_words = (word for word, count in frequencies.items() if count >= min_freq)
    for token_id, word in enumerate(frequent_words, start=4):
        vocab[word] = token_id

    return vocab

# Build the vocabulary from the preprocessed evidence texts only, keeping tokens seen at least 3 times.
# Tokens outside this vocabulary (including claim-only tokens) are mapped to <unk> by text_to_indices.
vocab = build_vocab(evidence_text_processed, min_freq=3)

## 1.4 Dataset Loading

### 1.4.1 Random Negatives

In [8]:
def text_to_indices(text, vocab):
    """Convert a token list to vocabulary ids, using the <unk> id for unknown tokens.

    Args:
        text: List of tokens.
        vocab: Mapping token -> id that contains the '<unk>' key.

    Returns:
        List of integer ids, one per input token.
    """
    token_ids = []
    for token in text:
        token_ids.append(vocab.get(token, vocab['<unk>']))
    return token_ids

class RankingDatasetRandom(Dataset):
    """Training examples with negatives sampled at random from a retrieved candidate pool.

    Each item is (claim, positive_evidence, negative_evidences), where the positive is one
    ground-truth evidence found among the claim's first `k` candidates and the negatives
    are up to `neg_samples` other candidates from those same `k`. An item is `None` when
    no ground-truth evidence appears in the candidate pool; the collate function is
    expected to filter those out.
    """

    def __init__(self, claims, evidences, true_indices, top_k_indices, k=100, neg_samples = 32):
        """Store the data sources.

        Args:
            claims: Indexable collection of tokenized claims.
            evidences: Indexable collection of tokenized evidence passages.
            true_indices: Per-claim lists of ground-truth evidence indices.
            top_k_indices: Per-claim ranked candidate evidence indices.
            k: Number of leading candidates that form the sampling pool.
            neg_samples: Maximum number of negatives returned per item.
        """
        self.claims = claims
        self.evidences = evidences
        self.true_indices = true_indices
        self.top_k_indices = top_k_indices
        self.k = k
        self.neg_samples = neg_samples

    def __len__(self):
        """Number of claims."""
        return len(self.claims)

    def __getitem__(self, idx):
        """Return (claim, positive, negatives) for claim `idx`, or None if it has no usable positive."""
        claim = self.claims[idx]
        candidate_pool = self.top_k_indices[idx][:self.k]

        # Only ground-truth evidence that was actually retrieved can serve as the positive.
        retrieved_gold = [gold for gold in self.true_indices[idx] if gold in candidate_pool]
        if len(retrieved_gold) == 0:
            return None

        positive = self.evidences[random.choice(retrieved_gold)]

        negative_pool = [
            self.evidences[candidate]
            for candidate in candidate_pool
            if candidate not in retrieved_gold
        ]
        # Never ask for more negatives than the pool contains.
        sample_size = min(self.neg_samples, len(negative_pool))
        negatives = random.sample(negative_pool, sample_size)

        return claim, positive, negatives


def random_collate_fn(batch):
    """Collate RankingDatasetRandom items into padded id tensors.

    Items that are `None` (no usable positive) or that carry no negatives are dropped.
    All remaining items are cut to the smallest negative count in the batch so that the
    negatives form a rectangular (batch, n_neg, seq_len) tensor. Reshaping a flat list
    with unequal per-item counts would either fail or silently attach negatives to the
    wrong claim.

    Args:
        batch: List of (claim, positive_evidence, negative_evidences) tuples or None.

    Returns:
        (claims_padded, pos_padded, neg_padded) LongTensors of shapes
        (batch, claim_len), (batch, pos_len) and (batch, n_neg, neg_len),
        or None when nothing usable is left (the training loop skips such batches).
    """
    # A ranking loss needs at least one negative per claim.
    batch = [item for item in batch if item is not None and len(item[2]) > 0]

    if not batch:
        return None

    claims, pos_evidences, neg_evidences_lists = zip(*batch)
    pad_id = vocab['<pad>']

    # Negatives were drawn at random, so keeping the first n of each list is unbiased.
    negs_per_claim = min(len(negs) for negs in neg_evidences_lists)

    # Convert claims and evidences to indices
    claims_indices = [text_to_indices(claim, vocab) for claim in claims]
    pos_indices = [text_to_indices(evidence, vocab) for evidence in pos_evidences]
    neg_indices = [
        text_to_indices(neg, vocab)
        for negs in neg_evidences_lists
        for neg in negs[:negs_per_claim]
    ]

    # Pad all sequences
    claims_padded = pad_sequence([torch.tensor(ci, dtype=torch.long) for ci in claims_indices], batch_first=True, padding_value=pad_id)
    pos_padded = pad_sequence([torch.tensor(pi, dtype=torch.long) for pi in pos_indices], batch_first=True, padding_value=pad_id)
    neg_padded = pad_sequence([torch.tensor(ni, dtype=torch.long) for ni in neg_indices], batch_first=True, padding_value=pad_id)

    # Rows are grouped claim by claim, exactly negs_per_claim rows each.
    neg_padded = neg_padded.view(len(batch), negs_per_claim, neg_padded.size(1))

    return claims_padded, pos_padded, neg_padded

### 1.4.2 In-Batch Negatives

In [9]:
class RankingDatasetInBatch(Dataset):
    """Dataset of (claim, positive_evidence) pairs for in-batch negative training.

    Negatives are not produced here; the collate function builds them from the
    positives of the other claims in the same batch.
    """

    def __init__(self, claims, evidences, true_indices):
        """Store the claims, the evidence passages and the per-claim ground-truth indices."""
        self.claims, self.evidences, self.true_indices = claims, evidences, true_indices

    def __len__(self):
        """Number of claims."""
        return len(self.claims)

    def __getitem__(self, idx):
        """Return the claim at `idx` with one randomly chosen ground-truth evidence."""
        gold_choices = self.true_indices[idx]
        sampled_gold = random.choice(gold_choices)
        return self.claims[idx], self.evidences[sampled_gold]

def inbatch_collate_fn(batch):
    """Collate (claim, positive) pairs and derive in-batch negatives.

    Every claim uses the positives of all *other* claims in the batch as its negatives.
    The negatives are sliced out of the already padded positive tensor, so each sequence
    is padded only once. A batch of a single item yields an empty (1, 0, seq_len)
    negative tensor instead of raising.

    Args:
        batch: List of (claim_tokens, positive_evidence_tokens) tuples.

    Returns:
        (claims_padded, pos_padded, neg_padded_stack) LongTensors of shapes
        (batch, claim_len), (batch, pos_len) and (batch, batch - 1, pos_len).
    """
    claims, pos_evidences = zip(*batch)
    pad_id = vocab['<pad>']

    # Prepare claims and positive evidences
    claims_indices = [text_to_indices(claim, vocab) for claim in claims]
    pos_indices = [text_to_indices(evidence, vocab) for evidence in pos_evidences]

    # Pad sequences for claims and positive evidences
    claims_padded = pad_sequence([torch.tensor(ci, dtype=torch.long) for ci in claims_indices], batch_first=True, padding_value=pad_id)
    pos_padded = pad_sequence([torch.tensor(pi, dtype=torch.long) for pi in pos_indices], batch_first=True, padding_value=pad_id)

    # Row i of the result holds every padded positive except the i-th, in batch order.
    # NOTE(review): a positive of another claim may also be true evidence for this claim
    # (a false negative); this cannot be detected from the batch alone.
    batch_size = pos_padded.size(0)
    positions = torch.arange(batch_size)
    neg_padded_stack = torch.stack(
        [pos_padded[positions != i] for i in range(batch_size)], dim=0
    )

    return claims_padded, pos_padded, neg_padded_stack


### 1.4.3 In-Batch + Top Negative

In [34]:
class RankingDatasetInBatchGold(Dataset):
    """Dataset of (claim, positive_evidence, hard_negative_evidence) triples.

    The hard negative is drawn from the claim's highest-ranked retrieved candidates that
    are not ground-truth evidence. The collate function adds in-batch negatives on top.
    """

    def __init__(self, claims, evidences, true_indices, top_indices, num_hard_candidates=20):
        """Store the data sources.

        Args:
            claims: Indexable collection of tokenized claims.
            evidences: Indexable collection of tokenized evidence passages.
            true_indices: Per-claim lists of ground-truth evidence indices.
            top_indices: Per-claim ranked candidate evidence indices.
            num_hard_candidates: Size of the pool (best-ranked non-gold candidates)
                the hard negative is sampled from. Defaults to 20.
        """
        self.claims = claims
        self.evidences = evidences
        self.true_indices = true_indices
        self.top_indices = top_indices
        self.num_hard_candidates = num_hard_candidates

    def __len__(self):
        """Number of claims."""
        return len(self.claims)

    def _random_non_gold_index(self, gold):
        """Pick a random evidence index outside `gold`; used when no ranked candidate qualifies."""
        num_evidences = len(self.evidences)
        if len(gold) >= num_evidences:
            raise IndexError("no negative evidence available: every evidence is ground truth")
        # Rejection sampling: gold sets are tiny compared to the collection.
        while True:
            candidate = random.randrange(num_evidences)
            if candidate not in gold:
                return candidate

    def __getitem__(self, idx):
        """Return (claim, positive_evidence, negative_evidence) for claim `idx`."""
        true_idxs = self.true_indices[idx]
        pos_idx = random.choice(true_idxs)  # Randomly sample one positive evidence index
        claim = self.claims[idx]
        pos_evidence = self.evidences[pos_idx]

        # Collect the best-ranked candidates that are not ground truth; stop early once
        # the pool is full instead of filtering the whole candidate list.
        gold = set(true_idxs)
        hard_candidates = []
        for candidate in self.top_indices[idx]:
            if len(hard_candidates) >= self.num_hard_candidates:
                break
            if candidate not in gold:
                hard_candidates.append(candidate)

        if hard_candidates:
            neg_idx = random.choice(hard_candidates)
        else:
            # No retrieved candidate (or only gold ones): fall back to a random negative
            # rather than failing with IndexError on an empty pool.
            neg_idx = self._random_non_gold_index(gold)
        neg_evidence = self.evidences[neg_idx]

        return claim, pos_evidence, neg_evidence

def inbatch_gold_collate_fn(batch):
    """Collate (claim, positive, hard_negative) triples into padded id tensors.

    For claim i the negatives are the positives of every other claim in the batch
    followed by claim i's own sampled hard negative. All negatives are padded once to a
    common length (the previous version also padded the hard negatives into a tensor
    that was never used).

    Args:
        batch: List of (claim_tokens, positive_tokens, negative_tokens) tuples.

    Returns:
        (claims_padded, pos_padded, combined_neg_padded_stack) LongTensors of shapes
        (batch, claim_len), (batch, pos_len) and (batch, batch, neg_len).
    """
    claims, pos_evidences, neg_evidences = zip(*batch)
    pad_id = vocab['<pad>']

    # Prepare claims, positive evidences, and sampled negative evidences
    claims_indices = [text_to_indices(claim, vocab) for claim in claims]
    pos_indices = [text_to_indices(evidence, vocab) for evidence in pos_evidences]
    neg_indices = [text_to_indices(evidence, vocab) for evidence in neg_evidences]

    # Pad sequences for claims and positive evidences
    claims_padded = pad_sequence([torch.tensor(ci, dtype=torch.long) for ci in claims_indices], batch_first=True, padding_value=pad_id)
    pos_padded = pad_sequence([torch.tensor(pi, dtype=torch.long) for pi in pos_indices], batch_first=True, padding_value=pad_id)

    # Negatives per claim: the other claims' positives (in batch order), then its own hard negative.
    batch_size = len(batch)
    negatives_per_claim = [
        [pos_indices[j] for j in range(batch_size) if j != i] + [neg_indices[i]]
        for i in range(batch_size)
    ]

    # One shared target length so the per-claim groups can be stacked.
    max_length = max(len(seq) for group in negatives_per_claim for seq in group)

    def _pad_to_max(seq):
        return F.pad(torch.tensor(seq, dtype=torch.long), (0, max_length - len(seq)), value=pad_id)

    combined_neg_padded_stack = torch.stack(
        [torch.stack([_pad_to_max(seq) for seq in group], dim=0) for group in negatives_per_claim],
        dim=0,
    )

    return claims_padded, pos_padded, combined_neg_padded_stack

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 2.1 GRU (SiameseNetwork)


In [11]:
class GRU_SiameseNetwork(nn.Module):
    """Siamese GRU encoder that scores a claim/evidence pair by a dot product.

    Claim and evidence share the same embedding table and GRU; each text is represented
    by the GRU's final hidden state.

    NOTE(review): the embedding has no padding_idx and the GRU runs over the full padded
    sequence, so trailing pad tokens of shorter sequences feed into the final hidden
    state. Consider packing the sequences if this turns out to hurt quality.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the shared embedding (randomly initialised) and single-layer GRU."""
        super().__init__()
        # Initialize embedding layer with random weights
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

    def _encode(self, tokens):
        """Encode a (batch, seq_len) id tensor into (batch, hidden_dim) final hidden states."""
        _, hidden = self.gru(self.embedding(tokens))
        return hidden.squeeze(0)  # drop the num_layers dimension

    def forward(self, claim, evidence):
        """Score each claim against the evidence at the same batch position.

        Args:
            claim: LongTensor (batch, claim_len) of token ids.
            evidence: LongTensor (batch, evidence_len) of token ids.

        Returns:
            Tensor of shape (batch,) with one dot-product score per pair.
        """
        claim_hidden = self._encode(claim)
        evidence_hidden = self._encode(evidence)

        # Calculate dot product
        scores_dot = torch.bmm(claim_hidden.unsqueeze(1), evidence_hidden.unsqueeze(2))
        # Squeeze only the two singleton matrix dimensions. A bare squeeze() would also
        # remove the batch dimension when batch == 1 and return a 0-d tensor, which
        # breaks the `.unsqueeze(1)` calls in the loss functions.
        return scores_dot.squeeze(2).squeeze(1)

## 2.2 GRU + Attention

In [12]:
class Attention(nn.Module):
    """Additive-style attention pooling over a sequence of GRU outputs."""

    def __init__(self, hidden_dim):
        """Create the linear layer that maps each hidden state to one attention score."""
        super().__init__()
        self.attn = nn.Linear(hidden_dim, 1)  # Adjusted for unidirectional GRU

    def forward(self, outputs, mask):
        """Pool `outputs` over the sequence dimension using masked softmax weights.

        Args:
            outputs: Tensor (batch, seq_len, hidden_dim).
            mask: Tensor broadcastable to (batch, seq_len, 1); positions equal to 0
                are excluded from the attention.

        Returns:
            Tensor (batch, hidden_dim), the attention-weighted sum of `outputs`.
        """
        raw_scores = self.attn(outputs).tanh()
        # Masked positions get a very negative score so softmax assigns them ~0 weight.
        masked_scores = raw_scores.masked_fill(mask == 0, -1e9)
        weights = torch.softmax(masked_scores, dim=1)
        return torch.sum(weights * outputs, dim=1)

class GRU_Attn_SiameseNetwork(nn.Module):
    """Siamese GRU encoder with attention pooling, scoring claims against evidence by dot product.

    Claims and evidences share the embedding, GRU, dropout and attention layers.
    Token id 0 is treated as padding, both by the embedding and by the attention mask.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate=0.5):
        """Create the shared layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_dim: Size of the GRU hidden state and of the pooled text vector.
            dropout_rate: Dropout probability applied to the GRU outputs.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.hidden_dim = hidden_dim
        # Single layer unidirectional GRU
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=1, batch_first=True, dropout=0)
        self.dropout = nn.Dropout(dropout_rate)
        self.attention = Attention(hidden_dim)  # Adjusted for unidirectional output

    def forward_one(self, text):
        """Encode a (batch, seq_len) id tensor into (batch, hidden_dim) attention-pooled vectors."""
        gru_outputs, _ = self.gru(self.embedding(text))
        gru_outputs = self.dropout(gru_outputs)

        # True for real tokens, False for padding (id 0); shaped (batch, seq_len, 1).
        padding_mask = (text != 0).unsqueeze(2).to(text.device)
        return self.attention(gru_outputs, padding_mask)

    def forward(self, claims, evidences):
        """Score every claim against its own group of evidences.

        Args:
            claims: LongTensor (batch, claim_len).
            evidences: LongTensor whose last dimension is the token dimension, e.g.
                (batch, seq_len) or (batch, n_evidences, seq_len). It is flattened to
                2-D for encoding and regrouped per claim afterwards.

        Returns:
            Tensor (batch, n_evidences) of dot-product similarities.
        """
        claim_vectors = self.forward_one(claims)

        token_dim = evidences.size(-1)
        evidence_vectors = self.forward_one(evidences.view(-1, token_dim))
        evidence_vectors = evidence_vectors.view(claims.size(0), -1, self.hidden_dim)

        # Calculate dot product
        pair_scores = torch.bmm(claim_vectors.unsqueeze(1), evidence_vectors.transpose(1, 2))
        return pair_scores.squeeze(1)


## 2.3 Loss Function

In [13]:
def listwise_loss(model, claims_emb, pos_evidences_emb, neg_evidences_emb):
    """Softmax cross-entropy over one positive and several negatives per claim.

    The positive score is placed in column 0 and the negative scores after it; the loss
    is the mean negative log-probability of column 0 under a softmax over the columns.

    Args:
        model: Callable `model(claims, evidences)` returning one score per claim.
        claims_emb: Batch of claim inputs passed to the model unchanged.
        pos_evidences_emb: Batch of positive evidence inputs.
        neg_evidences_emb: Negative evidence inputs with the batch on dim 0 and the
            negatives on dim 1; the model is called once per negative position.

    Returns:
        Scalar loss tensor.
    """
    positive_scores = model(claims_emb, pos_evidences_emb).unsqueeze(1)

    # Iterating the transposed tensor yields one batch of evidences per negative position.
    per_negative_scores = []
    for negative_batch in neg_evidences_emb.transpose(0, 1):
        per_negative_scores.append(model(claims_emb, negative_batch))
    negative_scores = torch.stack(per_negative_scores, dim=1)

    logits = torch.cat((positive_scores, negative_scores), dim=1).squeeze(-1)
    log_probs = F.log_softmax(logits, dim=1)

    # The positive example always sits at index 0.
    gold_column = torch.zeros(log_probs.size(0), dtype=torch.long, device=log_probs.device)

    return F.nll_loss(log_probs, gold_column)

In [14]:
def margin_ranking_loss(model, claims, pos_evidences, neg_evidences, margin=1.5):
    """Pairwise hinge loss pushing the positive score above every negative score by `margin`.

    Args:
        model: Callable `model(claims, evidences)` returning scores with the batch on dim 0.
        claims: Batch of claim inputs.
        pos_evidences: Batch of positive evidence inputs.
        neg_evidences: Tensor (batch, n_negatives, seq_len) of negative evidence inputs;
            the model is called once per negative position.
        margin: Required score gap between the positive and each negative.

    Returns:
        Scalar loss tensor (mean over all positive/negative pairs).
    """
    # Get the scores for positive evidence
    pos_scores = model(claims, pos_evidences).unsqueeze(1)

    # Get the scores for negative evidence, one column per negative position
    neg_scores = torch.cat(
        [model(claims, neg_evidences[:, i, :]).unsqueeze(1) for i in range(neg_evidences.shape[1])],
        dim=1,
    )

    # target = 1 means "the first input (positive) should rank higher than the second".
    target = torch.ones_like(neg_scores, device=claims.device)
    pos_scores = pos_scores.expand_as(neg_scores)  # Expand pos_scores to match neg_scores shape
    return F.margin_ranking_loss(pos_scores, neg_scores, target, margin=margin)

## 2.4 Model Training

In [15]:
def topk_indices(indices, k=100):
    """Truncate every ranked list to at most its first `k` entries.

    NOTE: this duplicates the `topk_indices` defined in the BM25 section; the later
    definition silently shadows the earlier one.

    Args:
        indices: Sequence of ranked lists (one per claim).
        k: Maximum number of entries to keep per list.

    Returns:
        New list of truncated lists; lists shorter than `k` are kept whole.
    """
    truncated = []
    for position in range(len(indices)):
        ranking = indices[position]
        truncated.append(ranking[:min(k, len(ranking))])
    return truncated

def evaluate_model(model, claims, evidence_texts, dev_top_indices, top_k, vocab, pad_idx):
    """Re-rank each claim's retrieved candidates with the model.

    Args:
        model: Scoring model, forwarded to `score_query`.
        claims: Indexable collection of tokenized claims.
        evidence_texts: Indexable collection of tokenized evidence passages.
        dev_top_indices: Per-claim ranked candidate evidence indices.
        top_k: Number of leading candidates per claim to re-rank.
        vocab: Token -> id mapping, forwarded to `score_query`.
        pad_idx: Padding id, forwarded to `score_query`.

    Returns:
        Per-claim lists of candidate indices sorted by descending model score.
    """
    # Only the first top_k candidates of every claim are scored.
    candidate_lists = topk_indices(dev_top_indices, k=top_k)

    # Score all claims first, then sort.
    model_scores = [
        score_query(
            model,
            claims[claim_pos],
            [evidence_texts[evidence_idx] for evidence_idx in candidate_lists[claim_pos]],
            vocab,
            pad_idx,
        )
        for claim_pos in range(len(claims))
    ]

    reranked_indices = []
    for candidates, candidate_scores in zip(candidate_lists, model_scores):
        by_score = sorted(zip(candidates, candidate_scores), key=lambda pair: pair[1], reverse=True)
        reranked_indices.append([evidence_idx for evidence_idx, _ in by_score])

    return reranked_indices

def train_model(model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, eval_fn, eval_data, vocab, pad_idx, topk=20):
    """Train `model` and report retrieval metrics on the evaluation data after every epoch.

    Args:
        model: The nn.Module to train.
        dataloader: Yields (claims, pos_evidences, neg_evidences) tensor batches, or
            None for batches the collate function had to discard.
        criterion: Callable `criterion(model, claims, pos, neg)` returning a scalar loss.
        optimizer: Optimizer over the model parameters.
        scheduler: LR scheduler stepped once per epoch with the average training loss.
        device: Device the batches are moved to.
        num_epochs: Number of passes over the dataloader.
        clip_value: Maximum gradient norm for clipping.
        eval_fn: Callable that returns re-ranked evidence indices (see evaluate_model).
        eval_data: Dict with keys 'claims', 'evidences', 'top_indices', 'ground_truth'.
        vocab: Token -> id mapping forwarded to `eval_fn`.
        pad_idx: Padding id forwarded to `eval_fn`.
        topk: Number of candidates per claim that `eval_fn` re-ranks.

    Returns:
        None. Progress and metrics are printed.
    """
    for epoch in range(num_epochs):
        print("\n" + "#" * 50)  # Print separation line
        print(f"Starting Epoch {epoch+1}/{num_epochs}")
        model.train()
        epoch_loss = 0
        batches_trained = 0
        for batch in dataloader:
            if batch is None:
                continue

            claim, pos_evidences, neg_evidences = batch
            claim = claim.to(device)
            pos_evidences = pos_evidences.to(device)
            neg_evidences = neg_evidences.to(device)

            optimizer.zero_grad()
            loss = criterion(model, claim, pos_evidences, neg_evidences)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            epoch_loss += loss.item()
            batches_trained += 1

        # Average over the batches that actually contributed a loss. Dividing by
        # len(dataloader) under-reports the loss when batches were skipped and raises
        # ZeroDivisionError for an empty loader.
        if batches_trained:
            avg_epoch_loss = epoch_loss / batches_trained
        else:
            avg_epoch_loss = float('nan')
        print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_epoch_loss}')

        # Only feed the scheduler a real measurement.
        if batches_trained:
            scheduler.step(avg_epoch_loss)
        current_lr = scheduler.optimizer.param_groups[0]['lr']
        print(f"Current Learning Rate: {current_lr}")

        # Evaluation at the end of each epoch
        model.eval()
        with torch.no_grad():
            reranked_indices = eval_fn(model, eval_data['claims'], eval_data['evidences'], eval_data['top_indices'], top_k=topk, vocab=vocab, pad_idx=pad_idx)
            results = evaluate_evidence_retrieval(reranked_indices, eval_data['ground_truth'], k=5)
            print(f"Epoch {epoch+1} Evaluation - Recall: {results['average_recall']}, Precision: {results['average_precision']}, F1 Score: {results['average_fscore']}")


### 2.4.1 GRU + Listwise

#### Random negatives

In [19]:
# Experiment: plain GRU Siamese model + listwise loss, negatives sampled at random
# from each claim's BM25 candidates.
# NOTE(review): no random seed is set, so the metrics printed below vary between runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Data loading: RankingDatasetRandom uses its defaults here (pool = first 100 candidates, up to 32 negatives).
dataset = RankingDatasetRandom(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=random_collate_fn)
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 256
# Initialize model
gru_model = GRU_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)

criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
# NOTE(review): with patience=5 and num_epochs=5 the scheduler cannot lower the LR during this run.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Maximum gradient norm used for clipping inside train_model.
clip_value = 1.0

num_epochs = 5

# Development data used for the per-epoch evaluation (re-ranking of the BM25 candidates).
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(gru_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 3.8578842053046594
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.11331168831168827, Precision: 0.0727272727272727, F1 Score: 0.08303442589156872

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 2.550181839710627
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.1269480519480519, Precision: 0.07792207792207788, F1 Score: 0.09074417645846213

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 1.6366371332536427
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.1291125541125541, Precision: 0.07662337662337661, F1 Score: 0.08979076479076478

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 1.4144377156616132
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.15151515151515146, Precision: 0.08441558441558437, F1 Score: 

#### In Batch Negatives

In [21]:
# Experiment: plain GRU Siamese model + listwise loss, using in-batch negatives
# (each claim is contrasted with the positives of the other claims in its batch).
# NOTE(review): no random seed is set, so the metrics printed below vary between runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Data loading
dataset = RankingDatasetInBatch(train_claims_text_processed, evidence_text_processed, train_evidence_idxs)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_collate_fn)

# Model hyper-parameters
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 256
# Initialize model (a fresh instance; this rebinds `gru_model` from the previous experiment)
gru_model = GRU_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)

criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
# NOTE(review): with patience=5 and num_epochs=5 the scheduler cannot lower the LR during this run.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Maximum gradient norm used for clipping inside train_model.
clip_value = 1.0

num_epochs = 5

# Development data used for the per-epoch evaluation (re-ranking of the BM25 candidates).
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(gru_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 3.658553355779403
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.15086580086580081, Precision: 0.08961038961038958, F1 Score: 0.10461245104102247

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 3.3098059862087936
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.16147186147186138, Precision: 0.09999999999999994, F1 Score: 0.11586270871985152

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 2.899447141549526
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.1553030303030302, Precision: 0.09350649350649344, F1 Score: 0.10856009070294782

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 2.5034423302381468
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.15681818181818172, Precision: 0.09220779220779213, F1 Score:

#### In Batch + Gold Negatives

In [35]:
# Experiment: plain GRU Siamese model + listwise loss, using in-batch negatives plus one
# hard negative per claim sampled from its top BM25 candidates.
# NOTE(review): no random seed is set, so the metrics printed below vary between runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Data loading
dataset = RankingDatasetInBatchGold(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_gold_collate_fn)

vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 256
# Initialize model (a fresh instance; this rebinds `gru_model` from the previous experiment)
gru_model = GRU_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)

criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
# Halve the learning rate when the average training loss stops improving.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Maximum gradient norm used for clipping inside train_model.
clip_value = 1.0

# This experiment runs for 10 epochs; the two preceding ones use 5.
num_epochs = 10

# Development data used for the per-epoch evaluation (re-ranking of the BM25 candidates).
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(gru_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 3.805561206279657
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.15746753246753234, Precision: 0.09610389610389605, F1 Score: 0.11172954030096885

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 3.3852939361181016
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.14761904761904757, Precision: 0.08961038961038957, F1 Score: 0.10417439703153987

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 3.1411674878536124
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.1494588744588744, Precision: 0.08961038961038956, F1 Score: 0.10449391877963306

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 2.8021051853131027
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.15443722943722937, Precision: 0.09090909090909084, 

### 2.4.2 GRU + MarginRanking

#### Random Negatives

In [36]:
# Experiment: GRU Siamese encoder + margin-ranking loss, RankingDatasetRandom data.

# Hyperparameters for this run.
vocab_size, embedding_dim, hidden_dim = len(vocab), 300, 256
clip_value, num_epochs = 1.0, 5

# Training data; swap the dataset / collate_fn pair to change the
# negative-sampling strategy.
dataset = RankingDatasetRandom(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=random_collate_fn,
)

# Model, loss, optimiser and LR schedule.
gru_model = GRU_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
# NOTE(review): with patience=5 and num_epochs=5 the learning rate can never be
# reduced if train_model steps the scheduler once per epoch — confirm intent.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-set inputs handed to train_model together with evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs,
}

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 1.950166907065954
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.14372294372294364, Precision: 0.08701298701298696, F1 Score: 0.10143784786641928

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 1.7192371710179708
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.13712121212121206, Precision: 0.08051948051948046, F1 Score: 0.09432075860647286

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 1.1022015505303175
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.12846320346320342, Precision: 0.074025974025974, F1 Score: 0.08679653679653679

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 0.42140896373595566
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.13441558441558438, Precision: 0.08441558441558439, F1 Score

#### In-Batch Negatives

In [38]:
# Experiment: GRU Siamese encoder + margin-ranking loss, RankingDatasetInBatch data.

# Hyperparameters for this run.
vocab_size, embedding_dim, hidden_dim = len(vocab), 300, 256
clip_value, num_epochs = 1.0, 5

# Training data; swap the dataset / collate_fn pair to change the
# negative-sampling strategy.
dataset = RankingDatasetInBatch(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=inbatch_collate_fn,
)

# Model, loss, optimiser and LR schedule.
gru_model = GRU_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
# NOTE(review): with patience=5 and num_epochs=5 the learning rate can never be
# reduced if train_model steps the scheduler once per epoch — confirm intent.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-set inputs handed to train_model together with evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs,
}

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 1.6285589566597571
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.14242424242424237, Precision: 0.08701298701298696, F1 Score: 0.10181405895691607

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 1.110748780079377
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.1417748917748917, Precision: 0.08571428571428565, F1 Score: 0.0998505462791177

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 0.8242228168707627
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.1383116883116883, Precision: 0.08311688311688309, F1 Score: 0.0969284683570398

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 0.5456625077968988
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.14642857142857135, Precision: 0.08831168831168827, F1 Score: 0

#### In-Batch Negatives + Gold

In [39]:
# Experiment: GRU Siamese encoder + margin-ranking loss, RankingDatasetInBatchGold data.

# Hyperparameters for this run.
vocab_size, embedding_dim, hidden_dim = len(vocab), 300, 256
clip_value, num_epochs = 1.0, 5

# Training data; swap the dataset / collate_fn pair to change the
# negative-sampling strategy.
dataset = RankingDatasetInBatchGold(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=inbatch_gold_collate_fn,
)

# Model, loss, optimiser and LR schedule.
gru_model = GRU_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
# NOTE(review): with patience=5 and num_epochs=5 the learning rate can never be
# reduced if train_model steps the scheduler once per epoch — confirm intent.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-set inputs handed to train_model together with evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs,
}

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 1.8205478986104329
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.13961038961038957, Precision: 0.0753246753246753, F1 Score: 0.08964646464646467

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 1.5460639611268654
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.14231601731601728, Precision: 0.08311688311688306, F1 Score: 0.09732529375386517

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 1.3267698257397382
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.1445887445887446, Precision: 0.0883116883116883, F1 Score: 0.10235518449804165

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 1.024223186266728
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.14458874458874454, Precision: 0.09090909090909086, F1 Score: 

### 2.4.3 GRU_ATTENTION + Listwise

#### Random Negatives

In [40]:
# Experiment: attention GRU Siamese encoder + listwise loss, RankingDatasetRandom data.

# Hyperparameters for this run.
vocab_size, embedding_dim, hidden_dim = len(vocab), 300, 256
clip_value, num_epochs = 1.0, 5

# Training data; swap the dataset / collate_fn pair to change the
# negative-sampling strategy.
dataset = RankingDatasetRandom(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=random_collate_fn,
)

# Model, loss, optimiser and LR schedule.
gru_model = GRU_Attn_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)
criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
# NOTE(review): with patience=5 and num_epochs=5 the learning rate can never be
# reduced if train_model steps the scheduler once per epoch — confirm intent.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-set inputs handed to train_model together with evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs,
}

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 3.622001446210421
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.171103896103896, Precision: 0.09090909090909081, F1 Score: 0.11026592455163879

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 2.90422440186525
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.18030303030303027, Precision: 0.09610389610389602, F1 Score: 0.1159863945578231

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 2.350960251612541
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.18452380952380945, Precision: 0.10129870129870122, F1 Score: 0.12157287157287153

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 1.9073215753604205
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.17478354978354974, Precision: 0.09870129870129861, F1 Score: 0.1

#### In-Batch Negatives

In [42]:
# Experiment: attention GRU Siamese encoder + listwise loss, RankingDatasetInBatch data.

# Hyperparameters for this run.
vocab_size, embedding_dim, hidden_dim = len(vocab), 300, 256
clip_value, num_epochs = 1.0, 10

# Training data; swap the dataset / collate_fn pair to change the
# negative-sampling strategy.
dataset = RankingDatasetInBatch(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=inbatch_collate_fn,
)

# Model, loss, optimiser and LR schedule.
gru_model = GRU_Attn_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)
criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-set inputs handed to train_model together with evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs,
}

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 2.56662284105252
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.16396103896103895, Precision: 0.08701298701298694, F1 Score: 0.10606060606060605

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 1.9906803308389125
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.15367965367965364, Precision: 0.08441558441558437, F1 Score: 0.10103586889301175

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 1.6259772303776863
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.1613636363636363, Precision: 0.08831168831168826, F1 Score: 0.10624613481756336

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 1.4131575868679926
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.17911255411255408, Precision: 0.09610389610389602, F

#### In-Batch Negatives + Gold

In [48]:
# Experiment: attention GRU Siamese encoder + listwise loss, RankingDatasetInBatchGold data.
# This run uses a wider hidden state (512) and dropout_rate=0.6.

# Hyperparameters for this run.
vocab_size, embedding_dim, hidden_dim = len(vocab), 300, 512
clip_value, num_epochs = 1.0, 10

# Training data; swap the dataset / collate_fn pair to change the
# negative-sampling strategy.
dataset = RankingDatasetInBatchGold(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=inbatch_gold_collate_fn,
)

# Model, loss, optimiser and LR schedule.
gru_model = GRU_Attn_SiameseNetwork(
    vocab_size,
    embedding_dim,
    hidden_dim,
    dropout_rate=0.6,
).to(device)
criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-set inputs handed to train_model together with evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs,
}

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 2.745050344711695
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.17283549783549776, Precision: 0.09350649350649341, F1 Score: 0.1126674912389198

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 2.149915328392616
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.18701298701298694, Precision: 0.09999999999999991, F1 Score: 0.12024840239125949

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 1.8773264579283886
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.17435064935064928, Precision: 0.09090909090909086, F1 Score: 0.1107039785611214

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 1.6105697078582568
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.19859307359307352, Precision: 0.1077922077922077, F1 

### 2.4.4 GRU_ATTENTION + MarginRanking

#### Random Negatives

In [44]:
# Experiment: attention GRU Siamese encoder + margin-ranking loss, RankingDatasetRandom data.

# Hyperparameters for this run.
vocab_size, embedding_dim, hidden_dim = len(vocab), 300, 256
clip_value, num_epochs = 1.0, 10

# Training data; swap the dataset / collate_fn pair to change the
# negative-sampling strategy.
dataset = RankingDatasetRandom(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=random_collate_fn,
)

# Model, loss, optimiser and LR schedule.
gru_model = GRU_Attn_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-set inputs handed to train_model together with evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs,
}

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 1.4351360339384813
Current Learning Rate: 0.0001
Epoch 1 Evaluation - Recall: 0.13041125541125537, Precision: 0.07142857142857141, F1 Score: 0.08560090702947847

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 1.355090245222434
Current Learning Rate: 0.0001
Epoch 2 Evaluation - Recall: 0.1285714285714285, Precision: 0.07012987012987011, F1 Score: 0.08412183055040198

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 1.2465319618200645
Current Learning Rate: 0.0001
Epoch 3 Evaluation - Recall: 0.12835497835497828, Precision: 0.07012987012987011, F1 Score: 0.08408575551432694

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 1.177415933364477
Current Learning Rate: 0.0001
Epoch 4 Evaluation - Recall: 0.12922077922077915, Precision: 0.0701298701298701

#### In-Batch Negatives

In [45]:
# Experiment: attention GRU Siamese encoder + margin-ranking loss, RankingDatasetInBatch data.

# Hyperparameters for this run.
vocab_size, embedding_dim, hidden_dim = len(vocab), 300, 256
clip_value, num_epochs = 1.0, 10

# Training data; swap the dataset / collate_fn pair to change the
# negative-sampling strategy.
dataset = RankingDatasetInBatch(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=inbatch_collate_fn,
)

# Model, loss, optimiser and LR schedule.
gru_model = GRU_Attn_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-set inputs handed to train_model together with evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs,
}

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 0.7513295641312232
Current Learning Rate: 0.0001
Epoch 1 Evaluation - Recall: 0.15054112554112545, Precision: 0.08701298701298696, F1 Score: 0.10258709544423826

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 0.6695915697476803
Current Learning Rate: 0.0001
Epoch 2 Evaluation - Recall: 0.1496753246753246, Precision: 0.08701298701298694, F1 Score: 0.10226242011956294

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 0.6344564740474408
Current Learning Rate: 0.0001
Epoch 3 Evaluation - Recall: 0.14101731601731593, Precision: 0.08441558441558436, F1 Score: 0.09847454133168414

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 0.5851351848015418
Current Learning Rate: 0.0001
Epoch 4 Evaluation - Recall: 0.14069264069264062, Precision: 0.08441558441558

#### In-Batch Negatives + Gold

In [47]:
# Experiment: attention GRU Siamese encoder + margin-ranking loss, RankingDatasetInBatchGold data.

# Hyperparameters for this run.
vocab_size, embedding_dim, hidden_dim = len(vocab), 300, 256
clip_value, num_epochs = 1.0, 10

# Training data; swap the dataset / collate_fn pair to change the
# negative-sampling strategy.
dataset = RankingDatasetInBatchGold(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=inbatch_gold_collate_fn,
)

# Model, loss, optimiser and LR schedule.
gru_model = GRU_Attn_SiameseNetwork(vocab_size, embedding_dim, hidden_dim).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-set inputs handed to train_model together with evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs,
}

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 0.7837640406229557
Current Learning Rate: 0.0001
Epoch 1 Evaluation - Recall: 0.14556277056277053, Precision: 0.07662337662337658, F1 Score: 0.09297052154195012

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 0.7222232298973279
Current Learning Rate: 0.0001
Epoch 2 Evaluation - Recall: 0.14880952380952378, Precision: 0.07792207792207788, F1 Score: 0.0948258091115234

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 0.6873870491981506
Current Learning Rate: 0.0001
Epoch 3 Evaluation - Recall: 0.15140692640692638, Precision: 0.08051948051948046, F1 Score: 0.097423211708926

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 0.666056194366553
Current Learning Rate: 0.0001
Epoch 4 Evaluation - Recall: 0.14848484848484844, Precision: 0.07922077922077916

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 3.1 Evaluation Function (must be executed before training)

In [16]:
def score_query(model, query, evidences, vocab, pad_idx):
    """Score one query against each candidate evidence with ``model``.

    The query and every evidence are converted to vocabulary indices with
    ``text_to_indices``, padded with ``pad_idx`` and moved to the module-level
    ``device``. All evidences are padded to the length of the longest
    candidate, then scored one at a time against the query.

    Args:
        model: Scoring network, called as ``model(query_tensor, evidence_tensor)``
            and expected to return a single-element tensor per call.
        query: Query in whatever form ``text_to_indices`` accepts.
        evidences: Iterable of candidate evidences, each in the form
            ``text_to_indices`` accepts.
        vocab: Vocabulary forwarded to ``text_to_indices``.
        pad_idx: Index used as the padding value.

    Returns:
        list[float]: One score per evidence, in input order. An empty list is
        returned when there are no evidences.

    Side effects:
        Puts ``model`` in evaluation mode via ``model.eval()`` and leaves it
        there; callers that continue training must switch back themselves.
        The model is not touched when ``evidences`` is empty.
    """
    evidence_list = list(evidences)
    if not evidence_list:
        # pad_sequence raises on an empty list, and there is nothing to score.
        return []

    # Convert query and evidences to indices using the same function as during training
    query_indices = text_to_indices(query, vocab)
    evidence_indices = [text_to_indices(evidence, vocab) for evidence in evidence_list]

    # dtype is pinned to long: torch.tensor([]) would otherwise be float32,
    # so a text that yields no indices would produce a non-integer index tensor.
    query_tensor = pad_sequence(
        [torch.tensor(query_indices, dtype=torch.long)],
        batch_first=True,
        padding_value=pad_idx,
    )
    evidence_tensors = pad_sequence(
        [torch.tensor(ei, dtype=torch.long) for ei in evidence_indices],
        batch_first=True,
        padding_value=pad_idx,
    )

    query_tensor = query_tensor.to(device)
    evidence_tensors = evidence_tensors.to(device)

    # Set the model to evaluation mode and disable gradient computation
    model.eval()
    scores = []
    with torch.no_grad():
        # Evidences are scored one by one, each as a batch of size 1.
        for i in range(evidence_tensors.shape[0]):
            score = model(query_tensor, evidence_tensors[i].unsqueeze(0))
            scores.append(score.item())

    return scores

In [17]:
def evaluate_evidence_retrieval(predicted_indices_list, actual_indices_list, k=5):
    """Compute recall, precision and F-score at ``k``, averaged over claims.

    For each claim the first ``k`` predicted indices are compared with the set
    of actual indices. Recall is hits / number of distinct actual indices,
    precision is hits / ``k`` (even when fewer than ``k`` predictions were
    supplied), and the F-score is their harmonic mean. A claim with no hit
    contributes 0 to all three metrics.

    Args:
        predicted_indices_list: One ranked sequence of predicted indices per
            claim; elements may be plain integers or ``torch.Tensor`` scalars.
        actual_indices_list: One collection of ground-truth indices per claim.
        k: Number of top predictions to evaluate.

    Returns:
        dict: ``average_recall``, ``average_precision`` and ``average_fscore``.
        All three are 0.0 when no claims are given.

    Raises:
        AssertionError: If the two input lists differ in length.
    """
    assert len(predicted_indices_list) == len(actual_indices_list), "Both inputs must have the same length."

    num_claims = len(predicted_indices_list)
    if num_claims == 0:
        # Nothing to average over; avoid dividing by zero below.
        return {
            "average_recall": 0.0,
            "average_precision": 0.0,
            "average_fscore": 0.0
        }

    total_recall = 0.0
    total_precision = 0.0
    total_fscore = 0.0

    for predicted_indices, actual_indices in zip(predicted_indices_list, actual_indices_list):
        # Convert tensors in predicted_indices to integers if they are not already
        predicted_indices = [index.item() if isinstance(index, torch.Tensor) else index for index in predicted_indices]

        # Retrieve the top k predictions
        top_k_predicted = set(predicted_indices[:k])
        actual_indices_set = set(actual_indices)

        correct_predictions = len(top_k_predicted & actual_indices_set)
        if correct_predictions == 0:
            # No hit: recall, precision and F-score are all zero for this claim.
            continue

        # With at least one hit both values are strictly positive, so the
        # harmonic mean below never divides by zero.
        recall = float(correct_predictions) / len(actual_indices_set)
        precision = float(correct_predictions) / k

        total_recall += recall
        total_precision += precision
        total_fscore += 2 * (precision * recall) / (precision + recall)

    return {
        "average_recall": total_recall / num_claims,
        "average_precision": total_precision / num_claims,
        "average_fscore": total_fscore / num_claims
    }

### Test Result Generation

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*