# 2024 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program in an object-oriented programming style, please put those classes at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 1.1 Data Loading

In [1]:
#Import packages
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Mount Google Drive so the dataset files under MyDrive/nlp/data are readable.
# NOTE(review): the paths below are hard-coded to one Drive layout; running elsewhere requires editing them.
from google.colab import drive
drive.mount('/content/drive')

# orient='index' turns every top-level JSON key into a row label of the resulting frame.
evidences = pd.read_json('/content/drive/MyDrive/nlp/data/evidence.json', orient='index')
train_claims = pd.read_json('/content/drive/MyDrive/nlp/data/train-claims.json', orient='index')
dev_claims = pd.read_json('/content/drive/MyDrive/nlp/data/dev-claims.json', orient='index')

#update column names
# reset_index moves the row labels (the ids) into a regular column and leaves a 0..n-1 positional index.
evidences.reset_index(inplace=True)
evidences.columns = ['evidence_id', 'evidence_text']

train_claims.reset_index(inplace=True)
train_claims.rename(columns={'index': 'claim_id'}, inplace=True)

dev_claims.reset_index(inplace=True)
dev_claims.rename(columns={'index': 'claim_id'}, inplace=True)

evidence_id = evidences['evidence_id']
evidence_text = evidences['evidence_text']
evidence_idx = evidences.index.tolist()

# Lookup table: evidence id -> row position in `evidences`.
evidence_id_dict = dict(zip(evidence_id, evidence_idx))

train_claims_text = train_claims['claim_text']
train_evidence_ids = train_claims['evidences']
#map evidence_id to their corresponding index for faster processing
# A claim that references an id missing from evidence.json raises KeyError here.
train_evidence_idxs = train_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

dev_claims_text = dev_claims['claim_text']
dev_evidence_ids = dev_claims['evidences']
dev_evidence_idxs = dev_evidence_ids.apply(lambda x: [evidence_id_dict[evidence_id] for evidence_id in x])

Mounted at /content/drive


## 1.2 Text Preprocessing

In [3]:
#text preprocessing
# Import the corpus reader under its own alias: the name `stopwords` is rebound to a
# plain set below, and calling `.words()` on that set when the cell is re-run would
# raise AttributeError. The alias keeps this cell safe to execute repeatedly.
from nltk.corpus import stopwords as nltk_stopwords

tt = TweetTokenizer()
# English stop words as a set for O(1) membership tests inside preprocess_data.
stopwords = set(nltk_stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_data(text):
    """Tokenise ``text`` and return its normalised content tokens.

    Each token produced by the module-level TweetTokenizer ``tt`` is lower-cased;
    tokens that are stop words or contain non-alphabetic characters are dropped,
    and the survivors are stemmed with the module-level Porter ``stemmer``.

    Args:
        text: raw string to process.

    Returns:
        List of stemmed tokens, in their original order.
    """
    lowered_tokens = (raw_token.lower() for raw_token in tt.tokenize(text))
    return [
        stemmer.stem(candidate)
        for candidate in lowered_tokens
        if candidate not in stopwords and candidate.isalpha()
    ]

# Run the token pipeline over every claim and every evidence passage.
# The spelling `dev_claims_text_precessed` is referenced by later cells, so it is kept as is.
train_claims_text_processed = train_claims_text.apply(preprocess_data)
dev_claims_text_precessed = dev_claims_text.apply(preprocess_data)
evidence_text_processed = evidence_text.apply(preprocess_data)

## 1.3 Word2Vec Embeddings (For Negative sampling)

In [4]:
from gensim.models import Word2Vec

# Train Word2Vec model
# The model is fitted on the preprocessed evidence passages only (claims are not part of the training corpus).
# sg=1 / hs=0 are passed through to gensim — presumably skip-gram with negative sampling; confirm against the gensim docs.
# NOTE(review): no seed is set and workers=12 is used, so embeddings may differ between runs — confirm if reproducibility matters.
word2vec_model = Word2Vec(sentences=evidence_text_processed, vector_size=300, window=10, min_count=3, workers=12, sg=1, hs=0)

def generate_embedding(text, model):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of ``text``.

    Args:
        text: iterable of tokens.
        model: object exposing ``wv`` (with ``key_to_index`` and item lookup)
            and ``vector_size``, e.g. a trained gensim Word2Vec model.

    Returns:
        1-D numpy array of length ``model.vector_size``; an all-zero vector when
        none of the tokens is known to the model.
    """
    # Out-of-vocabulary tokens carry no vector, so they are ignored.
    known_tokens = [token for token in text if token in model.wv.key_to_index]
    if len(known_tokens) == 0:
        return np.zeros(model.vector_size)
    # Average the per-token vectors into a single sentence vector.
    return np.mean([model.wv[token] for token in known_tokens], axis=0)

# Generate embeddings for train claim_text
# Each text becomes a single averaged Word2Vec vector (see generate_embedding); texts without
# any in-vocabulary token get an all-zero vector.
train_claim_text_embeddings = [generate_embedding(text, word2vec_model) for text in train_claims_text_processed]

# Generate embeddings for dev claim_text
dev_claim_text_embeddings = [generate_embedding(text, word2vec_model) for text in dev_claims_text_precessed]

# Generate embeddings for all evidence texts
evidence_embeddings = [generate_embedding(text, word2vec_model) for text in evidence_text_processed]

# Function to compute cosine similarity scores for all claims and evidence embeddings
def compute_similarity_scores(claim_embeddings, evidence_embeddings):
    """Return the pairwise cosine similarity between claims and evidences.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_similarity``: entry
    ``[i, j]`` of the result compares claim ``i`` with evidence ``j``.
    """
    return cosine_similarity(claim_embeddings, evidence_embeddings)


# Compute cosine similarity scores for training claims and evidence embeddings
# Dense (number of claims x number of evidences) similarity matrices.
# NOTE(review): these hold one score per claim/evidence pair, so memory grows with the full evidence corpus size.
train_similarity_scores = compute_similarity_scores(train_claim_text_embeddings, evidence_embeddings)

# Compute cosine similarity scores for development claims and evidence embeddings
dev_similarity_scores = compute_similarity_scores(dev_claim_text_embeddings, evidence_embeddings)

In [7]:
#Evaluate the word2vec recall rate at different k values
def compute_recall_at_k(similarity_scores, true_indices, k):
    """Average recall@k of a similarity matrix against gold evidence indices.

    Args:
        similarity_scores: 2-D array-like, one row of candidate scores per claim.
        true_indices: iterable with one collection of gold candidate indices per
            claim, in the same order as the rows of ``similarity_scores``.
        k: number of top-scoring candidates counted as retrieved. Values larger
            than the number of candidates are clamped.

    Returns:
        Mean over claims of (gold items found in the top k) / (number of gold
        items). Claims without any gold item are skipped because their recall is
        undefined; ``0.0`` is returned when no claim can be scored.
    """
    # Convert similarity scores to PyTorch tensor
    similarity_scores_tensor = torch.FloatTensor(similarity_scores)

    # torch.topk raises when k exceeds the size of the dimension, so clamp it.
    k = min(k, similarity_scores_tensor.size(-1))
    top_k_indices = torch.topk(similarity_scores_tensor, k, dim=-1).indices.tolist()

    recall_values = []
    # Iterating (instead of label-indexing with true_indices[i]) keeps the pairing
    # positional, which also works for a pandas Series with a non-default index.
    for gold_indices, retrieved in zip(true_indices, top_k_indices):
        gold_indices = list(gold_indices)
        if not gold_indices:
            continue
        retrieved_set = set(retrieved)  # O(1) membership instead of scanning a list
        hits = sum(1 for idx in gold_indices if idx in retrieved_set)
        recall_values.append(hits / len(gold_indices))

    if not recall_values:
        return 0.0

    # Compute average recall over all scored samples
    return sum(recall_values) / len(recall_values)


# NOTE(review): training recall is measured at k=15 but dev recall at k=10, so the two
# printed numbers are not directly comparable even though both are labelled "Recall at K".
train_recall_at_k = compute_recall_at_k(train_similarity_scores, train_evidence_idxs, 15)
print("Training Recall at K:", train_recall_at_k)

dev_recall_at_k = compute_recall_at_k(dev_similarity_scores, dev_evidence_idxs, 10)
print("Dev Recall at K:", dev_recall_at_k)

Training Recall at K: 0.1896986970684039
Dev Recall at K: 0.19036796536796524


In [8]:
def get_top_k_indices(similarity_scores, k=1000):
    """Return the ``k`` best-scoring column indices, and their scores, for every row.

    Args:
        similarity_scores: 2-D numpy array (converted to a float tensor) or any
            other object, which is handed to ``torch.topk`` unchanged.
        k: number of entries to keep per row.

    Returns:
        ``(indices, values)`` as nested Python lists, each row sorted from the
        highest to the lowest score.
    """
    scores = (
        torch.FloatTensor(similarity_scores)
        if isinstance(similarity_scores, np.ndarray)
        else similarity_scores
    )

    best = torch.topk(scores, k, dim=1, largest=True, sorted=True)

    # Move to CPU / numpy before converting to plain lists.
    indices_as_lists = best.indices.cpu().numpy().tolist()
    values_as_lists = best.values.cpu().numpy().tolist()
    return indices_as_lists, values_as_lists

# Keep the 50 highest-scoring Word2Vec candidates per claim; they serve as the pool for
# negative sampling (train) and as the candidate list to re-rank (dev).
# The training scores themselves are discarded, the dev scores are kept.
train_top_indices, _ = get_top_k_indices(train_similarity_scores, k=50)
dev_top_indices, dev_orig_scores = get_top_k_indices(dev_similarity_scores, k=50)

288364

In [9]:
embedding_dim = word2vec_model.vector_size
# Token -> row index, in the same order as the rows of word2vec_model.wv.vectors.
vocab = {word: idx for idx, word in enumerate(word2vec_model.wv.index_to_key)}

# Calculate the current maximum index in the vocabulary
max_index = max(vocab.values())


# Two special tokens are appended after the Word2Vec vocabulary; the <cls>/<sep> entries
# below are disabled.
# vocab['<cls>'] = max_index + 1
# vocab['<sep>'] = max_index + 2
vocab['<unk>'] = max_index + 1
vocab['<pad>'] = max_index + 2


# Embedding row for <unk>. Despite the variable name this is a zero vector, not a random one.
random_embeddings = np.zeros((1, embedding_dim))

# Embedding row for <pad>, also a zero vector.
padding_embeddings = np.zeros((1, embedding_dim))  # Typically zero vector for padding

# Row order must match the indices assigned above: Word2Vec rows, then <unk>, then <pad>.
extended_embeddings = np.vstack([
    word2vec_model.wv.vectors,  # Existing embeddings from Word2Vec
    random_embeddings,
    padding_embeddings
])
embedding_matrix = torch.FloatTensor(extended_embeddings)

## 1.4 Dataset Loading

### 1.4.1 Random Negatives

In [10]:
def text_to_indices(text, vocab):
    """Map every token of ``text`` to its vocabulary index.

    Tokens missing from ``vocab`` are mapped to the index stored under ``'<unk>'``.
    """
    token_ids = []
    for token in text:
        token_ids.append(vocab.get(token, vocab['<unk>']))
    return token_ids

class RankingDatasetRandom(Dataset):
    """Yields (claim, one positive evidence, sampled negative evidences).

    Both the positive and the negatives are taken from the claim's top-``k``
    retrieved candidates: the positive is a gold evidence that appears among
    them, the negatives are randomly drawn non-gold candidates.
    """

    def __init__(self, claims, evidences, true_indices, top_k_indices, k=100, neg_samples = 32):
        """Store the data sources.

        Args:
            claims: indexable collection of claims.
            evidences: indexable collection of evidences.
            true_indices: per-claim collection of gold evidence indices.
            top_k_indices: per-claim ranked list of candidate evidence indices.
            k: number of leading candidates considered per claim.
            neg_samples: maximum number of negatives returned per claim.
        """
        self.claims = claims
        self.evidences = evidences
        self.true_indices = true_indices
        self.top_k_indices = top_k_indices
        self.k = k
        self.neg_samples = neg_samples

    def __len__(self):
        """Number of claims."""
        return len(self.claims)

    def __getitem__(self, idx):
        """Return ``(claim, pos_evidence, neg_evidences)`` or ``None``.

        ``None`` is returned when none of the claim's gold evidences is among its
        top-``k`` candidates; the collate function is expected to drop such items.
        """
        claim = self.claims[idx]
        candidates = self.top_k_indices[idx][:self.k]

        # Gold evidences that the first-stage retrieval actually surfaced.
        retrieved_gold = [gold for gold in self.true_indices[idx] if gold in candidates]
        if len(retrieved_gold) == 0:
            return None

        pos_evidence = self.evidences[random.choice(retrieved_gold)]

        # Every remaining candidate is a potential negative.
        negative_pool = [
            self.evidences[candidate]
            for candidate in candidates
            if candidate not in retrieved_gold
        ]
        # Never request more negatives than the pool holds.
        draw_count = min(self.neg_samples, len(negative_pool))
        neg_evidences = random.sample(negative_pool, draw_count)

        return claim, pos_evidence, neg_evidences


def random_collate_fn(batch):
    """Collate ``RankingDatasetRandom`` items into padded index tensors.

    Items that are ``None`` are discarded. Uses the module-level ``vocab`` for
    token lookup and for the padding index.

    Returns:
        ``None`` when every item was ``None`` (the training loop must skip such
        batches); otherwise ``(claims_padded, pos_padded, neg_padded)``.
        ``neg_padded`` is reshaped to ``(batch, negatives_per_claim, seq_len)``,
        which requires every claim to contribute the same number of negatives.
    """
    usable = [sample for sample in batch if sample is not None]
    if len(usable) == 0:
        return None

    claims, pos_evidences, neg_evidences_lists = zip(*usable)

    def encode_and_pad(token_lists):
        # tokens -> vocabulary indices -> one right-padded LongTensor
        sequences = [
            torch.tensor(text_to_indices(tokens, vocab), dtype=torch.long)
            for tokens in token_lists
        ]
        return pad_sequence(sequences, batch_first=True, padding_value=vocab['<pad>'])

    claims_padded = encode_and_pad(claims)
    pos_padded = encode_and_pad(pos_evidences)

    # Negatives are flattened across claims for padding, then regrouped per claim.
    flat_negatives = [neg for per_claim in neg_evidences_lists for neg in per_claim]
    neg_padded = encode_and_pad(flat_negatives)
    if neg_padded.numel() > 0:
        neg_padded = neg_padded.view(len(usable), -1, neg_padded.size(1))

    return claims_padded, pos_padded, neg_padded

### 1.4.2 In-Batch Negatives

In [11]:
class RankingDatasetInBatch(Dataset):
    """Yields (claim, one randomly chosen gold evidence) pairs.

    Negatives are not produced here; ``inbatch_collate_fn`` builds them from the
    positives of the other claims in the same batch.
    """

    def __init__(self, claims, evidences, true_indices, top_indices=None):
        """Store the data sources.

        Args:
            claims: indexable collection of claims.
            evidences: indexable collection of evidences.
            true_indices: per-claim collection of gold evidence indices.
            top_indices: optional per-claim retrieved candidates. Not used by this
                dataset; accepted so it can be constructed with the same four
                arguments as the other ranking datasets (the training cells in
                this notebook call it that way, which previously raised TypeError).
        """
        self.claims = claims
        self.evidences = evidences
        self.true_indices = true_indices
        self.top_indices = top_indices

    def __len__(self):
        """Number of claims."""
        return len(self.claims)

    def __getitem__(self, idx):
        """Return ``(claim, pos_evidence)`` for the claim at ``idx``."""
        pos_idx = random.choice(self.true_indices[idx])  # Randomly sample one positive evidence index
        claim = self.claims[idx]
        pos_evidence = self.evidences[pos_idx]
        return claim, pos_evidence

def inbatch_collate_fn(batch):
    """Collate (claim, positive) pairs and build in-batch negatives.

    For every claim the negatives are the positive evidences of all *other*
    claims in the batch. Uses the module-level ``vocab`` for token lookup and
    for the padding index.

    Returns:
        ``(claims_padded, pos_padded, neg_padded_stack)`` where the last tensor
        has shape ``(batch, batch - 1, max_positive_length)``.
    """
    claims, pos_evidences = zip(*batch)

    claim_ids = [text_to_indices(claim, vocab) for claim in claims]
    pos_ids = [text_to_indices(evidence, vocab) for evidence in pos_evidences]

    def pad_batch(id_lists):
        tensors = [torch.tensor(ids, dtype=torch.long) for ids in id_lists]
        return pad_sequence(tensors, batch_first=True, padding_value=vocab['<pad>'])

    claims_padded = pad_batch(claim_ids)
    pos_padded = pad_batch(pos_ids)

    # Right-pad every positive once to the longest positive in the batch; each
    # claim's negatives are then these rows minus the claim's own positive.
    longest = max([len(ids) for ids in pos_ids])
    fixed_width = [
        F.pad(torch.tensor(ids, dtype=torch.long), (0, longest - len(ids)), value=vocab['<pad>'])
        for ids in pos_ids
    ]

    per_claim_negatives = []
    for own in range(len(batch)):
        others = fixed_width[:own] + fixed_width[own + 1:]
        per_claim_negatives.append(torch.stack(others, dim=0))

    return claims_padded, pos_padded, torch.stack(per_claim_negatives, dim=0)


### 1.4.3 In-Batch + Top Negative

In [12]:
class RankingDatasetInBatchGold(Dataset):
    """Yields (claim, one gold evidence, one hard negative evidence).

    The hard negative is drawn from the highest-ranked retrieved candidates that
    are not gold evidence for the claim.
    """

    def __init__(self, claims, evidences, true_indices, top_indices, num_hard_candidates=20):
        """Store the data sources.

        Args:
            claims: indexable collection of claims.
            evidences: indexable collection of evidences.
            true_indices: per-claim collection of gold evidence indices.
            top_indices: per-claim ranked list of retrieved candidate indices.
            num_hard_candidates: how many of the best non-gold candidates the
                negative is sampled from (default 20, the previously hard-coded value).
        """
        self.claims = claims
        self.evidences = evidences
        self.true_indices = true_indices
        self.top_indices = top_indices
        self.num_hard_candidates = num_hard_candidates

    def __len__(self):
        """Number of claims."""
        return len(self.claims)

    def __getitem__(self, idx):
        """Return ``(claim, pos_evidence, neg_evidence)`` for the claim at ``idx``.

        Raises:
            IndexError: if every retrieved candidate of the claim is gold
                evidence, so no negative can be sampled.
        """
        gold_indices = self.true_indices[idx]
        pos_idx = random.choice(gold_indices)  # Randomly sample one positive evidence index
        claim = self.claims[idx]
        pos_evidence = self.evidences[pos_idx]

        # Sample a negative from the top indices; a set makes the gold filter O(1) per candidate.
        gold_set = set(gold_indices)
        top_neg_indices = [i for i in self.top_indices[idx] if i not in gold_set]
        hard_candidates = top_neg_indices[:self.num_hard_candidates]
        if not hard_candidates:
            raise IndexError(f"No non-gold retrieved candidate available for claim at position {idx}")
        neg_idx = random.choice(hard_candidates)
        neg_evidence = self.evidences[neg_idx]

        return claim, pos_evidence, neg_evidence

def inbatch_gold_collate_fn(batch):
    """Collate (claim, positive, hard negative) triples with in-batch negatives.

    For every claim the negatives are the positives of all other claims in the
    batch followed by the claim's own sampled hard negative. Uses the
    module-level ``vocab`` for token lookup and for the padding index.

    Returns:
        ``(claims_padded, pos_padded, combined_neg_padded_stack)`` where the last
        tensor has shape ``(batch, batch, max_negative_length)``.
    """
    claims, pos_evidences, neg_evidences = zip(*batch)
    pad_idx = vocab['<pad>']

    # Prepare claims, positive evidences, and sampled negative evidences
    claims_indices = [text_to_indices(claim, vocab) for claim in claims]
    pos_indices = [text_to_indices(evidence, vocab) for evidence in pos_evidences]
    neg_indices = [text_to_indices(evidence, vocab) for evidence in neg_evidences]

    # Pad sequences for claims and positive evidences. The sampled negatives are
    # padded per claim below, so no batch-level padding of them is needed.
    claims_padded = pad_sequence([torch.tensor(ci, dtype=torch.long) for ci in claims_indices], batch_first=True, padding_value=pad_idx)
    pos_padded = pad_sequence([torch.tensor(pi, dtype=torch.long) for pi in pos_indices], batch_first=True, padding_value=pad_idx)

    # Generate in-batch negatives: Each claim gets the positive samples of all other claims as its negatives.
    neg_padded_list = []
    for i in range(len(batch)):
        neg_samples = [pos_indices[j] for j in range(len(batch)) if i != j]
        neg_samples.append(neg_indices[i])  # Add the sampled negative evidence for this claim
        neg_padded_list.append(
            pad_sequence([torch.tensor(ni, dtype=torch.long) for ni in neg_samples], batch_first=True, padding_value=pad_idx)
        )

    # Each per-claim block was padded to its own longest sequence; widen all of
    # them to the global maximum so they can be stacked.
    max_length = max(neg.size(1) for neg in neg_padded_list)
    neg_padded_list = [F.pad(neg, (0, max_length - neg.size(1)), value=pad_idx) for neg in neg_padded_list]

    # Stack the list of negative batches to form a single tensor
    combined_neg_padded_stack = torch.stack(neg_padded_list, dim=0)

    return claims_padded, pos_padded, combined_neg_padded_stack

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 2.1 GRU (SiameseNetwork)


In [13]:
class GRU_SiameseNetwork(nn.Module):
    """Siamese encoder: claim and evidence share one embedding layer and one GRU.

    The score of a (claim, evidence) pair is the dot product, or optionally the
    cosine similarity, of the two final GRU hidden states.
    """

    def __init__(self, embedding_matrix, hidden_dim, use_cosine=False):
        """
        Args:
            embedding_matrix: float tensor ``(vocab_size, embedding_dim)`` used to
                initialise the (trainable) embedding layer.
            hidden_dim: GRU hidden size.
            use_cosine: score with cosine similarity instead of the dot product.
        """
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.gru = nn.GRU(embedding_matrix.size(1), hidden_dim, batch_first=True)
        self.use_cosine = use_cosine

    def _encode(self, token_ids):
        """Return the final GRU hidden state, shape ``[batch_size, hidden_dim]``."""
        # NOTE(review): sequences are not packed, so the final state is taken after
        # any trailing padding tokens have been fed through the GRU as well.
        _, hidden = self.gru(self.embedding(token_ids))
        return hidden.squeeze(0)  # drop the single layer/direction dimension

    def forward(self, claim, evidence):
        """Score each claim against the evidence at the same batch position.

        Args:
            claim: LongTensor ``[batch_size, claim_len]`` of token indices.
            evidence: LongTensor ``[batch_size, evidence_len]`` of token indices.

        Returns:
            Tensor of shape ``[batch_size]`` with one score per pair.
        """
        claim_hidden = self._encode(claim)
        evidence_hidden = self._encode(evidence)

        if self.use_cosine:
            return F.cosine_similarity(claim_hidden, evidence_hidden)

        # Row-wise dot product. Unlike bmm(...).squeeze(), this keeps the batch
        # dimension when the batch holds a single pair, so callers can always
        # apply .unsqueeze(1) to the result.
        return (claim_hidden * evidence_hidden).sum(dim=-1)

## 2.2 GRU + Attention

In [14]:
class Attention(nn.Module):
    """Additive-style attention pooling over the time dimension of GRU outputs."""

    def __init__(self, hidden_dim):
        super().__init__()
        # One scalar score per time step (unidirectional GRU, so hidden_dim inputs).
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, outputs):
        """Pool ``outputs`` ``[batch, seq_len, hidden_dim]`` into ``[batch, hidden_dim]``.

        Every time step receives a weight (softmax over ``seq_len`` of a tanh-squashed
        linear score); the result is the weighted sum of the time steps.
        """
        step_scores = self.attn(outputs).tanh()
        step_weights = step_scores.softmax(dim=1)
        return torch.sum(step_weights * outputs, dim=1)

class GRU_Attn_SiameseNetwork(nn.Module):
    """Siamese GRU encoder with attention pooling and dot-product scoring."""

    def __init__(self, embedding_matrix, hidden_dim, dropout_rate=0.5):
        """
        Args:
            embedding_matrix: float tensor ``(vocab_size, embedding_dim)`` used to
                initialise the (trainable) embedding layer.
            hidden_dim: GRU hidden size.
            dropout_rate: dropout applied to the GRU outputs before attention.
        """
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.hidden_dim = hidden_dim
        # Single layer unidirectional GRU
        self.gru = nn.GRU(embedding_matrix.size(1), hidden_dim, num_layers=1, batch_first=True, dropout=0)
        self.dropout = nn.Dropout(dropout_rate)
        self.attention = Attention(hidden_dim)  # Adjusted for unidirectional output

    def forward_one(self, text):
        """Encode token indices ``[n, seq_len]`` into context vectors ``[n, hidden_dim]``."""
        gru_outputs, _ = self.gru(self.embedding(text))
        return self.attention(self.dropout(gru_outputs))

    def forward(self, claims, evidences):
        """Score every claim against its own group of evidences.

        Args:
            claims: LongTensor ``[batch, claim_len]``.
            evidences: LongTensor whose last dimension is the sequence length,
                e.g. ``[batch, n_evidences, evidence_len]`` or ``[batch, evidence_len]``.

        Returns:
            Tensor ``[batch, n_evidences]`` of dot-product similarities.
        """
        batch_size = claims.size(0)
        claim_vectors = self.forward_one(claims)

        # Encode all evidences in one pass, then regroup them per claim.
        evidence_vectors = self.forward_one(evidences.view(-1, evidences.size(-1)))
        evidence_vectors = evidence_vectors.view(batch_size, -1, self.hidden_dim)

        # Calculate dot product
        claim_rows = claim_vectors.unsqueeze(1)
        evidence_cols = evidence_vectors.permute(0, 2, 1)
        return torch.bmm(claim_rows, evidence_cols).squeeze(1)


## 2.3 Loss Function

In [15]:
def listwise_loss(model, claims_emb, pos_evidences_emb, neg_evidences_emb):
    """Softmax cross-entropy over one positive and several negatives per claim.

    Args:
        model: callable ``model(claims, evidences)`` returning one score per claim.
        claims_emb: claim batch, passed to ``model`` unchanged.
        pos_evidences_emb: positive evidence batch, passed to ``model`` unchanged.
        neg_evidences_emb: negatives with the batch on dim 0 and the negative
            index on dim 1; ``model`` is called once per negative index.

    Returns:
        Scalar loss: mean negative log-probability of the positive (column 0).
    """
    positive_column = model(claims_emb, pos_evidences_emb).unsqueeze(1)

    negative_columns = []
    for slot in range(neg_evidences_emb.size(1)):
        # The slot-th negative of every claim in the batch.
        negative_columns.append(model(claims_emb, neg_evidences_emb[:, slot]))
    negative_scores = torch.stack(negative_columns, dim=1)

    logits = torch.cat((positive_column, negative_scores), dim=1).squeeze(-1)

    # Create target tensor where the index of positive examples is always 0
    positive_first = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)

    # cross_entropy == nll_loss(log_softmax(logits, dim=1), target)
    return F.cross_entropy(logits, positive_first)

In [16]:
def margin_ranking_loss(model, claims, pos_evidences, neg_evidences, margin=1.5):
    """Pairwise hinge loss pushing the positive score above every negative score.

    Args:
        model: callable ``model(claims, evidences)`` returning one score per claim.
        claims: claim batch, passed to ``model`` unchanged.
        pos_evidences: positive evidence batch, passed to ``model`` unchanged.
        neg_evidences: tensor indexed as ``[batch, negative, seq]``; ``model`` is
            called once per negative index.
        margin: required gap between the positive and each negative score.

    Returns:
        Scalar loss: mean of ``max(0, margin - (pos_score - neg_score))`` over
        all (claim, negative) pairs.
    """
    # Get the scores for positive evidence
    pos_scores = model(claims, pos_evidences).unsqueeze(1)

    # Get the scores for negative evidence, one column per negative
    neg_scores = torch.cat(
        [model(claims, neg_evidences[:, i, :]).unsqueeze(1) for i in range(neg_evidences.shape[1])],
        dim=1,
    )

    # A target of +1 tells margin_ranking_loss that the first input (positive) should rank higher.
    target = torch.ones_like(neg_scores, device=claims.device)
    pos_scores = pos_scores.expand_as(neg_scores)  # Expand pos_scores to match neg_scores shape
    return F.margin_ranking_loss(pos_scores, neg_scores, target, margin=margin)

## 2.4 Model Training

In [17]:
def topk_indices(indices, k=100):
    """Truncate every per-claim candidate list to at most ``k`` leading entries."""
    truncated = []
    for position in range(len(indices)):
        row = indices[position]
        cutoff = min(k, len(row))
        truncated.append(row[:cutoff])
    return truncated

def evaluate_model(model, claims, evidence_texts, dev_top_indices, top_k, vocab, pad_idx):
    """Re-rank each claim's first-stage candidates with ``model``.

    Args:
        model: scoring model, handed to ``score_query``.
        claims: indexable collection of claims.
        evidence_texts: indexable collection of evidences.
        dev_top_indices: per-claim ranked candidate evidence indices.
        top_k: number of leading candidates to re-rank per claim.
        vocab, pad_idx: forwarded to ``score_query``.

    Returns:
        One list per claim holding its candidate indices ordered from the
        highest to the lowest model score.
    """
    candidates_per_claim = topk_indices(dev_top_indices, k=top_k)

    # score_query is defined elsewhere in the notebook — presumably it returns one
    # score per candidate evidence; confirm against its definition.
    scores_per_claim = [
        score_query(
            model,
            claims[position],
            [evidence_texts[evidence_idx] for evidence_idx in candidates_per_claim[position]],
            vocab,
            pad_idx,
        )
        for position in range(len(claims))
    ]

    reranked_indices = []
    for candidates, scores in zip(candidates_per_claim, scores_per_claim):
        ranked_pairs = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
        reranked_indices.append([evidence_idx for evidence_idx, _ in ranked_pairs])

    return reranked_indices

def train_model(model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, eval_fn, eval_data, vocab, pad_idx, topk=10):
    """Train ``model`` and report retrieval metrics after every epoch.

    Args:
        model: network to train.
        dataloader: yields ``(claims, pos_evidences, neg_evidences)`` tensor
            triples, or ``None`` for batches that must be skipped.
        criterion: callable ``criterion(model, claims, pos, neg)`` returning the loss.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: scheduler stepped once per epoch with the average training loss.
        device: device the batch tensors are moved to.
        num_epochs: number of passes over ``dataloader``.
        clip_value: maximum gradient norm.
        eval_fn: re-ranking function with the signature of ``evaluate_model``.
        eval_data: dict with keys ``'claims'``, ``'evidences'``, ``'top_indices'``
            and ``'ground_truth'``.
        vocab, pad_idx: forwarded to ``eval_fn``.
        topk: number of first-stage candidates re-ranked per claim during evaluation.
    """
    for epoch in range(num_epochs):
        print("\n" + "#" * 50)  # Print separation line
        print(f"Starting Epoch {epoch+1}/{num_epochs}")
        model.train()
        epoch_loss = 0.0
        batches_used = 0
        for batch in dataloader:
            if batch is None:
                continue

            claim, pos_evidences, neg_evidences = batch
            claim = claim.to(device)
            pos_evidences = pos_evidences.to(device)
            neg_evidences = neg_evidences.to(device)

            optimizer.zero_grad()
            loss = criterion(model, claim, pos_evidences, neg_evidences)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            epoch_loss += loss.item()
            batches_used += 1

        # Average over the batches that actually contributed a loss. Dividing by
        # len(dataloader) would under-report the loss whenever batches were
        # skipped, and would divide by zero for an empty loader.
        avg_epoch_loss = epoch_loss / batches_used if batches_used else float('nan')
        print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_epoch_loss}')

        if batches_used:
            scheduler.step(avg_epoch_loss)
        else:
            print("No usable training batch in this epoch; scheduler step skipped.")
        current_lr = scheduler.optimizer.param_groups[0]['lr']
        print(f"Current Learning Rate: {current_lr}")

        # Evaluation at the end of each epoch
        model.eval()
        with torch.no_grad():
            reranked_indices = eval_fn(model, eval_data['claims'], eval_data['evidences'], eval_data['top_indices'], top_k=topk, vocab=vocab, pad_idx=pad_idx)
            # evaluate_evidence_retrieval is defined elsewhere in the notebook; metrics are taken at a fixed k=5.
            results = evaluate_evidence_retrieval(reranked_indices, eval_data['ground_truth'], k=5)
            print(f"Epoch {epoch+1} Evaluation - Recall: {results['average_recall']}, Precision: {results['average_precision']}, F1 Score: {results['average_fscore']}")


### 2.4.1 GRU + Listwise

#### Random negatives

In [None]:
# Experiment: GRU Siamese model, listwise loss, negatives sampled at random from the
# Word2Vec top candidates of each claim.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#data loading
# RankingDatasetRandom uses its defaults here (k=100, neg_samples=32); train_top_indices
# was built with k=50, so at most 50 candidates per claim are available.
dataset = RankingDatasetRandom(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=random_collate_fn)
# dataset = RankingDatasetInBatch(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_collate_fn)

# Initialize model
gru_model = GRU_SiameseNetwork(embedding_matrix, hidden_dim=256).to(device)

criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
# NOTE(review): with patience=5 and only 5 epochs the learning rate reduction presumably never triggers — confirm.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Maximum gradient norm passed to clip_grad_norm_ inside train_model.
clip_value = 1.0

num_epochs = 5

# Dev split used for the per-epoch re-ranking evaluation.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(gru_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 2.1955058680142567
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.03950216450216451, Precision: 0.02727272727272728, F1 Score: 0.02976190476190477

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 1.0037813795165524
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.0818181818181818, Precision: 0.04415584415584418, F1 Score: 0.05321583178726038

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 1.23356187847662
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.10508658008658006, Precision: 0.05714285714285715, F1 Score: 0.06877963306534736

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 0.9071867422446592
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.112012987012987, Precision: 0.05714285714285715, F1 Score: 0

#### In Batch Negatives

In [None]:
# Experiment: GRU Siamese model, listwise loss, in-batch negatives only.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#data loading
# RankingDatasetInBatch needs only claims, evidences and the gold indices; the retrieved
# top indices are not part of its constructor, so they are not passed.
dataset = RankingDatasetInBatch(train_claims_text_processed, evidence_text_processed, train_evidence_idxs)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_collate_fn)

# Initialize model
gru_model = GRU_SiameseNetwork(embedding_matrix, hidden_dim=256).to(device)

criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Maximum gradient norm passed to clip_grad_norm_ inside train_model.
clip_value = 1.0

num_epochs = 5

# Dev split used for the per-epoch re-ranking evaluation.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(gru_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 3.9717476673615284
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.04166666666666667, Precision: 0.02857142857142858, F1 Score: 0.0313852813852814

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 3.16024518929995
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.047294372294372305, Precision: 0.03246753246753248, F1 Score: 0.035930735930735945

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 2.2243050888706093
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.06482683982683983, Precision: 0.036363636363636376, F1 Score: 0.042383013811585254

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 2.434563590131655
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.062987012987013, Precision: 0.03766233766233768, F1 Score

#### In-Batch + Top Negatives

In [None]:
# Experiment: GRU Siamese model, listwise loss, in-batch negatives plus one hard negative
# sampled from the top retrieved (non-gold) candidates of each claim.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#data loading
# The result must be bound to `dataset` (it was misspelled `dateset`), otherwise the
# DataLoader below silently reuses the dataset left over from a previous cell.
dataset = RankingDatasetInBatchGold(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_gold_collate_fn)

# Initialize model
gru_model = GRU_SiameseNetwork(embedding_matrix, hidden_dim=256).to(device)

criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Maximum gradient norm passed to clip_grad_norm_ inside train_model.
clip_value = 1.0

num_epochs = 10

# Dev split used for the per-epoch re-ranking evaluation.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(gru_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 4.785622691496824
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.04913419913419914, Precision: 0.033766233766233784, F1 Score: 0.037280972995258725

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 4.2801327992895475
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.0471861471861472, Precision: 0.029870129870129873, F1 Score: 0.034070294784580506

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 3.479642996090824
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.09036796536796536, Precision: 0.048051948051948075, F1 Score: 0.057926200783343644

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 1.2899046310093163
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.09545454545454544, Precision: 0.05064935064935

### 2.4.2 GRU + MarginRanking

#### Random Negatives

In [None]:
# GRU Siamese encoder + margin ranking loss, using the RankingDatasetRandom /
# random_collate_fn pairing.
dataset = RankingDatasetRandom(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(dataset, collate_fn=random_collate_fn, batch_size=32, shuffle=True)

# Run-level settings.
num_epochs = 5
clip_value = 1.0

# Model, loss, optimiser and learning-rate schedule.
gru_model = GRU_SiameseNetwork(embedding_matrix, hidden_dim=256).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-split inputs passed to train_model alongside evaluate_model.
eval_data = dict(
    claims=dev_claims_text_precessed,
    evidences=evidence_text_processed,
    top_indices=dev_top_indices,
    ground_truth=dev_evidence_idxs,
)

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##############################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 0.47602683692597425
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.04112554112554113, Precision: 0.02467532467532468, F1 Score: 0.02855081426509998

##############################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 0.2963211371291739
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.0420995670995671, Precision: 0.02597402597402598, F1 Score: 0.02970521541950114

##############################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 0.25288911521410906
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.04621212121212122, Precision: 0.02727272727272728, F1 Score: 0.03188517831374975

##############################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 0.09329307737061754
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.0764069264069264, Precision: 0.04155844155844158, F1 Score: 0.049866007008864156

##############################
Starting Epoch 5/5
Epo

#### In-Batch Negatives

In [None]:
# GRU Siamese encoder + margin ranking loss, using the RankingDatasetInBatch /
# inbatch_collate_fn pairing.
dataset = RankingDatasetInBatch(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(dataset, collate_fn=inbatch_collate_fn, batch_size=32, shuffle=True)

# Run-level settings.
num_epochs = 5
clip_value = 1.0

# Model, loss, optimiser and learning-rate schedule.
gru_model = GRU_SiameseNetwork(embedding_matrix, hidden_dim=256).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-split inputs passed to train_model alongside evaluate_model.
eval_data = dict(
    claims=dev_claims_text_precessed,
    evidences=evidence_text_processed,
    top_indices=dev_top_indices,
    ground_truth=dev_evidence_idxs,
)

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 1.2086798911197827
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.04166666666666667, Precision: 0.02857142857142858, F1 Score: 0.0313852813852814

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 0.9666505486537249
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.04274891774891776, Precision: 0.02857142857142858, F1 Score: 0.031617192331478056

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 0.6468074870510743
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.03625541125541126, Precision: 0.02467532467532468, F1 Score: 0.026875901875901883

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 1.0438217436058972
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.04772727272727273, Precision: 0.02987012987012988, F1 Sc

#### In-Batch Negatives + Gold

In [None]:
# GRU Siamese encoder + margin ranking loss, fed by RankingDatasetInBatchGold.
# Fix: the dataset used to be bound to the misspelled name `dateset`, so the
# DataLoader below silently reused whatever `dataset` an earlier cell had left
# in the kernel, combined with inbatch_gold_collate_fn.
dataset = RankingDatasetInBatchGold(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_gold_collate_fn)

# Initialize model
gru_model = GRU_SiameseNetwork(embedding_matrix, hidden_dim=256).to(device)

criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)

# Halve the learning rate when the monitored (minimised) value stops improving for 5 steps.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

clip_value = 1.0  # presumably the gradient-clipping threshold used inside train_model -- confirm there

num_epochs = 5

# Dev-split inputs passed to train_model alongside evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(gru_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 1.0463474919207585
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.03820346320346321, Precision: 0.02597402597402598, F1 Score: 0.02846320346320347

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 0.8968893769077766
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.04231601731601732, Precision: 0.02727272727272728, F1 Score: 0.030643166357452085

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 0.7256629971118692
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.06168831168831169, Precision: 0.03376623376623378, F1 Score: 0.03960523603380748

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 0.5660642033132414
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.07110389610389613, Precision: 0.036363636363636376, F1 S

### 2.4.3 GRU_ATTENTION + Listwise

#### Random Negatives

In [None]:
# GRU-with-attention Siamese encoder + listwise loss, using the
# RankingDatasetRandom / random_collate_fn pairing.
dataset = RankingDatasetRandom(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(dataset, collate_fn=random_collate_fn, batch_size=32, shuffle=True)

# Run-level settings.
num_epochs = 5
clip_value = 1.0

# Model, loss, optimiser and learning-rate schedule.
gru_model = GRU_Attn_SiameseNetwork(embedding_matrix, hidden_dim=256).to(device)
criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-split inputs passed to train_model alongside evaluate_model.
eval_data = dict(
    claims=dev_claims_text_precessed,
    evidences=evidence_text_processed,
    top_indices=dev_top_indices,
    ground_truth=dev_evidence_idxs,
)

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##############################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 1.7771361332912095
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.04664502164502165, Precision: 0.02857142857142858, F1 Score: 0.03285920428777573

##############################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 2.9392272769645595
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.05173160173160174, Precision: 0.02727272727272728, F1 Score: 0.03289527932385076

##############################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 1.054238751872551
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.07564935064935065, Precision: 0.04155844155844158, F1 Score: 0.04934034219748506

##############################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 0.7275857265301632
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.11190476190476188, Precision: 0.05714285714285716, F1 Score: 0.07005772005772007

##############################
Starting Epoch 5/5
Epoch 

#### In Batch Negatives

In [23]:
# GRU-with-attention Siamese encoder + listwise loss, using the
# RankingDatasetInBatch / inbatch_collate_fn pairing.
# NOTE(review): this cell builds the dataset from three arguments (no
# train_top_indices), unlike the other RankingDatasetInBatch cells -- confirm
# which signature the class currently has.
dataset = RankingDatasetInBatch(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
)
dataloader = DataLoader(dataset, collate_fn=inbatch_collate_fn, batch_size=32, shuffle=True)

# Prefer the GPU when one is visible to torch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run-level settings.
num_epochs = 10
clip_value = 1.0

# Model, loss, optimiser and learning-rate schedule.
gru_model = GRU_Attn_SiameseNetwork(embedding_matrix, hidden_dim=256).to(device)
criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-split inputs passed to train_model alongside evaluate_model.
eval_data = dict(
    claims=dev_claims_text_precessed,
    evidences=evidence_text_processed,
    top_indices=dev_top_indices,
    ground_truth=dev_evidence_idxs,
)

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 2.8855569454339833
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.06233766233766234, Precision: 0.036363636363636376, F1 Score: 0.04272830344258917

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 2.1878171395032835
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.0617965367965368, Precision: 0.033766233766233784, F1 Score: 0.03984230055658629

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 1.855283335233346
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.07932900432900433, Precision: 0.04675324675324678, F1 Score: 0.05435992578849723

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 1.6409508968010926
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.09989177489177489, Precision: 0.05324675324675326

#### In Batch + Gold

In [None]:
# GRU-with-attention Siamese encoder + listwise loss, fed by RankingDatasetInBatchGold.
# Fix: the dataset used to be bound to the misspelled name `dateset`, so the
# DataLoader below silently reused whatever `dataset` an earlier cell had left
# in the kernel, combined with inbatch_gold_collate_fn.
dataset = RankingDatasetInBatchGold(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_gold_collate_fn)

# Initialize model
gru_model = GRU_Attn_SiameseNetwork(embedding_matrix, hidden_dim=256).to(device)

criterion = listwise_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.001)

# Halve the learning rate when the monitored (minimised) value stops improving for 5 steps.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

clip_value = 1.0  # presumably the gradient-clipping threshold used inside train_model -- confirm there

num_epochs = 5

# Dev-split inputs passed to train_model alongside evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(gru_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/5
Epoch 1/5, Average Loss: 3.1354176631340613
Current Learning Rate: 0.001
Epoch 1 Evaluation - Recall: 0.037987012987013, Precision: 0.024675324675324677, F1 Score: 0.027525252525252532

##################################################
Starting Epoch 2/5
Epoch 2/5, Average Loss: 2.4771112769077988
Current Learning Rate: 0.001
Epoch 2 Evaluation - Recall: 0.03993506493506494, Precision: 0.024675324675324677, F1 Score: 0.028081838796124522

##################################################
Starting Epoch 3/5
Epoch 3/5, Average Loss: 2.277987898924412
Current Learning Rate: 0.001
Epoch 3 Evaluation - Recall: 0.0405844155844156, Precision: 0.027272727272727275, F1 Score: 0.030122655122655138

##################################################
Starting Epoch 4/5
Epoch 4/5, Average Loss: 1.9903023657747185
Current Learning Rate: 0.001
Epoch 4 Evaluation - Recall: 0.05281385281385282, Precision: 0.03766233766233768, F1 S

### 2.4.4 GRU_ATTENTION + MarginRanking

#### Random

In [None]:
# GRU-with-attention Siamese encoder + margin ranking loss, using the
# RankingDatasetRandom / random_collate_fn pairing.
dataset = RankingDatasetRandom(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(dataset, collate_fn=random_collate_fn, batch_size=32, shuffle=True)

# Run-level settings.
num_epochs = 10
clip_value = 1.0

# Model, loss, optimiser and learning-rate schedule (wider hidden state and a
# smaller learning rate than the listwise runs).
gru_model = GRU_Attn_SiameseNetwork(embedding_matrix, hidden_dim=512).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-split inputs passed to train_model alongside evaluate_model.
eval_data = dict(
    claims=dev_claims_text_precessed,
    evidences=evidence_text_processed,
    top_indices=dev_top_indices,
    ground_truth=dev_evidence_idxs,
)

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 0.6597070049051231
Current Learning Rate: 0.0001
Epoch 1 Evaluation - Recall: 0.04134199134199135, Precision: 0.02857142857142858, F1 Score: 0.031240981240981257

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 0.6697607715017139
Current Learning Rate: 0.0001
Epoch 2 Evaluation - Recall: 0.03841991341991343, Precision: 0.02597402597402598, F1 Score: 0.028499278499278507

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 0.7035767276628087
Current Learning Rate: 0.0001
Epoch 3 Evaluation - Recall: 0.03841991341991343, Precision: 0.02597402597402598, F1 Score: 0.028499278499278507

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 0.40746428398415446
Current Learning Rate: 0.0001
Epoch 4 Evaluation - Recall: 0.03755411255411256, Precision: 0.025974025

#### In batch

In [None]:
# GRU-with-attention Siamese encoder + margin ranking loss, using the
# RankingDatasetInBatch / inbatch_collate_fn pairing.
dataset = RankingDatasetInBatch(
    train_claims_text_processed,
    evidence_text_processed,
    train_evidence_idxs,
    train_top_indices,
)
dataloader = DataLoader(dataset, collate_fn=inbatch_collate_fn, batch_size=32, shuffle=True)

# Run-level settings.
num_epochs = 10
clip_value = 1.0

# Model, loss, optimiser and learning-rate schedule.
gru_model = GRU_Attn_SiameseNetwork(embedding_matrix, hidden_dim=512).to(device)
criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Dev-split inputs passed to train_model alongside evaluate_model.
eval_data = dict(
    claims=dev_claims_text_precessed,
    evidences=evidence_text_processed,
    top_indices=dev_top_indices,
    ground_truth=dev_evidence_idxs,
)

train_model(
    gru_model, dataloader, criterion, optimizer, scheduler, device,
    num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'],
)


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 1.111365088285544
Current Learning Rate: 0.0001
Epoch 1 Evaluation - Recall: 0.03787878787878788, Precision: 0.02597402597402598, F1 Score: 0.028318903318903327

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 1.0481630907608912
Current Learning Rate: 0.0001
Epoch 2 Evaluation - Recall: 0.03658008658008659, Precision: 0.02467532467532468, F1 Score: 0.027020202020202028

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 0.9053717437080848
Current Learning Rate: 0.0001
Epoch 3 Evaluation - Recall: 0.03398268398268399, Precision: 0.022077922077922082, F1 Score: 0.024422799422799433

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 0.6300904668676548
Current Learning Rate: 0.0001
Epoch 4 Evaluation - Recall: 0.03647186147186148, Precision: 0.0220779220

#### In Batch + Gold

In [None]:
# GRU-with-attention Siamese encoder + margin ranking loss, fed by RankingDatasetInBatchGold.
# Fix: the dataset used to be bound to the misspelled name `dateset`, so the
# DataLoader below silently reused whatever `dataset` an earlier cell had left
# in the kernel, combined with inbatch_gold_collate_fn.
dataset = RankingDatasetInBatchGold(train_claims_text_processed, evidence_text_processed, train_evidence_idxs, train_top_indices)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=inbatch_gold_collate_fn)
# Initialize model
gru_model = GRU_Attn_SiameseNetwork(embedding_matrix, hidden_dim=512).to(device)

criterion = margin_ranking_loss
optimizer = optim.Adam(gru_model.parameters(), lr=0.0001)

# Halve the learning rate when the monitored (minimised) value stops improving for 5 steps.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

clip_value = 1.0  # presumably the gradient-clipping threshold used inside train_model -- confirm there

num_epochs = 10

# Dev-split inputs passed to train_model alongside evaluate_model.
eval_data = {
    'claims': dev_claims_text_precessed,
    'evidences': evidence_text_processed,
    'top_indices': dev_top_indices,
    'ground_truth': dev_evidence_idxs
}


train_model(gru_model, dataloader, criterion, optimizer, scheduler, device, num_epochs, clip_value, evaluate_model, eval_data, vocab, vocab['<pad>'])


##################################################
Starting Epoch 1/10
Epoch 1/10, Average Loss: 1.1877917700853102
Current Learning Rate: 0.0001
Epoch 1 Evaluation - Recall: 0.03787878787878789, Precision: 0.02597402597402598, F1 Score: 0.028318903318903327

##################################################
Starting Epoch 2/10
Epoch 2/10, Average Loss: 0.9802435988034958
Current Learning Rate: 0.0001
Epoch 2 Evaluation - Recall: 0.03625541125541126, Precision: 0.024675324675324684, F1 Score: 0.026875901875901883

##################################################
Starting Epoch 3/10
Epoch 3/10, Average Loss: 1.0351166549401405
Current Learning Rate: 0.0001
Epoch 3 Evaluation - Recall: 0.03268398268398269, Precision: 0.02077922077922078, F1 Score: 0.023124098124098127

##################################################
Starting Epoch 4/10
Epoch 4/10, Average Loss: 0.6907822417143064
Current Learning Rate: 0.0001
Epoch 4 Evaluation - Recall: 0.0326839826839827, Precision: 0.0207792207

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [21]:
def score_query(model, query, evidences, vocab, pad_idx):
    """Score one query against every candidate evidence with ``model``.

    Args:
        model: Module called as ``model(query_batch, evidence_batch)`` and
            expected to return a single-element tensor for a batch of one pair.
        query: Query text, converted with ``text_to_indices(query, vocab)``.
        evidences: Iterable of evidence texts, converted the same way.
        vocab: Vocabulary handed to ``text_to_indices``.
        pad_idx: Index used to right-pad the evidence sequences to equal length.

    Returns:
        A list of Python floats, one per evidence, in the order given. An empty
        list when there are no evidences.

    Side effects:
        Switches ``model`` to evaluation mode and leaves it there.
    """
    # Convert with the same function used during training.
    evidence_indices = [text_to_indices(evidence, vocab) for evidence in evidences]
    if not evidence_indices:
        # pad_sequence cannot handle an empty list; there is nothing to score.
        return []
    query_indices = text_to_indices(query, vocab)

    # dtype is forced to long because an empty token list would otherwise become
    # a float32 tensor, which cannot be used as embedding indices.
    query_tensor = pad_sequence(
        [torch.tensor(query_indices, dtype=torch.long)],
        batch_first=True, padding_value=pad_idx)
    evidence_tensors = pad_sequence(
        [torch.tensor(ei, dtype=torch.long) for ei in evidence_indices],
        batch_first=True, padding_value=pad_idx)

    # Follow the model's own device; fall back to the notebook-level `device`
    # only for a model that exposes no parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device
    query_tensor = query_tensor.to(target_device)
    evidence_tensors = evidence_tensors.to(target_device)

    # Set the model to evaluation mode and disable gradient computation
    model.eval()
    scores = []
    with torch.no_grad():
        # Evidences are scored one at a time, each as a batch of size 1.
        for i in range(evidence_tensors.shape[0]):
            score = model(query_tensor, evidence_tensors[i].unsqueeze(0))
            scores.append(score.item())

    return scores

In [22]:
def evaluate_evidence_retrieval(predicted_indices_list, actual_indices_list, k=5):
    """Compute macro-averaged recall, precision and F-score of top-k retrieval.

    Args:
        predicted_indices_list: One ranked sequence of predicted evidence
            indices per claim; elements may be ints or single-element tensors.
        actual_indices_list: One collection of ground-truth evidence indices
            per claim, aligned with ``predicted_indices_list``.
        k: Number of leading predictions considered per claim. Precision is
            always divided by ``k``, even when fewer predictions are supplied.

    Returns:
        Dict with keys ``average_recall``, ``average_precision`` and
        ``average_fscore``. All three are 0.0 when no claims are given.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(predicted_indices_list) == len(actual_indices_list), "Both inputs must have the same length."

    num_claims = len(predicted_indices_list)
    if num_claims == 0:
        # Nothing to average; avoid dividing by zero below.
        return {
            "average_recall": 0.0,
            "average_precision": 0.0,
            "average_fscore": 0.0
        }

    total_recall = 0.0
    total_precision = 0.0
    total_fscore = 0.0

    for predicted_indices, actual_indices in zip(predicted_indices_list, actual_indices_list):
        # Convert tensors in predicted_indices to plain Python numbers so set membership works
        predicted_indices = [index.item() if isinstance(index, torch.Tensor) else index for index in predicted_indices]

        # Retrieve the top k predictions
        top_k_predicted = set(predicted_indices[:k])
        actual_indices_set = set(actual_indices)

        # Calculate the number of correct predictions
        correct_predictions = len(top_k_predicted & actual_indices_set)

        # Calculate metrics; a claim with no hits contributes zeros.
        if correct_predictions > 0:
            recall = correct_predictions / len(actual_indices_set)
            precision = correct_predictions / k
            denominator = precision + recall
            fscore = 2 * (precision * recall) / denominator if denominator != 0 else 0.0
        else:
            recall = 0.0
            precision = 0.0
            fscore = 0.0

        # Accumulate the metrics to calculate averages later
        total_recall += recall
        total_precision += precision
        total_fscore += fscore

    return {
        "average_recall": total_recall / num_claims,
        "average_precision": total_precision / num_claims,
        "average_fscore": total_fscore / num_claims
    }

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*