In [None]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import pickle
from sif_src.utils import load_glove_vectors

In [None]:
# Load the validation DataFrame from a local pickle backup (presumably the
# MS MARCO validation split, judging by the filename — confirm).
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load backups that were produced by a trusted run.
valid_df = pd.read_pickle("../pickle_backups/marco_valid_df2024-06-04T17.38.1717490321.pickle")

In [None]:
# Pull out the "passages" column; each entry is later indexed with the
# "passage_text" key to obtain that row's passage texts.
valid_passages = valid_df["passages"]

In [None]:
# Collect every row's "passage_text" entry, in row order.
# Iterating the Series directly is positional, whereas `valid_passages[i]` is a
# label lookup that raises KeyError (or silently reorders rows) as soon as the
# DataFrame index is anything other than 0..n-1, e.g. after filtering.
valid_texts = [passage["passage_text"] for passage in valid_passages]

In [None]:
# Attach the extracted passage texts as a new "texts" column. A plain list is
# assigned positionally, so it must hold one entry per row, in row order.
valid_df["texts"] = valid_texts

In [None]:
# Load the pre-trained GloVe vectors (50-dimensional, going by the filename).
# The helper returns a pair: the vectors and a word -> index mapping; later
# cells look words up in `word_to_index` and use the result to index
# `glove_vectors`.
# NOTE(review): the exact container type of `glove_vectors` is defined in
# sif_src.utils and is not visible here — confirm before relying on it.
glove_vectors, word_to_index = load_glove_vectors('../wv/glove.6B.50d.txt')

In [None]:
from collections import Counter

# Corpus-wide token tallies: every passage in every row of the 'texts' column
# is split on whitespace and each resulting token is counted as-is
# (no lower-casing or punctuation stripping).
word_counts = Counter(
    token
    for passages in valid_df['texts']
    for passage in passages
    for token in passage.split()
)

In [None]:
def sif_weight(word, a=1e-3):
    """Return the smoothing weight ``a / (a + count)`` for ``word``.

    ``count`` is looked up in the module-level ``word_counts`` tally.

    NOTE(review): ``count`` is a raw occurrence count, not a relative
    frequency. The published SIF scheme weights by ``a / (a + p(w))`` with
    ``p(w)`` a unigram probability — confirm that raw counts are intended.

    Args:
        word: Token whose weight is requested.
        a: Smoothing constant.

    Returns:
        The weight as a float.
    """
    occurrences = word_counts[word]
    denominator = a + occurrences
    return a / denominator

def sentence_to_sif(sentence, embeddings_index, embedding_dim=50, a=1e-3):
    """Return a weighted average of the word vectors in ``sentence``.

    The sentence is split on whitespace. Each token's vector (when
    ``embeddings_index.get(token)`` returns one) is scaled by
    ``sif_weight(token, a)``; the scaled vectors are summed and divided by the
    sum of the weights of *all* tokens, so tokens without a vector pull the
    result towards zero.

    NOTE(review): tokens are looked up with ``embeddings_index.get(word)``,
    i.e. a word-keyed mapping is expected. Elsewhere in this notebook the
    GloVe vectors are indexed by integer via ``word_to_index`` — confirm that
    the object passed in here really supports word-keyed ``.get``.

    Args:
        sentence: Text to embed.
        embeddings_index: Object with ``.get(word)`` returning a vector of
            length ``embedding_dim`` or ``None``.
        embedding_dim: Length of each word vector.
        a: Smoothing constant forwarded to ``sif_weight``.

    Returns:
        A 1-D numpy array of length ``embedding_dim`` (all zeros for an empty
        sentence).
    """
    words = sentence.split()
    # Forward `a`: previously the argument was accepted but never used, so a
    # caller-supplied smoothing constant was silently ignored.
    weights = [sif_weight(word, a) for word in words]
    embedding_matrix = np.zeros((len(words), embedding_dim))
    for i, word in enumerate(words):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector * weights[i]
    # The small epsilon keeps the division defined when there are no tokens.
    return np.sum(embedding_matrix, axis=0) / (np.sum(weights) + 1e-9)

In [None]:
def glove_embeddings(passage, word_to_index, embeddings_index, embedding_dim=50):
    """Return the unweighted mean of the word vectors in ``passage``.

    The passage is split on whitespace and each token is lower-cased before
    being looked up in ``word_to_index``. Tokens that are not found keep an
    all-zero row, so they still count towards the mean.

    Args:
        passage: Text to embed.
        word_to_index: Mapping from lower-cased word to a key/position in
            ``embeddings_index``.
        embeddings_index: Container indexed by the values of
            ``word_to_index``; each item is a vector of length
            ``embedding_dim``.
        embedding_dim: Length of each word vector.

    Returns:
        A 1-D numpy array of length ``embedding_dim``; all zeros when the
        passage contains no tokens.
    """
    tokens = passage.split()
    if not tokens:
        return np.zeros(embedding_dim)

    rows = np.zeros((len(tokens), embedding_dim))
    for position, token in enumerate(tokens):
        vector_key = word_to_index.get(token.lower())
        if vector_key is None:
            # Out-of-vocabulary token: leave its row as zeros.
            continue
        rows[position] = embeddings_index[vector_key]

    return np.mean(rows, axis=0)

In [None]:
# Number of completed compute_passage_embeddings calls (reported at the end of
# the notebook).
apply_call_count = 0


def compute_passage_embeddings(passage_texts, embedding_type, word_to_index, glove_vectors):
    """Embed every passage in ``passage_texts``.

    Two input shapes are accepted, so the function gives sensible results both
    when it is handed a whole column and when it is applied row by row:

    * an iterable of strings (one row's passages) -> a flat list with one
      embedding per string;
    * an iterable of iterables of strings (many rows) -> a list of lists, one
      inner list of embeddings per row.

    Previously a string item was iterated character by character, producing
    one embedding per character. The per-item progress print was dropped
    because it emitted one line per item on every call.

    Args:
        passage_texts: Passages to embed, in one of the shapes above.
        embedding_type: ``'sif'`` (uses ``sentence_to_sif``) or ``'glove'``
            (uses ``glove_embeddings``).
        word_to_index: Word -> index mapping, used by the ``'glove'`` path.
        glove_vectors: Word vectors passed through to the embedding function.

    Returns:
        A list of embeddings, nested as described above.

    Raises:
        ValueError: If ``embedding_type`` is not ``'sif'`` or ``'glove'``.

    Side effects:
        Increments the module-level ``apply_call_count`` once per call.
    """
    global apply_call_count

    # Resolve the embedding function once, and fail loudly on an unknown type
    # instead of hitting an unbound (or stale) local inside the loop.
    if embedding_type == 'sif':
        def embed(sentence):
            return sentence_to_sif(sentence, glove_vectors)
    elif embedding_type == 'glove':
        def embed(sentence):
            return glove_embeddings(sentence, word_to_index, glove_vectors)
    else:
        raise ValueError(
            f"Unknown embedding_type {embedding_type!r}; expected 'sif' or 'glove'"
        )

    passage_embeddings = []
    for text in passage_texts:
        if isinstance(text, str):
            # A bare string is a single passage: embed it as a whole rather
            # than iterating over its characters.
            passage_embeddings.append(embed(text))
        else:
            passage_embeddings.append([embed(sentence) for sentence in text])

    apply_call_count += 1

    return passage_embeddings


In [None]:
# Step 1: one SIF-weighted embedding per query string. `args` supplies the
# vectors as the second positional argument of sentence_to_sif.
valid_df['query_sif'] = valid_df['query'].apply(sentence_to_sif, args=(glove_vectors,))
print("Step 1 finished")

In [None]:
# Step 2: SIF embeddings for every passage of every row.
# compute_passage_embeddings already loops rows -> passages, so it is given the
# whole column. Routing it through Series.apply handed it one row at a time
# (each row of 'texts' is expected to be a list of passage strings), which made
# its inner loop walk every passage character by character.
valid_df['passage_sif'] = compute_passage_embeddings(valid_df['texts'], 'sif', word_to_index, glove_vectors)

In [None]:
# Step 3: one mean-of-GloVe embedding per query string. `args` supplies the
# vocabulary mapping and the vectors after the query text itself.
valid_df['query_glove'] = valid_df['query'].apply(glove_embeddings, args=(word_to_index, glove_vectors))
print("Step 3 finished")


In [None]:
# Step 4: mean-of-GloVe embeddings for every passage of every row.
# compute_passage_embeddings already loops rows -> passages, so it is given the
# whole column. Routing it through Series.apply handed it one row at a time
# (each row of 'texts' is expected to be a list of passage strings), which made
# its inner loop walk every passage character by character.
valid_df['passage_glove'] = compute_passage_embeddings(valid_df['texts'], 'glove', word_to_index, glove_vectors)
print("Step 4 finished")

# apply_call_count is incremented once per compute_passage_embeddings call.
print("Total number of times apply is called:", apply_call_count)

In [None]:
# Persist the DataFrame, which by now carries the query_sif / passage_sif /
# query_glove / passage_glove columns, as a pickle backup.
with open('../pickle_backups/0608_valid_sifglove.pickle', 'wb') as f:
    pickle.dump(valid_df, f)