In [96]:
import pickle
import math
from collections import Counter
import ir_datasets
import nltk
import re
import string

# Fetch the NLTK 'stopwords' and 'punkt' packages (one download call each).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Read the pickled vocabulary from disk.
# NOTE(review): unpickling can execute arbitrary code; only load a file you
# produced yourself or otherwise trust.
with open('final_vocabulary.pkl', 'rb') as file:
    loaded_vocabulary = pickle.load(file)

# Open the corpus and record how many documents it reports.
dataset = ir_datasets.load("beir/webis-touche2020/v2")
total_documents = dataset.docs_count()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\codei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\codei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [97]:
def pre_process_doc(query):
    """Normalize raw text and return its stop-word-filtered tokens.

    The text is passed, in order, through ``text_lowercase``,
    ``remove_non_alpha``, ``remove_num``, ``remove_punc`` and
    ``remove_whitespace``; the cleaned string is then handed to
    ``remove_stopwords`` and that function's result is returned.

    Args:
        query: The text to clean (a query or a document body).

    Returns:
        Whatever ``remove_stopwords`` returns for the cleaned text.
    """
    string_steps = (
        text_lowercase,
        remove_non_alpha,
        remove_num,
        remove_punc,
        remove_whitespace,
    )
    cleaned = query
    for step in string_steps:
        cleaned = step(cleaned)
    return remove_stopwords(cleaned)
def text_lowercase(text):
    """Return ``text`` converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

def remove_non_alpha(text):
    """Keep only alphanumeric characters and plain spaces.

    Digits survive this step. Every other character is deleted outright,
    including tabs and newlines, so words separated only by such characters
    end up joined together.
    """
    return "".join(ch for ch in text if ch.isalnum() or ch == " ")

def remove_num(text):
    """Return ``text`` with every run of digits deleted."""
    return re.sub(r'\d+', '', text)

def remove_punc(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_whitespace(text):
    """Collapse each whitespace run to one space and trim both ends."""
    words = text.split()
    return " ".join(words)

def remove_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    Args:
        text: The (already cleaned) string to tokenize.

    Returns:
        A list of the tokens produced by ``word_tokenize`` that are not in
        NLTK's English stop-word list, in their original order.
    """
    # Imported here because the notebook's import cell only does
    # ``import nltk``; without these two lines ``stopwords`` and
    # ``word_tokenize`` are undefined names on a fresh kernel.
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    stop_words = set(stopwords.words("english"))
    return [word for word in word_tokenize(text) if word not in stop_words]


In [98]:

# Lazily built map: whitespace-separated token -> number of documents that
# contain it. Filled by the first calculate_df() call, reused afterwards.
_document_frequencies = None


def _build_document_frequencies():
    """Scan the corpus once and count, per token, the documents containing it."""
    frequencies = Counter()
    for doc in dataset.docs_iter():
        # set() so a token repeated inside one document is counted once.
        frequencies.update(set(doc.text.split()))
    return frequencies


def calculate_df(term):
    """Return the number of documents in ``dataset`` that contain ``term``.

    A document contains ``term`` when ``term`` equals one of the tokens
    obtained from ``doc.text.split()``. The first call walks the corpus once
    and indexes every token; later calls are dictionary lookups instead of a
    fresh corpus scan per term.

    NOTE(review): matching is against raw whitespace tokens (case and
    punctuation intact), while callers pass terms that went through
    ``pre_process_doc`` - confirm whether documents should be normalized the
    same way before counting.
    NOTE(review): the index keeps every distinct raw token of the corpus in
    memory.

    Args:
        term: The token to look up.

    Returns:
        The document frequency as an int (0 when no document contains it).
    """
    global _document_frequencies
    if _document_frequencies is None:
        _document_frequencies = _build_document_frequencies()
    # Counter returns 0 for unseen keys without inserting them.
    return _document_frequencies[term]

def get_tfidf(sentence, weighting_scheme):
    """Compute TF-IDF weights for the in-vocabulary terms of ``sentence``.

    The text is tokenized with ``pre_process_doc``. For every distinct token
    found in ``loaded_vocabulary`` the weight is ``tf_weight * idf`` with
    ``idf = log(total_documents / (1 + df))`` (``df`` from ``calculate_df``)
    and ``tf_weight`` chosen by ``weighting_scheme``:

    * ``'standard'``: ``tf``
    * ``'log_max'``: ``1 + log(tf)``
    * ``'double_norm_prob'``: ``0.5 + 0.5 * tf / max_tf``, where ``max_tf``
      is the highest count of any token in the text

    Note that ``idf`` is zero or negative once ``1 + df >= total_documents``.

    Args:
        sentence: Raw text of a query or document.
        weighting_scheme: One of ``'standard'``, ``'log_max'`` or
            ``'double_norm_prob'``.

    Returns:
        A dict mapping each in-vocabulary term to its TF-IDF weight; empty
        when the text yields no in-vocabulary terms.

    Raises:
        ValueError: If ``weighting_scheme`` is not one of the names above.
    """
    if weighting_scheme not in ('standard', 'log_max', 'double_norm_prob'):
        raise ValueError("Unknown weighting scheme: {!r}".format(weighting_scheme))

    # Count each token once up front instead of calling list.count per term.
    term_counts = Counter(pre_process_doc(sentence))
    if not term_counts:
        return {}
    max_term_frequency = max(term_counts.values())

    tfidf_scores = {}
    # Iterating distinct terms gives every term exactly one entry, so a
    # repeated token is not weighted twice.
    for term, term_frequency in term_counts.items():
        if term not in loaded_vocabulary:
            continue

        document_frequency = calculate_df(term)
        inverse_document_frequency = math.log(total_documents / (1 + document_frequency))

        if weighting_scheme == 'standard':
            tf_weight = term_frequency
        elif weighting_scheme == 'log_max':
            tf_weight = 1 + math.log(term_frequency)
        else:  # 'double_norm_prob'
            tf_weight = 0.5 + 0.5 * term_frequency / max_term_frequency

        tfidf_scores[term] = tf_weight * inverse_document_frequency

    return tfidf_scores

In [100]:
from sklearn.metrics.pairwise import cosine_similarity
def sparse_cosine_similarity(vec_a, vec_b):
    """Cosine similarity of two sparse vectors given as term -> weight mappings.

    Returns 0.0 when either vector is empty or has zero length, so callers
    never divide by zero.
    """
    norm_a = math.sqrt(sum(weight * weight for weight in vec_a.values()))
    norm_b = math.sqrt(sum(weight * weight for weight in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # Walk the shorter mapping; terms missing from the other side contribute 0.
    if len(vec_b) < len(vec_a):
        vec_a, vec_b = vec_b, vec_a
    dot_product = sum(weight * vec_b.get(term, 0.0) for term, weight in vec_a.items())
    return dot_product / (norm_a * norm_b)


# Score every document against every query under each weighting scheme.
# get_tfidf is expected to return a mapping of term -> weight.
# Each row is [doc, cos_sim, query, scheme]; positions 0 and 1 keep the
# document and its score, the last two record which run produced the row.
# NOTE(review): this re-vectorizes the whole corpus for every query/scheme
# pair and keeps one row per pairing in memory - consider caching document
# vectors or keeping only the best rows per query.
weighting_schemes = ['standard', 'log_max', 'double_norm_prob']
result = []
for query in dataset.queries_iter():
    print('-------------------')
    print("Query: ", query.text, "\n")
    for scheme in weighting_schemes:
        # The query vector does not depend on the document: build it once.
        query_vec = get_tfidf(query.text, scheme)
        for doc in dataset.docs_iter():
            doc_vec = get_tfidf(doc.text, scheme)
            # Scoring and recording must happen per document, inside the loop.
            cos_sim = sparse_cosine_similarity(doc_vec, query_vec)
            result.append([doc, cos_sim, query, scheme])

-------------------
Query:  Should teachers get tenure? 



KeyboardInterrupt: 

In [None]:
# Order the scored entries from highest to lowest similarity (score is at index 1).
result = sorted(result, key=lambda scored: scored[1], reverse=True)

In [None]:
import pandas as pd

# Tabulate the entries of ``result``, one row per entry; columns are numbered from 0.
res2 = pd.DataFrame(data=result)

In [None]:
# Show up to five documents from the top of the table (column 0 holds them).
# head(5) copes with fewer than five rows; an empty frame has no column 0.
top_docs = res2[0].head(5) if not res2.empty else []
for rank, doc in enumerate(top_docs, start=1):
    # format() stringifies the document; "+" between a str and a non-str
    # object raises TypeError.
    print("doc {}:{}".format(rank, doc))