In [53]:
import pickle
import math
from collections import Counter
import ir_datasets
import nltk

# Restore the previously pickled vocabulary from disk.
# NOTE(review): unpickling can run arbitrary code, so only load a file you trust.
with open('final_vocabulary.pkl', 'rb') as vocabulary_file:
    loaded_vocabulary = pickle.load(vocabulary_file)

# Open the corpus and record the number of documents it reports
# (used as N in the IDF computation further down).
dataset = ir_datasets.load("beir/webis-touche2020/v2")
total_documents = dataset.docs_count()

# Document frequencies already computed against the default corpus, keyed by
# term. Scanning every document is expensive, so each term is counted once.
_DF_CACHE = {}


def calculate_df(term, docs=None):
    """Return the number of documents that contain ``term`` as a token.

    A document counts when ``term`` equals one of the whitespace-separated
    tokens of its ``text`` attribute. Matching is exact: case and any attached
    punctuation are significant, and substrings do not count.

    Args:
        term: The token to look for.
        docs: Optional iterable of objects exposing a ``text`` string. When
            omitted, the module-level ``dataset`` is scanned via
            ``dataset.docs_iter()`` and the result is memoised, so repeated
            calls for the same term do not rescan the corpus.

    Returns:
        The document frequency of ``term`` as an ``int`` (0 if it never occurs).

    Note:
        The memoised values assume ``dataset`` is not swapped for a different
        corpus after the first call; re-running this cell clears the cache.
    """
    if docs is not None:
        # Caller-supplied documents are counted directly and never cached.
        return sum(1 for doc in docs if term in doc.text.split())

    if term not in _DF_CACHE:
        _DF_CACHE[term] = sum(
            1 for doc in dataset.docs_iter() if term in doc.text.split()
        )
    return _DF_CACHE[term]

def get_ranking(sample_sentence):
    """Print the top-ranked terms of a sentence under three TF-IDF weightings.

    The sentence is split on whitespace; tokens are looked up exactly as split
    (case and attached punctuation included), and only tokens present in
    ``loaded_vocabulary`` are scored. For each scheme the ten highest-scoring
    terms are printed, ties keeping their order of first appearance.

    Schemes (IDF is ``log(total_documents / (1 + df))`` in all of them):
        * ``'standard'``:          ``tf * idf``
        * ``'log_max'``:           ``(1 + log(tf)) * idf``
        * ``'double_norm_prob'``:  ``(0.5 + 0.5 * tf / max_tf) * idf``

    where ``tf`` is the token's count in the sentence and ``max_tf`` is the
    highest count of any token in the sentence (in vocabulary or not).

    Args:
        sample_sentence: The text to analyse.

    Returns:
        None. Results are written to standard output.
    """
    # List of weighting schemes
    weighting_schemes = ['standard', 'log_max', 'double_norm_prob']
    top_n = 10

    # Tokenize once and count every token in a single pass.
    terms = sample_sentence.split()
    term_counts = Counter(terms)
    max_term_frequency = max(term_counts.values(), default=1)

    # The IDF of a term does not depend on the weighting scheme, and each
    # document-frequency lookup scans the corpus, so compute it once per
    # distinct in-vocabulary term. Counter keeps first-appearance order.
    idf_by_term = {}
    for term in term_counts:
        if term in loaded_vocabulary:
            document_frequency = calculate_df(term)
            idf_by_term[term] = math.log(total_documents / (1 + document_frequency))

    for weighting_scheme in weighting_schemes:
        print(f"Weighting Scheme: {weighting_scheme}\n")

        tfidf_scores = {}
        for term, inverse_document_frequency in idf_by_term.items():
            term_frequency = term_counts[term]

            if weighting_scheme == 'standard':
                tf_weight = term_frequency
            elif weighting_scheme == 'log_max':
                tf_weight = 1 + math.log(term_frequency)
            else:  # 'double_norm_prob'
                tf_weight = 0.5 + 0.5 * term_frequency / max_term_frequency

            tfidf_scores[term] = tf_weight * inverse_document_frequency

        # Rank terms based on their TF-IDF scores (highest first; stable on ties)
        sorted_terms = sorted(tfidf_scores.items(), key=lambda item: item[1], reverse=True)

        for rank, (term, score) in enumerate(sorted_terms[:top_n], start=1):
            print(f"Rank {rank}: Term '{term}', TF-IDF Score: {score}")

        print("\n")

In [54]:
# Sanity check: show the corpus size obtained from dataset.docs_count()
print(total_documents)

382545


In [56]:
# Preview a small window of (document, query) pairs and rank each query's terms.
# NOTE(review): zip() pairs the i-th document with the i-th query purely by
# iteration order; nothing in this cell shows the two are related. Confirm
# that this pairing is intended.
PAIRS_TO_SKIP = 4  # leading pairs to pass over
PAIRS_TO_SHOW = 4  # pairs to display after the skipped ones

for pair_index, (doc, query) in enumerate(zip(dataset.docs_iter(), dataset.queries_iter())):
    if pair_index < PAIRS_TO_SKIP:
        continue
    # Stop once the window is exhausted. The previous counter-based check
    # (`ctr == 4` after the counter had already passed 4) could never fire,
    # so the loop ran through every remaining pair.
    if pair_index >= PAIRS_TO_SKIP + PAIRS_TO_SHOW:
        break

    print('-------------------')
    # print the first 500 characters of the doc text
    print("Doc: ", doc.text[:500], "...\n")
    print("Query: ", query.text, "\n")

    get_ranking(query.text)

-------------------

Query:  Should social security be privatized? 

Weighting Scheme: standard

Rank 1: Term 'social', TF-IDF Score: 3.1401581224063317
Rank 2: Term 'be', TF-IDF Score: 0.5594517683977795


Weighting Scheme: log_max

Rank 1: Term 'social', TF-IDF Score: 3.1401581224063317
Rank 2: Term 'be', TF-IDF Score: 0.5594517683977795


Weighting Scheme: double_norm_prob

Rank 1: Term 'social', TF-IDF Score: 3.1401581224063317
Rank 2: Term 'be', TF-IDF Score: 0.5594517683977795


-------------------
Doc:  First of all we invented amazing things like WiFi, Google Maps, Polymer bank notes (if you are American and do not know what they are, they are plastic WATERPROOF bills), Ultrasound scanners, stainless steel braces and many more things. Why put us into the shadow if we have made such amazing things the whole world uses nowadays! I bet you have used at the very least ONE thing I put up there unless you are on a Ethernet cable still using those old paper maps wherever you go! There