In [139]:
import nltk
import math
from collections import defaultdict

The load_documents() function builds the 'document_data' dictionary, which maps each document ID (D1 to D6) to the text of the corresponding file.

In [140]:
def load_documents():
    """Read the six corpus files and return their text keyed by document ID.

    The files ``d1.txt`` to ``d6.txt`` are opened relative to the current
    working directory.

    Returns:
        dict: ``{"D1": <content of d1.txt>, ..., "D6": <content of d6.txt>}``.

    Raises:
        OSError: if one of the files cannot be opened or read.
    """
    directory_path = "./"
    file_list = ["/d1.txt", "/d2.txt", "/d3.txt", "/d4.txt","/d5.txt","/d6.txt"]
    document_data = {}

    for idx, filename in enumerate(file_list, 1):
        # The context manager closes the handle even when read() raises.
        with open(directory_path + filename) as file:
            document_data[f'D{idx}'] = file.read()
    return document_data

# Print the document-ID -> text mapping returned by load_documents().
print(load_documents())

{'D1': 'Researchers have successfully applied large language models (LLMs) such as ChatGPT to reranking in an information retrieval context, but to date, such work has mostly been built on proprietary models hidden behind opaque API endpoints. This approach yields experimental results that are not reproducible and non-deterministic, threatening the veracity of outcomes that build on such shaky foundations. To address this significant shortcoming, we present RankVicuna, the first fully open-source LLM capable of performing high-quality listwise reranking in a zero-shot setting. Experimental results on the TREC 2019 and 2020 Deep Learning Tracks show that we can achieve effectiveness comparable to zero-shot reranking with GPT-3.5 with a much smaller 7B parameter model, although our effectiveness remains slightly behind reranking with GPT-4. We hope our work provides the foundation for future research on reranking with modern LLMs.', 'D2': 'With the advent of transformer-based architectur

Descriptor file: preprocessing (tokenization, stop-word removal, stemming), term frequencies and term weights

In [141]:
def preprocess_data(document_data,tokenise,normalise):
    """Tokenize, remove stop words from, and stem every document.

    Args:
        document_data: mapping of document ID to raw text.
        tokenise: ``"Split"`` tokenizes with ``str.split()``; any other value
            uses the NLTK regular-expression tokenizer.
        normalise: ``"Lancaster"`` stems with the Lancaster stemmer; any other
            value uses the Porter stemmer.

    Returns:
        dict: document ID -> list of stemmed terms, in order of appearance
        (repeated terms are kept).
    """
    # Everything below is identical for all documents, so it is built once
    # instead of once per document.
    if tokenise == "Split":
        regexp_tokenizer = None
    else:
        # Raw string: the pattern contains backslash escapes (\. \d \w) that
        # are invalid escape sequences in a normal string literal.
        regexp_tokenizer = nltk.RegexpTokenizer(r'(?:[A-Z]\.)+|\d+(?:\.\d+)?DA?|\w+|\.{3}')

    # A set gives constant-time membership tests for the stop-word filter.
    motsvides = set(nltk.corpus.stopwords.words('english'))

    if normalise == "Lancaster":
        stemmer = nltk.LancasterStemmer()
    else:
        stemmer = nltk.PorterStemmer()

    processed_data = {}
    for doc_id, content in document_data.items():
        # Tokenization
        if regexp_tokenizer is None:
            tokens = content.split()
        else:
            tokens = regexp_tokenizer.tokenize(content)

        # Stop-word removal (compared in lower case) followed by stemming
        processed_data[doc_id] = [
            stemmer.stem(token) for token in tokens if token.lower() not in motsvides
        ]

    return processed_data

In [142]:
def calculate_term_frequencies(processed_data):
    """Count how many times each term occurs in each document.

    Args:
        processed_data: mapping of document ID to its list of terms.

    Returns:
        dict: document ID -> ``defaultdict(int)`` of term -> occurrence count.
    """
    frequencies_by_doc = {}
    for doc_id in processed_data:
        counts = defaultdict(int)
        for term in processed_data[doc_id]:
            counts[term] = counts[term] + 1
        frequencies_by_doc[doc_id] = counts

    return frequencies_by_doc

In [143]:
def calculate_term_weights(term_frequencies, processed_data, num_documents):
    """Compute a TF-IDF style weight for every term of every document.

    The weight of ``term`` in a document is::

        (freq / max_freq_in_document) * ln(num_documents / (1 + df))

    where ``df`` is the number of documents of ``processed_data`` that
    contain the term.

    Args:
        term_frequencies: document ID -> mapping of term -> occurrence count.
        processed_data: document ID -> list of terms.
        num_documents: total number of documents in the collection.

    Returns:
        dict: document ID -> dict of term -> weight. A document without any
        term maps to an empty dict.
    """
    # Document frequency is a property of the term alone, so it is computed
    # once here instead of rescanning the corpus for every (document, term).
    document_frequency = {}
    for terms in processed_data.values():
        for term in set(terms):
            document_frequency[term] = document_frequency.get(term, 0) + 1

    term_weights = {}
    for doc_id in processed_data:
        doc_frequencies = term_frequencies[doc_id]
        # default=0 keeps an empty document from raising ValueError; the
        # value is never used as a divisor because there is no term to weigh.
        max_term_freq = max(doc_frequencies.values(), default=0)
        term_weights[doc_id] = {
            term: (freq / max_term_freq)
            * math.log(num_documents / (1 + document_frequency.get(term, 0)))
            for term, freq in doc_frequencies.items()
        }

    return term_weights

Inverted file (inverted index)

In [144]:
def inverted_index_file(processed_data):
    """Build the term -> documents index together with the term frequencies.

    Args:
        processed_data: document ID -> list of terms.

    Returns:
        tuple: ``(inverted_index, term_frequencies)`` where ``inverted_index``
        is a ``defaultdict(list)`` mapping each term to the IDs of the
        documents containing it (each document listed once, in corpus order)
        and ``term_frequencies`` is the result of
        ``calculate_term_frequencies(processed_data)``.
    """
    inverted_index = defaultdict(list)
    term_frequencies = calculate_term_frequencies(processed_data)

    for doc_id, terms in processed_data.items():
        # dict.fromkeys drops repeated terms while keeping first-occurrence
        # order, so a document appears only once in a term's posting list.
        for term in dict.fromkeys(terms):
            inverted_index[term].append(doc_id)

    return inverted_index, term_frequencies

In [145]:
def descriptorfile(processed_data, term_weights):
    """Format one descriptor line per distinct (document, term) pair.

    Args:
        processed_data: document ID -> list of terms.
        term_weights: document ID -> mapping of term -> weight.

    Returns:
        list[str]: lines of the form
        ``"<doc_id>: <term> - Frequency: <count> - Weight: <weight>"``,
        grouped by document, terms in order of first appearance.
    """
    all_descriptors = []
    term_frequencies = calculate_term_frequencies(processed_data)
    for doc_id, terms in processed_data.items():
        # A repeated term would produce the very same line again, so each
        # term is emitted once (first-occurrence order is preserved).
        for term in dict.fromkeys(terms):
            weight = term_weights[doc_id][term]
            all_descriptors.append(f"{doc_id}: {term} - Frequency: {term_frequencies[doc_id][term]} - Weight: {weight}")

    return all_descriptors
# Disabled example: the code below is a bare string literal and is never executed.
# NOTE(review): if re-enabled, `descriptorfile=descriptorfile(...)` rebinds the
# function name to its result, so the function can no longer be called afterwards;
# store the result under a different name.
"""
document_data = load_documents()  # Load the documents
processed_data = preprocess_data(document_data, 'Split', 'Lancaster')
num_documents = len(document_data)
term_frequencies = calculate_term_frequencies(processed_data)
term_weights = calculate_term_weights(term_frequencies, processed_data, num_documents)
descriptorfile=descriptorfile(processed_data,term_weights)
print(descriptorfile)
"""

"\ndocument_data = load_documents()  # Load the documents\nprocessed_data = preprocess_data(document_data, 'Split', 'Lancaster')\nnum_documents = len(document_data)\nterm_frequencies = calculate_term_frequencies(processed_data)\nterm_weights = calculate_term_weights(term_frequencies, processed_data, num_documents)\ndescriptorfile=descriptorfile(processed_data,term_weights)\nprint(descriptorfile)\n"

In [146]:
# Disabled pipeline: the code below is a bare string literal and is never executed.
# NOTE(review): preprocess_data() is called here with a single argument, but its
# definition also requires `tokenise` and `normalise`; this would raise TypeError
# if re-enabled as written.
'''document_data=load_documents()
processed_data = preprocess_data(document_data)
num_documents = len(document_data)
term_frequencies = calculate_term_frequencies(processed_data)
term_weights = calculate_term_weights(term_frequencies, processed_data, num_documents)'''


'document_data=load_documents()\nprocessed_data = preprocess_data(document_data)\nnum_documents = len(document_data)\nterm_frequencies = calculate_term_frequencies(processed_data)\nterm_weights = calculate_term_weights(term_frequencies, processed_data, num_documents)'

Create the descriptor file and the inverted-index file, and save them to disk

In [147]:
# Disabled file export: the code below is a bare string literal and is never executed.
# NOTE(review): it calls create_inverted_index(), which is not defined in the visible
# cells (inverted_index_file() returns the same (index, frequencies) pair - confirm
# that is the intended function). It also expects `processed_data` and `term_weights`
# to exist already.
'''document_data=load_documents()
# Create a descriptor file for all documents
descriptor_filename = "all_descriptors.txt"
all_descriptors = descriptorfile(processed_data, term_weights)
with open(descriptor_filename, "w") as file:
    for descriptor in all_descriptors:
        file.write(f"{descriptor}\n")
print(f"Descriptor file created for all documents: {descriptor_filename}")
#inverted 
inverted_index, term_frequencies = create_inverted_index(processed_data)

with open("inverted_index.txt", "w") as text_file:
    for term, doc_list in inverted_index.items():
        text_file.write(f"{term}:\n")
        for doc_id in doc_list:
            term_freq = term_frequencies.get(doc_id, {}).get(term, 0)
            term_weight = term_weights.get(doc_id, {}).get(term, 0.0)
            text_file.write(f"  {doc_id}: Frequency: {term_freq} - Weight: {term_weight}\n")


print("Inverted index with term weights saved to 'inverted_index.txt.'")'''

'document_data=load_documents()\n# Create a descriptor file for all documents\ndescriptor_filename = "all_descriptors.txt"\nall_descriptors = descriptorfile(processed_data, term_weights)\nwith open(descriptor_filename, "w") as file:\n    for descriptor in all_descriptors:\n        file.write(f"{descriptor}\n")\nprint(f"Descriptor file created for all documents: {descriptor_filename}")\n#inverted \ninverted_index, term_frequencies = create_inverted_index(processed_data)\n\nwith open("inverted_index.txt", "w") as text_file:\n    for term, doc_list in inverted_index.items():\n        text_file.write(f"{term}:\n")\n        for doc_id in doc_list:\n            term_freq = term_frequencies.get(doc_id, {}).get(term, 0)\n            term_weight = term_weights.get(doc_id, {}).get(term, 0.0)\n            text_file.write(f"  {doc_id}: Frequency: {term_freq} - Weight: {term_weight}\n")\n\n\nprint("Inverted index with term weights saved to \'inverted_index.txt.\'")'

In [148]:
from collections import defaultdict
def descriptor(processed_data, term_weights, term_frequencies):
    """Build the descriptor records: one tuple per distinct term of each document.

    Args:
        processed_data: document ID -> list of terms.
        term_weights: document ID -> mapping of term -> weight.
        term_frequencies: document ID -> mapping of term -> occurrence count.

    Returns:
        list[tuple]: ``(doc_id, term, frequency, weight)`` tuples, grouped by
        document, terms in order of first appearance.
    """
    all_descriptors = []
    for doc_id, terms in processed_data.items():
        # A term repeated in the document would yield an identical tuple each
        # time, so it is emitted once (first-occurrence order is preserved).
        for term in dict.fromkeys(terms):
            weight = term_weights[doc_id][term]
            all_descriptors.append((doc_id, term, term_frequencies[doc_id][term], weight))

    return all_descriptors


def inverted_index(processed_data, term_frequencies, term_weights):
    """Build the inverted index: term -> records of the documents containing it.

    Args:
        processed_data: document ID -> list of terms.
        term_frequencies: document ID -> mapping of term -> occurrence count.
        term_weights: document ID -> mapping of term -> weight.

    Returns:
        defaultdict(list): term -> list of ``(term, doc_id, frequency, weight)``
        tuples, one per document containing the term. A missing frequency
        defaults to ``0`` and a missing weight to ``0.0``.
    """
    # Named `postings` rather than `inverted_index` so the local does not
    # shadow this function's own name.
    postings = defaultdict(list)

    for doc_id, terms in processed_data.items():
        # Each distinct term is recorded once per document; repeats would
        # append the same tuple again.
        for term in dict.fromkeys(terms):
            term_freq = term_frequencies.get(doc_id, {}).get(term, 0)
            term_weight = term_weights.get(doc_id, {}).get(term, 0.0)
            postings[term].append((term, doc_id, term_freq, term_weight))

    return postings
#desc=descriptor(processed_data, term_weights, term_frequencies)
#inv=inverted_index(processed_data, term_frequencies, term_weights)
def get_terms_by_doc_id(all_descriptors, target_doc_id):
    """Return the descriptor records whose first field equals ``target_doc_id``.

    Args:
        all_descriptors: iterable of indexable records with the document ID
            at position 0.
        target_doc_id: document ID to keep.

    Returns:
        list: the matching records, in their original order.
    """
    matches = []
    for record in all_descriptors:
        if record[0] == target_doc_id:
            matches.append(record)
    return matches




def get_terms_by_term_inverted(inverted_index, target_term):
    """Return a new list with the index records stored under ``target_term``.

    Args:
        inverted_index: mapping of term -> list of records.
        target_term: term to look up.

    Returns:
        list: a copy of the records for the term, or ``[]`` when the term is
        not in the index.
    """
    return list(inverted_index.get(target_term, []))





# Build the pipeline explicitly so this cell runs on a fresh kernel instead of
# relying on variables left over from earlier sessions.
document_data = load_documents()
processed_data = preprocess_data(document_data, "Split", "Lancaster")
num_documents = len(document_data)
term_frequencies = calculate_term_frequencies(processed_data)
term_weights = calculate_term_weights(term_frequencies, processed_data, num_documents)

# Keep the result under its own name: assigning it to `inverted_index` would
# replace the function and make every later call to inverted_index() fail.
inverted_index_data = inverted_index(processed_data, term_frequencies, term_weights)
target_doc_id = "D1"  # Replace with the doc_id you want to filter by

# Index records are (term, doc_id, frequency, weight); keep those of the target document.
terms_with_doc_id = [
    record
    for records in inverted_index_data.values()
    for record in records
    if record[1] == target_doc_id
]
for term_tuple in terms_with_doc_id:
    print(term_tuple)



    # Example usage of descriptor() + get_terms_by_doc_id().
    # Disabled: the code below is a bare string literal and is never executed.
'''
all_descriptors = descriptor(processed_data, term_weights, term_frequencies)
target_doc_id = "D1"  # Replace with the doc_id you want to filter by
terms_with_doc_id = get_terms_by_doc_id(all_descriptors, target_doc_id)

for term_tuple in terms_with_doc_id:
    print(term_tuple)'''

In [172]:
import tkinter as tk
from tkinter import ttk
from collections import defaultdict

# Define a function to handle the retrieval and display
def retrieve_and_display():
    """Run the search described by the GUI fields and show the matches.

    Reads the query, tokenization method, stemming method and index type from
    the Tk variables, rebuilds the term statistics with the selected methods,
    and writes the matching tuples into ``result_text``.

    The query may contain several comma-separated keys: terms when the index
    type is ``"DOCS per TERM"``, document IDs when it is ``"TERMS per DOC"``.
    Any other index type leaves the result area untouched.
    """
    selected_docs = doc_var.get()
    tokenization_method = tokenization_method_var.get()
    stemming_method = stemming_method_var.get()
    output_type = output_type_var.get()

    # Split the query into individual keys, ignoring blanks around commas
    selected_doc_ids = [key.strip() for key in selected_docs.split(",") if key.strip()]

    # The corpus is re-processed on every search so that the tokenization and
    # stemming methods chosen in the GUI actually take effect.
    documents = load_documents()
    processed = preprocess_data(documents, tokenization_method, stemming_method)
    frequencies = calculate_term_frequencies(processed)
    weights = calculate_term_weights(frequencies, processed, len(documents))

    # Load the relevant data (inverted index or descriptors)
    if output_type == "DOCS per TERM":
        relevant_data = inverted_index(processed, frequencies, weights)
        matches = [
            record
            for key in selected_doc_ids
            for record in get_terms_by_term_inverted(relevant_data, key)
        ]
        empty_message = "No document for this term.\n"
    elif output_type == "TERMS per DOC":
        relevant_data = descriptor(processed, weights, frequencies)
        matches = [
            record
            for key in selected_doc_ids
            for record in get_terms_by_doc_id(relevant_data, key)
        ]
        empty_message = "No term for this document.\n"
    else:
        return

    # Display the selected data
    result_text.delete(1.0, tk.END)  # Clear previous results
    if matches:
        for record in matches:
            result_text.insert(tk.END, f"{record}\n")
    else:
        result_text.insert(tk.END, empty_message)

# Build the Tkinter window. The StringVars and the text widget created below are
# the module-level names that retrieve_and_display() reads and writes.


# Create the main application window
app = tk.Tk()
app.title("Document Retrieval")

# Query field: its text is read through doc_var when "Search" is pressed
doc_label = ttk.Label(app, text="Query:")
doc_var = tk.StringVar()
doc_entry = ttk.Entry(app, textvariable=doc_var,width=40)
doc_label.grid(row=0, column=0, padx=5, pady=5)
doc_entry.grid(row=0, column=1, padx=5, pady=5)  # Entry goes to the right of its label

# Tokenization method selector (initial value "Split")
tokenization_label = ttk.Label(app, text="Select Tokenization Method:")
tokenization_method_var = tk.StringVar()
tokenization_method_var.set("Split")
tokenization_method_combobox = ttk.Combobox(app, textvariable=tokenization_method_var, values=["Split", "Tokenize"])

# Stemming method selector (initial value "Lancaster")
stemming_label = ttk.Label(app, text="Select Stemming Method:")
stemming_method_var = tk.StringVar()
stemming_method_var.set("Lancaster")
stemming_method_combobox = ttk.Combobox(app, textvariable=stemming_method_var, values=["Lancaster", "Porter"])

# Index type selector; retrieve_and_display() branches on these two exact strings
output_label = ttk.Label(app, text="Index:")
output_type_var = tk.StringVar()
output_type_var.set("DOCS per TERM")
output_type_combobox = ttk.Combobox(app, textvariable=output_type_var, values=["DOCS per TERM", "TERMS per DOC"])

# The button runs retrieve_and_display() on click
retrieve_button = ttk.Button(app, text="Search", command=retrieve_and_display)

# Create a text widget for displaying the results
result_text = tk.Text(app, width=90, height=30)

# Arrange the widgets on the GUI: one row per setting, then the button and the results
tokenization_label.grid(row=1, column=0, padx=5, pady=5)
tokenization_method_combobox.grid(row=1, column=1, padx=5, pady=5)
stemming_label.grid(row=2, column=0, padx=5, pady=5)
stemming_method_combobox.grid(row=2, column=1, padx=5, pady=5)
output_label.grid(row=3, column=0, padx=5, pady=5)
output_type_combobox.grid(row=3, column=1, padx=5, pady=5)
retrieve_button.grid(row=4, column=0, columnspan=2, padx=5, pady=10)
result_text.grid(row=5, column=0, columnspan=2, padx=5, pady=5)

# Start the GUI application (blocks until the window is closed)
app.mainloop()
