In [55]:
!pip install nltk numpy



In [56]:
import nltk
nltk.download('punkt')

from nltk.stem import PorterStemmer
ps = PorterStemmer()

[nltk_data] Downloading package punkt to /home/emmanuel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [57]:
# Load stored term-document matrix and mapping files

import pickle

with open("tdm.pkl", "rb") as f:
    tdm= pickle.load(f)

with open("dic_map.pkl", "rb") as f:
    dic_map = pickle.load(f)

with open("doc_map.pkl", "rb") as f:
    doc_map = pickle.load(f)

In [58]:
import re
import numpy as np

alpha = 0.2  # Threshold value for filtering results

def evaluate(tokens):
    # Initialize an array to hold the frequency of each term in the dictionary
    a = np.zeros(shape=(len(dic_map)))  
    
    # Count the occurrences of each token in the document
    for token in tokens:
        if dic_map.get(token):
            a[dic_map[token]] += 1
    
    # Calculate the L2 norm of the frequency vector or set it to 1 if it's 0 to avoid division by zero
    norm = np.linalg.norm(a) or 1.0
        
    # Normalize the frequency vector
    a *= 1 / norm
    
    # Dictionary to store the cosine similarity of each document with the query
    res = {}
    for key, value in doc_map.items():
        # Calculate the dot product of the normalized frequency vector and the document vector
        dot_prod = a.dot(tdm[value])
        res[key] = dot_prod

    # Calculate the threshold for relevance based on the highest cosine similarity
    threshold = max(res.values()) * alpha

    # Dictionary to store the final relevant documents and their cosine similarity
    final_res = {}
    for key, value in res.items():
        # Filter out documents with cosine similarity below the threshold
        if value >= threshold:
            final_res[key] = value
    
    # Sort the final results by cosine similarity in descending order
    sorted_res = {k: v for k, v in sorted(final_res.items(), key=lambda item: item[1], reverse=True)}
    return sorted_res

def process_vector_query(query):
    # Tokenize the query using regular expression
    tokens = re.findall(r"[\w]+|[^-_\w\s()@#$%^&*+={[\]};,<>./?~`\"]", query)
    
    # Stem and lowercase each token
    tokens = [ps.stem(token.lower()) for token in tokens]
    
    # Evaluate the relevance of documents based on the query
    result = evaluate(tokens)
    return result  # Return relevant documents sorted by relevance

In [69]:
import tkinter as tk
from tkinter import ttk
import re

def display_results(result, treeview):
    # Clear existing results in the treeview
    for item in treeview.get_children():
        treeview.delete(item)

    if result:
        # Display the new results
        for i, (key, value) in enumerate(result.items()):
            data = (key, value)
            treeview.insert(parent='', index=i, values=data)
    # Add to show end of table
    treeview.insert(parent='', index='end', values=('XXX', 'YYY'))

def search_vector():
    query = entry_vector.get()
    result = process_vector_query(query)
    display_results(result, treeview_vector)

# Create the main Tkinter window
root = tk.Tk()
root.title("Vector Query Query Search")

window_height = 500  # Adjust this value as needed
root.geometry(f"600x{window_height}")

frame_vector = ttk.Frame(root)
frame_vector.grid(row=0, column=0, padx=10, pady=10, sticky="nsew")

label_vector = ttk.Label(frame_vector, text="Query:")
label_vector.grid(row=0, column=0, padx=5, pady=5)

entry_vector = ttk.Entry(frame_vector, width=40)
entry_vector.grid(row=0, column=1, padx=5, pady=5)

button_vector = ttk.Button(frame_vector, text="Search", command=search_vector)
button_vector.grid(row=0, column=2, padx=5, pady=5)

frame_treeview = ttk.Frame(frame_vector)
frame_treeview.grid(row=1, column=0, columnspan=3, sticky="nsew")

treeview_vector = ttk.Treeview(frame_treeview, columns=('doc', 'cosine'), show='headings', height=20, width=60)
treeview_vector.heading('doc', text='Document ID')
treeview_vector.heading('cosine', text='Cosine Score')

treeview_vector.grid(row=1, column=0, columnspan=3, sticky="nsew")

frame_treeview.config(height=10, width=50)

# Set row and column weights for resizing
root.grid_rowconfigure(0, weight=1)
root.grid_rowconfigure(1, weight=1)
root.grid_columnconfigure(0, weight=1)

# Start the Tkinter event loop
root.mainloop()

TclError: unknown option "-width"