In [2]:
# NLTK provides the Porter stemmer used to normalise query terms below.
import nltk
# Fetch the 'punkt' resource into the local nltk_data directory.
# NOTE(review): nothing visible in this notebook uses punkt directly (only
# PorterStemmer is used) -- confirm whether this download is still required.
nltk.download('punkt')

from nltk.stem import PorterStemmer
# Module-level stemmer; process_vector_query calls ps.stem() on every query token.
ps = PorterStemmer()

[nltk_data] Downloading package punkt to /home/emmanuel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import pickle

def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # SECURITY: pickle.load can execute arbitrary code; only load files you created
    # yourself or otherwise trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Index artefacts consumed by evaluate():
#   tdm     -- indexed by the values of doc_map to obtain one document's weights
#   dic_map -- maps a stemmed term to its position in a weight vector
#   doc_map -- maps a document id to its position in tdm
tdm = _load_pickle("tdm.pkl")
dic_map = _load_pickle("dic_map.pkl")
doc_map = _load_pickle("doc_map.pkl")

In [4]:
import re
import numpy as np
def evaluate(tokens, threshold=0.025):
    """Rank documents against a bag of query tokens.

    A term-count vector is built for ``tokens`` over the vocabulary in the
    module-level ``dic_map``, scaled to unit L2 norm, and every document in
    ``doc_map`` is scored by the dot product of that vector with the
    document's entry in ``tdm``.
    NOTE(review): presumably the rows of ``tdm`` are already normalised so the
    score is a cosine similarity -- confirm against the code that builds it.

    Args:
        tokens: Iterable of query terms, already normalised the same way as
            the keys of ``dic_map``. Terms not present in ``dic_map`` are
            ignored.
        threshold: A document is kept only if its score is strictly greater
            than this value. Defaults to 0.025, the cut-off that used to be
            hard-coded.

    Returns:
        list: Keys of ``doc_map`` for the retained documents, ordered from the
        highest score to the lowest.
    """
    query_vec = np.zeros(shape=(len(dic_map)))
    for token in tokens:
        # Test membership, not the truthiness of the looked-up index: the term
        # stored at index 0 is a valid hit and must not be skipped.
        if token in dic_map:
            query_vec[dic_map[token]] += 1

    # An all-zero vector (no token in the vocabulary) has norm 0; fall back to
    # 1.0 so the scaling below is a no-op instead of a division by zero.
    norm = np.linalg.norm(query_vec) or 1.0
    query_vec *= 1 / norm

    res = {}
    for key, value in doc_map.items():
        dot_prod = query_vec.dot(tdm[value])
        if dot_prod > threshold:
            res[key] = dot_prod
    # Kept from the original: the notebook's recorded output shows both dicts.
    print(res)
    # Sort ascending and reverse afterwards (rather than sorting descending) so
    # that documents with equal scores keep the same relative order as before.
    sorted_res = dict(sorted(res.items(), key=lambda item: item[1]))
    print(sorted_res)
    return list(sorted_res)[::-1]
def process_vector_query(query):
    """Tokenise and stem a free-text query, then rank documents with ``evaluate``.

    A token is either a run of word characters or a single character that is
    not a word character, not whitespace and not one of the punctuation marks
    excluded by the pattern below. Each token is lower-cased and passed through
    the module-level stemmer ``ps`` before scoring.

    Args:
        query: The raw query string.

    Returns:
        Whatever ``evaluate`` returns for the stemmed tokens.
    """
    token_pattern = r"[\w]+|[^-_\w\s()@#$%^&*+={[\]};,<>./?~`\"]"
    stems = []
    for raw_token in re.findall(token_pattern, query):
        stems.append(ps.stem(raw_token.lower()))
    return evaluate(stems)

# Example run: show the ranked document ids for a sample query.
ranked_docs = process_vector_query("machine learning")
print(ranked_docs)

{1: 0.029152196613819372, 2: 0.03550771866470323, 3: 0.029687854137701433, 7: 0.036410612647920615, 16: 0.042826507091961785}
{1: 0.029152196613819372, 3: 0.029687854137701433, 2: 0.03550771866470323, 7: 0.036410612647920615, 16: 0.042826507091961785}
[16, 7, 2, 3, 1]


In [5]:
# For each document returned by the sample query (best first), print the sum of
# its stored weights for the two stemmed query terms.
machine_col = dic_map[ps.stem('machine')]
learning_col = dic_map[ps.stem('learning')]
for doc_id in (16, 7, 2, 3, 1):
    doc_row = tdm[doc_map[doc_id]]
    print(doc_row[machine_col] + doc_row[learning_col])

0.0605658271585199
0.051492382221002685
0.050215497304551604
0.041984965959291576
0.04122743182423037


In [6]:
import tkinter as tk
from tkinter import ttk
import re

def display_results(result, treeview):
    """Replace the rows of ``treeview`` with one row per entry of ``result``.

    Args:
        result: Sequence of document identifiers in display order. ``None`` or
            an empty sequence produces a table holding only the end marker.
        treeview: A ``ttk.Treeview``-like widget exposing ``get_children``,
            ``delete`` and ``insert``.

    A final ``XXX`` row is always appended to mark the end of the table.
    """
    # Clear existing results in the treeview
    for item in treeview.get_children():
        treeview.delete(item)

    # Display the new results. ``values`` must be a real one-element tuple:
    # ``(doc)`` is just ``doc``, and a bare string is parsed as a Tcl list, so
    # an identifier containing whitespace would be split across columns.
    for position, doc in enumerate(result or ()):
        treeview.insert(parent='', index=position, values=(doc,))

    # Add to show end of table
    treeview.insert(parent='', index='end', values=('XXX',))

def search_vector():
    """Search-button callback: run the query typed in ``entry_vector`` and show the hits.

    Reads the module-level ``entry_vector`` widget, ranks documents with
    ``process_vector_query`` and renders them into ``treeview_vector`` via
    ``display_results``.
    """
    display_results(process_vector_query(entry_vector.get()), treeview_vector)

# Create the main Tkinter window
root = tk.Tk()
root.title("Vector Query Search")

# Single frame holding the query row (label, entry, button) and the results table
frame_vector = ttk.Frame(root)
frame_vector.grid(row=0, column=0, padx=10, pady=10, sticky="nsew")

label_vector = ttk.Label(frame_vector, text="Query:")
label_vector.grid(row=0, column=0, padx=5, pady=5)

# sticky="ew" lets the entry use the extra width given to its column below
entry_vector = ttk.Entry(frame_vector, width=30)
entry_vector.grid(row=0, column=1, padx=5, pady=5, sticky="ew")

button_vector = ttk.Button(frame_vector, text="Search", command=search_vector)
button_vector.grid(row=0, column=2, padx=5, pady=5)

# columns must be a sequence of column ids; ('doc',) is a real one-element tuple
treeview_vector = ttk.Treeview(frame_vector, columns=('doc',), show='headings')
treeview_vector.heading('doc', text='Document ID')
treeview_vector.grid(row=1, column=0, columnspan=3, sticky="nsew")

# Set row and column weights for resizing.
# The root window has a single cell (the frame), so it gets all the extra space.
# The widgets live inside the frame, so the frame must pass that space on: extra
# height goes to the table row, extra width to the entry column.
root.grid_rowconfigure(0, weight=1)
root.grid_columnconfigure(0, weight=1)
frame_vector.grid_rowconfigure(1, weight=1)
frame_vector.grid_columnconfigure(1, weight=1)

# Start the Tkinter event loop (blocks this cell until the window is closed)
root.mainloop()

{1: 0.029152196613819372, 2: 0.03550771866470323, 3: 0.029687854137701433, 7: 0.036410612647920615, 16: 0.042826507091961785}
{1: 0.029152196613819372, 3: 0.029687854137701433, 2: 0.03550771866470323, 7: 0.036410612647920615, 16: 0.042826507091961785}
{12: 0.027568200128642383}
{12: 0.027568200128642383}
{1: 0.029152196613819372, 2: 0.03550771866470323, 3: 0.029687854137701433, 7: 0.036410612647920615, 16: 0.042826507091961785}
{1: 0.029152196613819372, 3: 0.029687854137701433, 2: 0.03550771866470323, 7: 0.036410612647920615, 16: 0.042826507091961785}
