In [20]:
import os
import time
from collections import defaultdict
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tracemalloc

class InformationRetrievalSystem:
    def __init__(self, directory):
        self.directory = directory
        self.documents = {}  # Stores document content
        self.boolean_index = defaultdict(set)  # Index for boolean search
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = None  # TF-IDF matrix for vector search
        self.doc_ids = []  # List of document filenames
        self.load_documents()
        self.build_indexes()

    def load_documents(self):
        """Loads documents from the directory and stores their content."""
        for filename in os.listdir(self.directory):
            filepath = os.path.join(self.directory, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read().lower()
                self.documents[filename] = content

    def build_indexes(self):
        """Builds the boolean and TF-IDF indexes from documents."""
        self.doc_ids = list(self.documents.keys())
        
        # Build boolean index
        for doc_id, text in self.documents.items():
            words = set(text.split())
            for word in words:
                self.boolean_index[word].add(doc_id)
        
        # Compute TF-IDF matrix
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents.values())

    def boolean_search(self, query):
        """Performs boolean search using AND, OR, NOT operators."""
        tokens = re.split(r'\s+', query.upper())
        result_set = None
        current_op = "AND"

        for token in tokens:
            if token in {"AND", "OR", "NOT"}:
                current_op = token
            else:
                matching_docs = self.boolean_index.get(token.lower(), set())
                if result_set is None:
                    result_set = matching_docs
                else:
                    if current_op == "AND":
                        result_set &= matching_docs
                    elif current_op == "OR":
                        result_set |= matching_docs
                    elif current_op == "NOT":
                        result_set -= matching_docs
        
        return result_set if result_set is not None else set()

    def exact_match_search(self, query):
        """Finds documents that contain the exact query string."""
        return [doc_id for doc_id, text in self.documents.items() if query.lower() in text]

    def vector_search(self, query, top_n=10):
        """Finds documents most similar to the query using TF-IDF and cosine similarity."""
        query_vector = self.vectorizer.transform([query])  # Convert query to TF-IDF vector
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]  # Compute cosine similarity
        results = sorted(zip(self.doc_ids, similarities), key=lambda x: x[1], reverse=True)  # Sort results
        return [doc_id for doc_id, _ in results[:top_n]]  # Return top N document IDs
    
    def search(self, query, model_choice):
        """Executes the search based on the user's chosen model and measures performance."""
        tracemalloc.start()
        start_time = time.time()

        if model_choice == "1":
            results = self.boolean_search(query)
        elif model_choice == "2":
            results = self.exact_match_search(query)
        elif model_choice == "3":
            top_n = int(input("Enter the number of top results to display: "))
            results = self.vector_search(query, top_n)
        else:
            print("Invalid choice. Please enter 1, 2, or 3.")
            return

        elapsed_time = time.time() - start_time
        memory_usage = tracemalloc.get_traced_memory()
        tracemalloc.stop()

        print(f"Results ({len(results)} found) in {elapsed_time:.4f} sec")
        print(f"Memory usage: {memory_usage[1] / 1024:.2f} KB")
        print(results)


In [22]:
ir_system = InformationRetrievalSystem("recipes")
#while True:
print("Choose a search model:")
print("1: Boolean Model")
print("2: Exact Match Search")
print("3: Vector Space Model")
model_choice = input("Enter your choice (1, 2, or 3) or 'exit' to quit: ")
# if model_choice.lower() == 'exit':
#     break
query = input("Enter your query: ")
ir_system.search(query, model_choice)
print("--------------------------------------------------")
print("Search completed.")
print("--------------------------------------------------")

Choose a search model:
1: Boolean Model
2: Exact Match Search
3: Vector Space Model
Results (5 found) in 4.2227 sec
Memory usage: 563.43 KB
['Carrot Cake Pancakes Carrot.txt', 'Blackout Cake Blackout.txt', 'Carrot Cake Baked Oatmeal.txt', 'Strawberry Shortcake This.txt', 'Microwave Chocolate Pudding.txt']
--------------------------------------------------
Search completed.
--------------------------------------------------
