In [10]:
import os
import time
from collections import defaultdict
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class InformationRetrievalSystem:
    """In-memory search over the text files of one directory.

    Three retrieval models are available:

    * boolean search over an inverted index (``boolean_search``),
    * case-insensitive substring search (``exact_match_search``),
    * TF-IDF / cosine-similarity ranking (``vector_search``).

    All documents are read and indexed when the object is constructed.
    """

    def __init__(self, directory):
        """Load every file found in ``directory`` and build both indexes.

        Args:
            directory: path of the folder holding the documents.
        """
        self.directory = directory
        self.documents = {}  # filename -> lower-cased file content
        self.boolean_index = defaultdict(set)  # word -> set of filenames containing it
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = None  # TF-IDF matrix, one row per entry of doc_ids
        self.doc_ids = []  # filenames, in the same order as the matrix rows
        self.load_documents()
        self.build_indexes()

    def load_documents(self):
        """Read every regular file of ``self.directory`` into ``self.documents``.

        Content is lower-cased. Sub-directories (for example the
        ``.ipynb_checkpoints`` folder Jupyter creates) are skipped instead of
        being passed to ``open``. Entries are visited in sorted order so the
        document order does not depend on the file system.

        Raises:
            OSError: if the directory or a file cannot be read.
            UnicodeDecodeError: if a file is not valid UTF-8.
        """
        for filename in sorted(os.listdir(self.directory)):
            filepath = os.path.join(self.directory, filename)
            if not os.path.isfile(filepath):
                continue
            with open(filepath, 'r', encoding='utf-8') as file:
                self.documents[filename] = file.read().lower()

    def build_indexes(self):
        """(Re)build the inverted index and the TF-IDF matrix from ``self.documents``."""
        self.doc_ids = list(self.documents.keys())

        # Start from an empty index so that calling this method again does not
        # keep postings of documents that are no longer loaded.
        self.boolean_index = defaultdict(set)
        for doc_id, text in self.documents.items():
            # Words are whitespace-separated chunks; each distinct word of a
            # document contributes one posting <word, doc_id>.
            for word in set(text.split()):
                self.boolean_index[word].add(doc_id)

        if self.documents:
            self.tfidf_matrix = self.vectorizer.fit_transform(self.documents.values())
        else:
            # Nothing to fit on; vector_search() reports no results in this state.
            self.tfidf_matrix = None

    def boolean_search(self, query):
        """Evaluate a flat boolean query strictly left to right.

        ``AND``, ``OR`` and ``NOT`` (any letter case) are operators; every other
        whitespace-separated token is a search term. An operator stays in
        effect until another operator is read, and the initial operator is
        ``AND``. ``NOT`` subtracts the term's documents from the running
        result; when it precedes the first term, the running result starts
        from all loaded documents.

        Args:
            query: the query string, e.g. ``"egg AND milk NOT sugar"``.

        Returns:
            A new set of matching filenames (empty when nothing matches or the
            query holds no term). The inverted index is never modified.
        """
        operators = {"AND", "OR", "NOT"}
        result_set = None
        current_op = "AND"

        # str.split() without arguments ignores leading/trailing/repeated
        # whitespace, so no empty token can wipe out the result.
        for token in query.split():
            if token.upper() in operators:
                current_op = token.upper()
                continue

            # Lower-case the original token (documents were lower-cased too);
            # going through upper() first would not round-trip for characters
            # such as 'ß'.
            matching_docs = self.boolean_index.get(token.lower(), set())

            if result_set is None:
                if current_op == "NOT":
                    result_set = set(self.documents) - matching_docs
                else:
                    # Copy: the in-place operators below must not touch the
                    # posting set stored in the index.
                    result_set = set(matching_docs)
            elif current_op == "AND":
                result_set &= matching_docs
            elif current_op == "OR":
                result_set |= matching_docs
            elif current_op == "NOT":
                result_set -= matching_docs

        return result_set if result_set is not None else set()

    def exact_match_search(self, query):
        """Return the filenames whose content contains ``query`` as a substring.

        The comparison is case-insensitive (documents are stored lower-cased).
        An empty query matches every document.
        """
        needle = query.lower()
        return [doc_id for doc_id, text in self.documents.items() if needle in text]

    def vector_search(self, query, top_n):
        """Rank documents with the vector space model (TF-IDF + cosine similarity).

        Args:
            query: free-text query.
            top_n: maximum number of results; values below 0 are treated as 0.

        Returns:
            A list of at most ``top_n`` ``(filename, similarity)`` pairs sorted
            by decreasing similarity. Documents with a similarity of 0 are not
            filtered out. The list is empty when no document is indexed.
        """
        if self.tfidf_matrix is None:
            return []

        query_vector = self.vectorizer.transform([query])  # query as a TF-IDF vector
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]  # one score per document

        ranked = sorted(zip(self.doc_ids, similarities), key=lambda pair: pair[1], reverse=True)
        return ranked[:max(top_n, 0)]

    def search(self, query, model_choice):
        """Run ``query`` with the chosen model and print the results and timing.

        Args:
            query: the query string.
            model_choice: ``"1"`` boolean, ``"2"`` exact match, ``"3"`` vector
                space (the number of results is then asked interactively).
                Anything else prints an error message.
        """
        if model_choice == "1":
            start_time = time.perf_counter()
            results = self.boolean_search(query)
            elapsed_time = time.perf_counter() - start_time
            print(f"Booleen model ({len(results)} results): {results} en {elapsed_time:.4f} sec")
        elif model_choice == "2":
            start_time = time.perf_counter()
            results = self.exact_match_search(query)
            elapsed_time = time.perf_counter() - start_time
            print(f"Exact search ({len(results)} results): {results} en {elapsed_time:.4f} sec")
        elif model_choice == "3":
            try:
                top_n = int(input("Enter the number of results you want: "))
            except ValueError:
                print("Invalid number of results. Please enter an integer.")
                return
            # The timer starts after the prompt so user typing time is excluded.
            start_time = time.perf_counter()
            results = self.vector_search(query, top_n)
            elapsed_time = time.perf_counter() - start_time
            print(f"Vector space model ({len(results)} results): {[doc for doc, _ in results]} en {elapsed_time:.4f} sec")
        else:
            print("Invalid choice. Please enter 1, 2, or 3.")


In [13]:
# Interactive driver: build the index over the "recipes" folder, ask for a
# retrieval model and a query, then run a single search.
ir_system = InformationRetrievalSystem("recipes")
print("Choose a search model:")
print("1: Boolean Model")
print("2: Exact Match Search")
print("3: Vector Space Model")
model_choice = input("Enter your choice (1, 2, or 3) or 'exit' to quit: ").strip()
# The prompt offers 'exit', so honour it instead of asking for a query and
# then reporting an invalid choice.
if model_choice.lower() != 'exit':
    query = input("Enter your query: ")
    ir_system.search(query, model_choice)
    print("--------------------------------------------------")
    print("Search completed.")
    print("--------------------------------------------------")

Choose a search model:
1: Boolean Model
2: Exact Match Search
3: Vector Space Model
Vector space model (6 results): ['Mandarin Orange Watergate.txt', 'Pistachio Chip Ice Cream.txt', 'Crescent Sausage Egg Roll.txt', 'No Bake Espresso Martini.txt', 'German Chocolate Cake.txt', 'Sticky Rice with Passion.txt'] en 0.0045 sec
--------------------------------------------------
Search completed.
--------------------------------------------------
