In [1]:
import os
import time
from collections import defaultdict
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class InformationRetrievalSystem:
    """In-memory search engine over the text files of one directory.

    Three retrieval strategies share the same corpus:

    * a boolean model backed by an inverted index (token -> document ids),
    * an exact, case-insensitive substring match,
    * a vector space model (TF-IDF weights + cosine similarity).

    All documents are read and indexed eagerly when the object is created.
    """

    # Query tokens interpreted as boolean operators (matched case-insensitively).
    _OPERATORS = frozenset({"AND", "OR", "NOT"})

    def __init__(self, directory):
        """Load every file found in *directory* and build both indexes.

        Args:
            directory: Path of the folder holding the documents to index.
        """
        self.directory = directory
        self.documents = {}  # filename -> lower-cased file content
        self.boolean_index = defaultdict(set)  # token -> set of filenames
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = None  # TF-IDF matrix, one row per entry of doc_ids
        self.doc_ids = []  # filenames, in the same order as the matrix rows
        self.load_documents()
        self.build_indexes()

    def load_documents(self):
        """Read each regular file of the directory into ``self.documents``.

        Content is lower-cased so that every search is case-insensitive.
        Entries that are not regular files (e.g. sub-directories) are skipped
        because they cannot be opened as text. Files are visited in sorted
        name order so document order is reproducible across platforms.
        """
        for filename in sorted(os.listdir(self.directory)):
            filepath = os.path.join(self.directory, filename)
            if not os.path.isfile(filepath):
                continue
            with open(filepath, 'r', encoding='utf-8') as file:
                self.documents[filename] = file.read().lower()

    def build_indexes(self):
        """(Re)build the inverted index and the TF-IDF matrix.

        The inverted index maps each whitespace-separated token to the set of
        documents containing it. Tokens are not stripped of punctuation.
        """
        self.doc_ids = list(self.documents)

        # Start from a fresh index so a rebuild never keeps stale postings.
        self.boolean_index = defaultdict(set)
        for doc_id, text in self.documents.items():
            # A set removes duplicate tokens: one posting per (token, doc).
            for word in set(text.split()):
                self.boolean_index[word].add(doc_id)

        if self.documents:
            self.tfidf_matrix = self.vectorizer.fit_transform(self.documents.values())
        else:
            # The vectorizer cannot be fitted without documents; leave the
            # matrix unset and let vector_search() report no result.
            self.tfidf_matrix = None

    def boolean_search(self, query):
        """Evaluate a flat boolean query from left to right.

        ``AND``, ``OR`` and ``NOT`` (any case) are operators; every other
        token is a search term. An operator applies to the terms that follow
        it until another operator is met; the initial operator is ``AND``.
        ``NOT`` removes the documents containing the term from the current
        result. A query starting with ``NOT`` is evaluated against the whole
        collection. There is no precedence or parenthesis handling.

        Args:
            query: The query string, e.g. ``"carrot AND cake NOT nuts"``.

        Returns:
            A new set of matching document ids (empty when nothing matches
            or when the query holds no term).
        """
        result_set = None
        current_op = "AND"

        # str.split() drops leading/trailing whitespace, so no empty token
        # can reach the index lookup and wipe out the result.
        for token in query.split():
            operator = token.upper()
            if operator in self._OPERATORS:
                current_op = operator
                continue

            matching_docs = self.boolean_index.get(token.lower(), set())
            if result_set is None:
                # Always work on a copy: the in-place operators below must
                # never modify the postings stored in the index.
                if current_op == "NOT":
                    result_set = set(self.documents) - matching_docs
                else:
                    result_set = set(matching_docs)
            elif current_op == "AND":
                result_set &= matching_docs
            elif current_op == "OR":
                result_set |= matching_docs
            elif current_op == "NOT":
                result_set -= matching_docs

        return result_set if result_set is not None else set()

    def exact_match_search(self, query):
        """Return the ids of documents containing *query* as a substring.

        The comparison is case-insensitive: documents are stored lower-cased
        and the query is lower-cased once before scanning.
        """
        needle = query.lower()
        return [doc_id for doc_id, text in self.documents.items() if needle in text]

    def vector_search(self, query):
        """Rank all documents with the vector space model (TF-IDF + cosine).

        Args:
            query: Free-text query.

        Returns:
            A list of ``(doc_id, similarity)`` pairs sorted by decreasing
            similarity. Every indexed document is listed, including those
            with a similarity of zero. Empty when no document is indexed.
        """
        if self.tfidf_matrix is None:
            return []

        query_vector = self.vectorizer.transform([query])  # query -> TF-IDF vector

        # Row 0 holds the similarity of the single query with each document.
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]

        results = sorted(zip(self.doc_ids, similarities), key=lambda x: x[1], reverse=True)
        return results

    def search(self, query):
        """Run the three search models on *query* and print results and timings."""
        # perf_counter() is monotonic and has a finer resolution than time(),
        # which matters for lookups lasting a few microseconds.
        start_time = time.perf_counter()
        boolean_results = self.boolean_search(query)
        boolean_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        exact_results = self.exact_match_search(query)
        exact_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        vector_results = self.vector_search(query)
        vector_time = time.perf_counter() - start_time

        print("--- Résults ---")
        print(f"Booleen model ({len(boolean_results)} résults) : {boolean_results} en {boolean_time:.4f} sec")
        print(f"Exact search ({len(exact_results)} résults) : {exact_results} en {exact_time:.4f} sec")
        print(f"Vector space model ({len(vector_results)} résults) : {[doc for doc, _ in vector_results]} in {vector_time:.4f} sec")



In [2]:
# Interactive console: index the "recipes" folder once, then answer queries
# until the user types 'exit' (in any letter case).
PROMPT = "Enter your request (or 'exit') : "

ir_system = InformationRetrievalSystem("recipes")
query = input(PROMPT)
while query.lower() != 'exit':
    ir_system.search(query)
    query = input(PROMPT)

--- Résults ---
Booleen model (3 résults) : {'Carrot Cake Pancakes Carrot.txt', 'Carrot Bread This carrot.txt', 'Carrot Cake Baked Oatmeal.txt'} en 0.0000 sec
Exact search (2 résults) : ['Carrot Cake Pancakes Carrot.txt', 'Carrot Cake Baked Oatmeal.txt'] en 0.0004 sec
Vector space model (136 résults) : ['Carrot Cake Pancakes Carrot.txt', 'Blackout Cake Blackout.txt', 'Carrot Cake Baked Oatmeal.txt', 'Strawberry Shortcake This.txt', 'Microwave Chocolate Pudding.txt', 'Salted Caramel Irish Cream.txt', 'Cinnamon Nutmeg Coffee.txt', 'Rainbow Clown Cake I went.txt', 'Carrot Bread This carrot.txt', 'Cherry Cheesecake Dump.txt', 'German Chocolate Cake.txt', 'Cottage Cheese Blueberry.txt', 'Cherry Torte with Cherry.txt', 'Easter Egg Rolls These.txt', 'Easy Strawberry Brownies.txt', 'Fluffy Cake Donuts This.txt', 'Whole Orange Blender Cake.txt', 'Easy Pecan Sticky Buns.txt', 'Creepy Halloween Skull.txt', 'Pumpkin Oreo Cupcakes.txt', 'Fudgy Black Forest Bars.txt', 'Baked Mini Doughnuts Yummy.txt