In [None]:
import os
import time
import math
from collections import defaultdict, Counter
import re

class InformationRetrievalSystem:
    """In-memory search engine over a directory of UTF-8 text files.

    Three retrieval models share the same lower-cased corpus:

    * a Boolean model backed by an inverted index of whitespace-separated
      tokens,
    * an exact (substring) search over the raw document text,
    * a vector space model using max-normalized TF times a smoothed IDF.

    Attributes:
        directory: path of the directory the documents are read from.
        documents: maps file name -> lower-cased file content.
        boolean_index: maps token -> set of file names containing it.
        vector_index: maps file name -> {token: TF-IDF weight}.
        document_frequencies: maps token -> number of documents containing it.
    """

    # Query keywords recognised by boolean_search (matched case-insensitively).
    _BOOLEAN_OPERATORS = frozenset({"AND", "OR", "NOT"})

    def __init__(self, directory):
        """Load every file found in *directory* and build all indexes."""
        self.directory = directory
        self.documents = {}  # file name -> lower-cased content
        self.boolean_index = defaultdict(set)
        self.vector_index = {}
        self.document_frequencies = Counter()
        self.load_documents()
        self.build_indexes()

    def load_documents(self):
        """Read each regular file of ``self.directory`` into ``self.documents``.

        Content is lower-cased so that all searches are case-insensitive.
        Sub-directories (e.g. ``.ipynb_checkpoints``) are skipped, since
        opening them as files would raise.
        """
        for filename in os.listdir(self.directory):
            filepath = os.path.join(self.directory, filename)
            if not os.path.isfile(filepath):
                continue
            with open(filepath, 'r', encoding='utf-8') as file:
                self.documents[filename] = file.read().lower()

    def _idf(self, term):
        """Return the smoothed inverse document frequency of *term*.

        Computed as ``log(N / (1 + df))``; the ``1 +`` avoids a division by
        zero for terms absent from the corpus. The value is zero or negative
        for terms present in all (or all but one) documents. The corpus must
        be non-empty, otherwise ``log(0)`` is undefined.
        """
        return math.log(len(self.documents) / (1 + self.document_frequencies[term]))

    def build_indexes(self):
        """Populate the inverted index, document frequencies and TF-IDF vectors."""
        # Pass 1: tokenize each document once, fill the inverted index and
        # count in how many documents each term occurs.
        term_counts = {}
        for doc_id, text in self.documents.items():
            counts = Counter(text.split())
            term_counts[doc_id] = counts
            for word in counts:
                self.boolean_index[word].add(doc_id)
            # Each distinct term of the document adds 1 to its document frequency.
            self.document_frequencies.update(counts.keys())

        # Pass 2: IDF needs the complete document frequencies, so the
        # vectors can only be computed once pass 1 has seen every document.
        for doc_id, counts in term_counts.items():
            # default=1 keeps empty / whitespace-only documents from raising;
            # they simply get an empty vector.
            max_freq = max(counts.values(), default=1)
            self.vector_index[doc_id] = {
                term: (freq / max_freq) * self._idf(term)
                for term, freq in counts.items()
            }

    def boolean_search(self, query):
        """Evaluate a Boolean query and return the set of matching file names.

        The query is a whitespace-separated sequence of terms and the
        operators AND, OR, NOT (case-insensitive). It is evaluated strictly
        left to right, without precedence or parentheses. The operator in
        effect starts as AND and stays in effect for every following term
        until another operator appears. ``a NOT b`` means "a without b"; a
        query that starts with ``NOT b`` means "every document without b".

        The returned set is always a fresh object: the index is never mutated.
        """
        result_set = None
        current_op = "AND"

        # str.split() drops leading/trailing whitespace, so no empty tokens.
        for token in query.split():
            keyword = token.upper()
            if keyword in self._BOOLEAN_OPERATORS:
                current_op = keyword
                continue

            # Lower-case the raw token (not the upper-cased one) so that it is
            # normalized exactly like the document text.
            matching_docs = self.boolean_index.get(token.lower(), set())
            if result_set is None:
                if current_op == "NOT":
                    # No left operand: complement against the whole corpus.
                    result_set = set(self.documents) - matching_docs
                else:
                    # Copy, so later in-place operations never touch the index.
                    result_set = set(matching_docs)
            elif current_op == "AND":
                result_set &= matching_docs
            elif current_op == "OR":
                result_set |= matching_docs
            else:  # NOT
                result_set -= matching_docs

        return result_set if result_set is not None else set()

    def exact_match_search(self, query):
        """Return the file names whose text contains *query* as a substring.

        Matching is case-insensitive. An empty query matches every document.
        """
        needle = query.lower()
        return [doc_id for doc_id, text in self.documents.items() if needle in text]

    def vector_search(self, query):
        """Rank all documents against *query* with the vector space model.

        Each query term is weighted by its IDF; a document's score is the dot
        product of the query weights and the document's TF-IDF weights. The
        score is not length-normalized, so it is not a cosine similarity. A
        term repeated in the query contributes once per occurrence.

        Returns:
            A list of ``(file name, score)`` pairs for every document, sorted
            by decreasing score; an empty list when the corpus is empty.
        """
        if not self.documents:
            # IDF is undefined (log of 0) without documents.
            return []

        query_terms = query.lower().split()
        query_vector = {term: self._idf(term) for term in query_terms}

        scores = {}
        for doc_id, doc_vector in self.vector_index.items():
            scores[doc_id] = sum(
                query_vector[term] * doc_vector.get(term, 0) for term in query_terms
            )

        return sorted(scores.items(), key=lambda x: x[1], reverse=True)

    def search(self, query):
        """Run *query* through the three models and print results and timings."""
        # perf_counter is monotonic and high-resolution; time.time() is too
        # coarse for sub-millisecond measurements on some platforms.
        start_time = time.perf_counter()
        boolean_results = self.boolean_search(query)
        boolean_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        exact_results = self.exact_match_search(query)
        exact_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        vector_results = self.vector_search(query)
        vector_time = time.perf_counter() - start_time

        print("--- Résults ---")
        print(f"Booleen model ({len(boolean_results)} résults) : {boolean_results} en {boolean_time:.4f} sec")
        print(f"Exact search ({len(exact_results)} résults) : {exact_results} en {exact_time:.4f} sec")
        print(f"Vector space model ({len(vector_results)} résults) : {[doc for doc, _ in vector_results]} in {vector_time:.4f} sec")



In [27]:
# Interactive loop: index the "recipes" directory once, then answer queries
# until the user types 'exit' or the input stream ends.
ir_system = InformationRetrievalSystem("recipes")
while True:
    try:
        query = input("Enter your request (or 'exit') : ")
    except (EOFError, KeyboardInterrupt):
        # Input closed or interrupted: leave the loop instead of crashing
        # with a traceback.
        break
    # Tolerate stray whitespace around the exit keyword; the query itself is
    # passed on unchanged.
    if query.strip().lower() == 'exit':
        break
    ir_system.search(query)

--- Résultats ---
Modèle booléen (1 résultats) : {'Carrot Cake Pancakes Carrot.txt'} en 0.0000 sec
Recherche exacte (0 résultats) : [] en 0.0003 sec
Modèle vectoriel (136 résultats) : ['Carrot Cake Pancakes Carrot.txt', 'Easy Bacon Pancake Sticks.txt', 'Carrot Cake Baked Oatmeal.txt', 'Easy Sausage Pancakes.txt', 'Carrot Bread This carrot.txt', 'Easy McGriddle Casserole.txt', 'High Protein Scrambled.txt', 'Bacon Pancakes These bacon.txt', 'Easter Egg Rolls These.txt', 'Pistachio Chip Ice Cream.txt', 'Crescent Sausage Egg Roll.txt', 'No Bake Espresso Martini.txt', 'German Chocolate Cake.txt', 'Pavlova Christmas Trees.txt', 'Watermelon Fruit Bowl.txt', 'Orange Meringue Pie A.txt', 'Cauliflower Popcorn Cauliflower.txt', 'Chef John s Tortilla de.txt', 'Lemon Turmeric Crinkles.txt', 'Strawberry Shortcake This.txt', 'Breakfast Nachos These.txt', 'Ramen Omelet with Spinach.txt', 'Flat Croissant with Nutella.txt', 'Christmas Bark This Christmas.txt', 'Brioche French Toast Casserole.txt', 'Cher