In [39]:
# Bloc 1: Choix du modèle, requête et obtention des résultats avec les numéros des documents

import os
import time
from collections import defaultdict
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tracemalloc

class InformationRetrievalSystem:
    """In-memory search engine over a directory of text documents.

    All documents are read and indexed when the instance is created.
    Three retrieval models are available through :meth:`search`:

    * boolean search (AND / OR / NOT over whitespace-separated words),
    * exact substring match,
    * TF-IDF vector space ranking by cosine similarity.
    """

    def __init__(self, directory):
        """Load every document found in ``directory`` and build both indexes."""
        self.directory = directory
        self.documents = {}  # filename -> lower-cased file content
        self.boolean_index = defaultdict(set)  # word -> filenames containing it
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = None  # TF-IDF matrix, one row per entry of doc_ids
        self.doc_ids = []  # filenames, in the same order as the TF-IDF rows
        self.load_documents()
        self.build_indexes()

    def load_documents(self):
        """Read every regular file of ``self.directory`` into ``self.documents``.

        The content is lower-cased so that all searches are case-insensitive.
        Files are visited in sorted name order so that document numbering is
        reproducible, and anything that is not a regular file (for example a
        ``.ipynb_checkpoints`` sub-directory) is skipped instead of crashing
        ``open``.
        """
        for filename in sorted(os.listdir(self.directory)):
            filepath = os.path.join(self.directory, filename)
            if not os.path.isfile(filepath):
                continue
            with open(filepath, 'r', encoding='utf-8') as file:
                self.documents[filename] = file.read().lower()

    def build_indexes(self):
        """Build the boolean inverted index and the TF-IDF matrix.

        The boolean index maps each whitespace-separated word to the set of
        filenames that contain it. With no documents at all the TF-IDF matrix
        is left as ``None`` because the vectorizer cannot be fitted on an
        empty corpus.
        """
        self.doc_ids = list(self.documents.keys())

        # Boolean index: each distinct word points at the documents holding it
        for doc_id, text in self.documents.items():
            for word in set(text.split()):
                self.boolean_index[word].add(doc_id)

        if not self.documents:
            self.tfidf_matrix = None
            return

        # TF-IDF matrix: rows follow the order of self.doc_ids
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents.values())

    def boolean_search(self, query):
        """Evaluate a left-to-right boolean query and return matching filenames.

        ``AND``, ``OR`` and ``NOT`` (any letter case) are operators; every
        other token is a search term. An operator stays active until another
        one is met, and ``AND`` is assumed at the start. A query that begins
        with ``NOT term`` returns every document that does not contain the
        term.

        Returns:
            set: filenames satisfying the query (empty set for an empty query).
        """
        result_set = None
        current_op = "AND"

        # str.split() drops leading/trailing/repeated whitespace, so no empty
        # token can wipe out the result of an AND query.
        for token in query.split():
            operator = token.upper()
            if operator in {"AND", "OR", "NOT"}:
                current_op = operator
                continue

            # Lower-case the token as typed (documents are stored lower-cased).
            # Copy the posting set: the in-place operators below must never
            # modify the sets stored inside the index itself.
            matching_docs = set(self.boolean_index.get(token.lower(), ()))

            if result_set is None:
                if current_op == "NOT":
                    # Leading NOT: complement relative to the whole collection
                    result_set = set(self.documents) - matching_docs
                else:
                    result_set = matching_docs
            elif current_op == "AND":
                result_set &= matching_docs
            elif current_op == "OR":
                result_set |= matching_docs
            elif current_op == "NOT":
                result_set -= matching_docs

        return result_set if result_set is not None else set()

    def exact_match_search(self, query):
        """Return the filenames whose content contains ``query`` as a substring.

        The comparison is case-insensitive because documents are stored
        lower-cased.
        """
        needle = query.lower()  # computed once instead of once per document
        return [doc_id for doc_id, text in self.documents.items() if needle in text]

    def vector_search(self, query, top_n=10):
        """Rank documents by TF-IDF cosine similarity to ``query``.

        Args:
            query: free-text query.
            top_n: maximum number of documents to return; negative values are
                treated as 0.

        Returns:
            list: ``(rank, filename)`` tuples, rank starting at 1, best match
            first. Documents with a similarity of zero share no indexed term
            with the query and are therefore not reported as results.
        """
        if self.tfidf_matrix is None:
            return []

        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]
        # sorted() is stable, so equally scored documents keep doc_ids order
        ranked = sorted(zip(self.doc_ids, similarities), key=lambda x: x[1], reverse=True)
        matches = [doc_id for doc_id, score in ranked if score > 0]
        return [(i + 1, doc_id) for i, doc_id in enumerate(matches[:max(top_n, 0)])]

    def search(self, query, model_choice, top_n=10):
        """Run the chosen model, print timing/memory statistics and the results.

        Args:
            query: the user's query string.
            model_choice: ``"1"`` boolean, ``"2"`` exact match, ``"3"`` vector
                space.
            top_n: maximum number of results for the vector space model.

        Returns:
            list: ``(number, filename)`` tuples, numbered from 1. An empty
            list is returned (after printing a message) for an invalid
            ``model_choice``.
        """
        # Validate before starting the memory tracer so that an invalid choice
        # cannot leave tracemalloc running.
        if model_choice not in {"1", "2", "3"}:
            print("Invalid choice. Please enter 1, 2, or 3.")
            return []

        tracemalloc.start()
        try:
            start_time = time.perf_counter()  # monotonic, high-resolution clock

            if model_choice == "1":
                # Sets have no stable order: sort so that the numbers shown to
                # the user are the same from one run to the next.
                matches = sorted(self.boolean_search(query))
                results = [(i + 1, doc_id) for i, doc_id in enumerate(matches)]
            elif model_choice == "2":
                matches = self.exact_match_search(query)
                results = [(i + 1, doc_id) for i, doc_id in enumerate(matches)]
            else:
                results = self.vector_search(query, top_n)

            elapsed_time = time.perf_counter() - start_time
            memory_usage = tracemalloc.get_traced_memory()  # (current, peak) bytes
        finally:
            # Always stop tracing, even when the search itself raises
            tracemalloc.stop()

        print(f"Results ({len(results)} found) in {elapsed_time:.4f} sec")
        print(f"Memory usage: {memory_usage[1] / 1024:.2f} KB")

        # Display results with their selection numbers
        print("Documents returned:")
        if not results:
            print("No results found.")
        else:
            for number, doc_id in results:
                print(f"{number}: {doc_id}")

        return results

# Create the retrieval system over the directory that contains the recipes
ir_system = InformationRetrievalSystem("recipes")

# Number of results shown when the user gives no usable value
DEFAULT_TOP_N = 10

# Ask for the search model and the query; surrounding whitespace in the model
# choice (e.g. "1 ") would otherwise make a valid answer look invalid
model_choice = input("Choose a search model (1: Boolean, 2: Exact Match, 3: Vector Space): ").strip()
query = input("Enter your search query: ")

# Only the vector space model takes a result limit, so only ask for it then
top_n = DEFAULT_TOP_N
if model_choice == "3":
    raw_top_n = input("Enter the number of top results to display: ").strip()
    try:
        top_n = int(raw_top_n)
    except ValueError:
        # Non-numeric or empty answer: fall back instead of crashing the cell
        print(f"Invalid number {raw_top_n!r}; showing the top {DEFAULT_TOP_N} results instead.")
        top_n = DEFAULT_TOP_N
    if top_n < 1:
        print(f"The number of results must be at least 1; showing the top {DEFAULT_TOP_N} results instead.")
        top_n = DEFAULT_TOP_N

# Run the search; `results` is reused by the evaluation cell that follows
results = ir_system.search(query, model_choice, top_n)

# # Sauvegarder les résultats pour l'évaluation dans le prochain bloc
# print("Search results:", results)


Results (4 found) in 0.0001 sec
Memory usage: 1.22 KB
Documents returned:
1: Carrot Cake Baked Oatmeal.txt
2: Carrot Bread This carrot.txt
3: Carrot Cake Pancakes Carrot.txt
4: Easter Egg Rolls These.txt


In [40]:
# Bloc 2: Entrer les numéros des documents pertinents et évaluer la précision et le rappel

def compute_precision_recall(retrieved, relevant):
    """Return ``(precision, recall)`` for a retrieved list against a relevant list.

    Both arguments are iterables of document identifiers; duplicates are
    ignored. Precision is the share of retrieved documents that are relevant,
    recall the share of relevant documents that were retrieved. Either value
    is 0 when its denominator would be empty.
    """
    found, wanted = set(retrieved), set(relevant)
    hits = len(found.intersection(wanted))  # documents both retrieved and relevant

    if found:
        precision = hits / len(found)
    else:
        precision = 0

    if wanted:
        recall = hits / len(wanted)
    else:
        recall = 0

    return precision, recall

# Ask the user for the numbers (as printed by the search) of the relevant documents
raw_entries = input("Enter relevant document numbers separated by commas (if known): ").split(',')

# Keep only entries that are integers within 1..len(results). Without this
# check a non-numeric entry raises ValueError, a too-large number raises
# IndexError, and 0 or a negative number silently picks a document from the
# end of the list.
relevant_doc_numbers = []
for entry in raw_entries:
    entry = entry.strip()
    if not entry:
        continue
    try:
        number = int(entry)
    except ValueError:
        print(f"Ignoring invalid document number: {entry!r}")
        continue
    if not 1 <= number <= len(results):
        print(f"Ignoring document number out of range: {number}")
        continue
    relevant_doc_numbers.append(number)

# Convert the document numbers to filenames (numbers are 1-based positions in `results`)
relevant_docs = [results[number-1][1] for number in relevant_doc_numbers]

# Compute precision and recall.
# NOTE: relevant documents can only be picked among the retrieved ones, so the
# recall printed here is 1.0 as soon as one document is selected; it does not
# account for relevant documents the search failed to return.
precision, recall = compute_precision_recall([doc_id for _, doc_id in results], relevant_docs)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}")


Precision: 0.7500, Recall: 1.0000
