## import BIB

In [37]:
from PyQt5.QtWidgets import (
    QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
    QLineEdit, QPushButton, QRadioButton, QLabel, QGroupBox,
    QTableWidget, QTableWidgetItem, QScrollArea, QTextEdit, QStackedWidget, QGridLayout,
    QMessageBox,QHeaderView,QComboBox
)
from PyQt5.QtGui import QDoubleValidator
from PyQt5.QtGui import QIcon
from PyQt5.QtCore import Qt
import sys
import numpy as np

In [38]:
import nltk
import os
import math
from collections import defaultdict
from nltk import FreqDist


STOPWORDS = set(nltk.corpus.stopwords.words('english'))
PORTER_STEMMER = nltk.PorterStemmer()
LANCASTER_STEMMER = nltk.LancasterStemmer()

## Fonctions

In [39]:
path = '../Collestions'

In [40]:
# def get_processing_args():
#     tokenization = "Split"
#     normalization = "None",
#     file_type = "TPD"
#     return tokenization, normalization, file_type

In [41]:
def preprocessing(doc_path, tokenization, normalization):
    with open(doc_path, 'r') as file:
        text = file.read()
        
    # Tokenization
    if tokenization == "Split":
        tokens = text.split()
    else:
        exp_reg = nltk.RegexpTokenizer(r'\d+(?:\.\d+)?x\d+|\d+(?:\.\d+)|\w+(?:-\w+)*|(?:[A-Z]\.)+|\w+')
        tokens = exp_reg.tokenize(text)

    # Remove stopwords
    tokens = [term for term in tokens if term.lower() not in STOPWORDS]

    # Normalization
    if normalization == "Porter":
        tokens = [PORTER_STEMMER.stem(term) for term in tokens]
    elif normalization == "Lancaster":
        tokens = [LANCASTER_STEMMER.stem(term) for term in tokens]

    return tokens

In [42]:
# def build_global_term_frequencies(tokenization, normalization):
#     global_term_frequencies = defaultdict(int)

#     for doc_name in os.listdir('Collections'):
#         doc_path = os.path.join('Collections', doc_name)
#         tokens = preprocessing(doc_path, tokenization, normalization)
#         unique_terms = set(tokens)

#         for term in unique_terms:
#             global_term_frequencies[term] += 1
            
#     return global_term_frequencies

In [43]:
# def TPD_result(query, terms_freq, global_term_frequencies, N):
#     max_freq = max(terms_freq.values())
#     results=[]
#     for idx, (term, freq) in enumerate(terms_freq.items(), start=1):
#         poids = (freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1)
#         results.append((idx, term, query, freq, round(poids, 4)))
        
#     return results

In [44]:
# def nb_termes_glob(tokenization, normalization):
#     nb_termes_global = []
#     for doc_name in os.listdir('Collections'):
#         doc_path = os.path.join('Collections', doc_name)
#         # Appliquer le prétraitement pour obtenir les tokens du document
#         tokens = preprocessing(doc_path, tokenization, normalization)
        
#         # Ajouter les tokens du document à la liste globale
#         nb_termes_global.extend(tokens)

#     # Obtenir le nombre de termes uniques
#     termes_uniques = np.unique(nb_termes_global)
#     print("Termes uniques : ", termes_uniques)  # Optionnel : pour visualiser les termes uniques
    
#     return len(termes_uniques)
            

In [45]:
def process_input(query, normalization):
    # appliquer le traitement sur la requete
    if normalization == "Porter":
        query = PORTER_STEMMER.stem(query) 
    elif normalization == "Lancaster":
        query = LANCASTER_STEMMER.stem(query) 
    return query

In [46]:
# def text_processing(query,tokenization, normalization, file_type):
#     # tokenization, normalization, file_type = get_processing_args()
#     nb_terms = 0
#     # calculer la frequence de chaque element
#     global_term_frequencies = build_global_term_frequencies(tokenization, normalization)  # Calculate global term frequencies
#     N = len(os.listdir('Collections'))
#     results =[]
#     if file_type == "TPD":
#         doc_path = os.path.join('Collections', f"{query}.txt")
#         tokens = preprocessing(doc_path, tokenization, normalization)
#         nb_terms = len(np.unique(tokens))
#         terms_freq = FreqDist(tokens)
        
#         results = TPD_result(query, terms_freq, global_term_frequencies, N)
#         print(f"la resultat de chaque requete type {results}")
#         return results , nb_terms
        
#     else :
#         query = process_input(query, normalization)
#         i=0
#         for doc_name in os.listdir('Collections'):
#             doc_path = os.path.join('Collections', doc_name)
#             Tokens = preprocessing(doc_path, tokenization, normalization)
#             terms_freq = FreqDist(Tokens)

#             max_freq = max(terms_freq.values())
#             for term, freq in terms_freq.items():  
#                 if term == query:  # Check if the term is the specific query term
#                     poids = ((freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1))
#                     i+=1
#                     results.append((i, term, os.path.splitext(doc_name)[0], freq, round(poids, 4)))
#         print(f"la resultat de chaque requete type {results}")
#         return results , nb_terms
   
                    

In [47]:
def get_text(query):
    # if raw:
    doc_path = os.path.join('Collections', f"{query}.txt")
    with open(doc_path, 'r') as file:
        text = file.read()
    return text
    # elif processed:
    #     text_processing(query)

In [48]:
import json
def create_descriptor_and_inverse_files_with_weights(path, tokenization, normalization, output_path="output"):
    # Créer un nom unique pour les fichiers en fonction des choix de tokenization et normalization
    descriptor_filename = f"descripteur_{tokenization}_{normalization}.json"
    inverse_index_filename = f"inverse_index_{tokenization}_{normalization}.json"
    
    descriptor_path = os.path.join(output_path, descriptor_filename)
    inverse_index_path = os.path.join(output_path, inverse_index_filename)
    
    # Vérifier si les fichiers existent déjà
    if os.path.exists(descriptor_path) and os.path.exists(inverse_index_path):
        print(f"Les fichiers descripteur et inverse existent déjà : {descriptor_path} et {inverse_index_path}")
        
        # Charger les fichiers existants
        with open(descriptor_path, "r", encoding="utf-8") as desc_file:
            descripteur = json.load(desc_file)
        with open(inverse_index_path, "r", encoding="utf-8") as inverse_file:
            inverse_index = json.load(inverse_file)
        
        return descripteur, inverse_index
    
    # Si les fichiers n'existent pas, les créer
    print("Création des fichiers descripteur et inverse...")
    
    # Initialiser les fichiers descripteur et inverse
    descripteur = {}
    inverse_index = defaultdict(lambda: defaultdict(lambda: {"freq": 0, "poids": 0}))
    
    # Nombre total de documents
    documents = os.listdir(path)
    N = len(documents)
    
    # Calcul des fréquences globales pour les poids
    global_term_frequencies = defaultdict(int)
    for doc_name in documents:
        doc_path = os.path.join(path, doc_name)
        tokens = preprocessing(doc_path, tokenization, normalization)
        unique_terms = set(tokens)
        for term in unique_terms:
            global_term_frequencies[term] += 1
    
    # Construire les fichiers descripteur et inverse
    for doc_name in documents:
        doc_path = os.path.join(path, doc_name)
        tokens = preprocessing(doc_path, tokenization, normalization)
        terms_freq = FreqDist(tokens)  # Fréquence des termes
        max_freq = max(terms_freq.values())  # Fréquence maximale dans le document
        doc_key = os.path.splitext(doc_name)[0]
        
        # Ajouter les termes au fichier descripteur
        descripteur[doc_key] = {}
        for term, freq in terms_freq.items():
            poids = (freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1)
            descripteur[doc_key][term] = {"freq": freq, "poids": round(poids, 4)}
            
            # Ajouter les termes au fichier inverse
            inverse_index[term][doc_key]["freq"] = freq
            inverse_index[term][doc_key]["poids"] = round(poids, 4)
    
    # Sauvegarder les fichiers en JSON
    os.makedirs(output_path, exist_ok=True)
    
    with open(descriptor_path, "w", encoding="utf-8") as desc_file:
        json.dump(descripteur, desc_file, indent=4, ensure_ascii=False)
        
    with open(inverse_index_path, "w", encoding="utf-8") as inverse_file:
        json.dump(inverse_index, inverse_file, indent=4, ensure_ascii=False)
    

    return descripteur, inverse_index


In [49]:
def total_terme_per_doc(query, descripteur):
    if query in descripteur:
        terms = descripteur[query]
        total = len(terms)  # Nombre de termes distincts dans le document
        return total
    else:
        return 0  # Si le document n'existe pas dans le fichier descripteur


In [50]:
def total_termes_descripteur(descripteur):
    total = 0
    for terms in descripteur.values():  # Parcourt chaque document
        total += len(terms)  # Ajoute le nombre de termes distincts dans chaque document
    return total


In [51]:
def open_descripteur_invers(normalization , tokenization ,output_path):
    # Générer les noms des fichiers
    descriptor_filename = f"descripteur_{tokenization}_{normalization}.json"
    inverse_index_filename = f"inverse_index_{tokenization}_{normalization}.json"

    descriptor_path = os.path.join(output_path, descriptor_filename)
    inverse_index_path = os.path.join(output_path, inverse_index_filename)

    # Vérifier si les fichiers existent, sinon les créer
    if not os.path.exists(descriptor_path) or not os.path.exists(inverse_index_path):
        print(f"Les fichiers pour {tokenization} et {normalization} n'existent pas. Création en cours...")
        create_descriptor_and_inverse_files_with_weights(path, tokenization, normalization, output_path)

    # Charger les fichiers descripteur et inverse
    with open(descriptor_path, "r", encoding="utf-8") as desc_file:
        descripteur = json.load(desc_file)
    with open(inverse_index_path, "r", encoding="utf-8") as inverse_file:
        inverse_index = json.load(inverse_file)
    return descripteur , inverse_index
        
    

In [52]:
def processing(query, tokenization, normalization, path, output_path="output",methode = 'TPD'):
    
    descripteur , inverse_index = open_descripteur_invers(normalization , tokenization ,output_path)
   
    nb_total_terme_per_doc = total_terme_per_doc(query, descripteur)
    nb_total_per_collection = total_termes_descripteur(descripteur)
    
    formatted_results = []
    if methode == 'TPD':
        if query in descripteur:
            terms = descripteur[query]
            for i, (term, data) in enumerate(terms.items(), start=1):
                formatted_results.append((i, term, query, data['freq'], data['poids']))
        
    else:
        if query in inverse_index:
            docs = inverse_index[query]
            for i, (doc, data) in enumerate(docs.items(), start=1):
                formatted_results.append((i, query, doc, data['freq'], data['poids']))
    
    
   

    return formatted_results , nb_total_terme_per_doc , nb_total_per_collection
    

In [53]:
formatted_results , nb_total_terme_per_doc , nb_total_per_collection = processing("D1", "split", None, path, output_path="output",methode = 'TPD')
formatted_results

[(1, 'experimental', 'D1', 2, 0.2817),
 (2, 'investigation', 'D1', 1, 0.1408),
 (3, 'aerodynamics', 'D1', 1, 0.1408),
 (4, 'wing', 'D1', 3, 0.301),
 (5, 'slipstream', 'D1', 5, 0.7042),
 (6, '.', 'D1', 6, 0.301),
 (7, 'study', 'D1', 1, 0.1003),
 (8, 'propeller', 'D1', 1, 0.1408),
 (9, 'made', 'D1', 2, 0.2007),
 (10, 'order', 'D1', 1, 0.1408),
 (11, 'determine', 'D1', 1, 0.1408),
 (12, 'spanwise', 'D1', 1, 0.1408),
 (13, 'distribution', 'D1', 1, 0.0663),
 (14, 'lift', 'D1', 3, 0.4225),
 (15, 'increase', 'D1', 1, 0.1408),
 (16, 'due', 'D1', 2, 0.2817),
 (17, 'different', 'D1', 3, 0.301),
 (18, 'angles', 'D1', 1, 0.1408),
 (19, 'attack', 'D1', 1, 0.1408),
 (20, 'free', 'D1', 1, 0.1003),
 (21, 'stream', 'D1', 1, 0.1003),
 (22, 'velocity', 'D1', 1, 0.1003),
 (23, 'ratios', 'D1', 1, 0.1408),
 (24, 'results', 'D1', 1, 0.1003),
 (25, 'intended', 'D1', 1, 0.1408),
 (26, 'part', 'D1', 2, 0.2817),
 (27, 'evaluation', 'D1', 2, 0.2817),
 (28, 'basis', 'D1', 1, 0.1003),
 (29, 'theoretical', 'D1', 1, 

In [54]:
def calculer_relevance_BM25(query, tokenization, normalization, path, k=1.5, b=0.75, output_path="output"):
    
        
    descripteur , inverse_index = open_descripteur_invers(normalization , tokenization ,output_path)

    # Calcul des longueurs de documents et de la taille moyenne
    doc_lengths = {doc: sum(term_data["freq"] for term_data in terms.values()) for doc, terms in descripteur.items()}

    # Nbre total des terms
    total_terms = sum(doc_lengths.values())
    # Nombre total de documents
    N = len(descripteur) 
    # la taille moyenne des documents 
    avdl = total_terms / N if N > 0 else 1  # Eviter division par zéro
    
    # Initialiser les scores de pertinence
    relevance_dict = {doc: 0 for doc in descripteur}

    # Traiter chaque terme de la requête
    for term in query.split():
        term = process_input(term, normalization)
        
        # dans le cas ou le terme n'exist pas dans le fichier inverse
        if term not in inverse_index:
            continue  # Ignorer les termes absents du fichier inverse

        # Nombre de documents contenant le terme
        ni = len(inverse_index[term])
        # print(f"nombre de document contenant {term} : {ni}")

        # Calculer l'IDF avec un seuil pour éviter des valeurs négatives ou extrêmes
        idf = math.log10(((N - ni + 0.5) / (ni + 0.5)) ) if ni + 0.5 != 0 else 0
        # print(idf)
        for doc_name, data in inverse_index[term].items():
            freq_ti_d = data["freq"]  # Fréquence du terme dans le document
            # print(f"frequant terme {term} dans le document {doc_name} est :{freq_ti_d}")
            dl = doc_lengths.get(doc_name, 0)  # Taille du document d
            
            if dl == 0:
                continue  # Éviter division par zéro

            # Calcul du score BM25
            # numerator = freq_ti_d * (k + 1)
            numerator = freq_ti_d 
            denominator = freq_ti_d + (k * ((1 - b) + b * (dl / avdl)))
            RSV = idf * (numerator / denominator)

            
            # Ajouter au score total du document
            relevance_dict[doc_name] += RSV
            
    # print(relevance_dict)
    
    filtered_relevances = {doc: score for doc, score in relevance_dict.items() if score != 0}

    # Trier les documents par pertinence décroissante
    sorted_relevance = dict(sorted(filtered_relevances.items(), key=lambda item: item[1], reverse=True))
    print(sorted_relevance)
    return sorted_relevance


In [55]:
tokenization = "Split"  # Ou "Regex" selon votre choix
normalization = "Lancaster"  # Ou "Lancaster" selon votre choix
query = "effect distribution "  # Exemple de requête
path = "../Collections"  # Chemin vers le dossier contenant les documents
output_path = "output"


relevance_scores = calculer_relevance_BM25(query, tokenization, normalization, path, k=1.5, b=0.75, output_path=output_path)

if relevance_scores:
    print("Scores de pertinence :", relevance_scores)

{'D4': -0.08823075947891026, 'D2': -0.09795556690944766, 'D1': -0.2728925083902616, 'D3': -0.27339867077595187, 'D6': -0.3028044304368359}
Scores de pertinence : {'D4': -0.08823075947891026, 'D2': -0.09795556690944766, 'D1': -0.2728925083902616, 'D3': -0.27339867077595187, 'D6': -0.3028044304368359}


In [56]:
def calculer_relevance(query, tokenization, normalization, file_type , path ):
    # Créer une liste des documents à partir des fichiers dans le dossier
    docs = [doc.split('.')[0] for doc in os.listdir(path)]
    

    relevance_dict = {doc: 0 for doc in docs}
    for term in query.split():
        occurrence, _ ,_= processing(term, tokenization, normalization, path, output_path="output",methode =file_type)
        print(occurrence)
        # print("Occurrences:", occurrence)
        # exemple de occurrence : [(1, 'effect', 'D1', 1, 0.0795), (2, 'effect', 'D3', 1, 0.1193), (3, 'effect', 'D6', 1, 0.1193)]
        
        for occ in occurrence:
            doc_name = occ[2]  # Nom du document contenant le terme
            poids_terme = occ[4]  # Poids du terme dans ce document
            relevance_dict[doc_name] += poids_terme
            
    return relevance_dict

In [57]:
relevance = calculer_relevance('effect distribution wing slipstream experiment investig aerodynam','Split', 'Porter', 'DPT',path)
relevance

[(1, 'effect', 'D1', 2, 0.1326), (2, 'effect', 'D2', 1, 0.0398), (3, 'effect', 'D3', 1, 0.0995), (4, 'effect', 'D6', 3, 0.2985)]
[]
[(1, 'wing', 'D1', 3, 0.301), (2, 'wing', 'D6', 2, 0.301)]
[(1, 'slipstream', 'D1', 5, 0.7042)]
[(1, 'experiment', 'D1', 2, 0.2817)]
[(1, 'investig', 'D1', 1, 0.1003), (2, 'investig', 'D2', 1, 0.0602)]
[(1, 'aerodynam', 'D1', 1, 0.1003), (2, 'aerodynam', 'D6', 1, 0.1505)]


{'D1': 1.6201,
 'D2': 0.1,
 'D3': 0.0995,
 'D4': 0,
 'D5': 0,
 'D6': 0.7499999999999999}

In [58]:
import os
import math

def calculer_relevance_cosinus(query, tokenization, normalization, file_type, path):
    descripteur, _ = open_descripteur_invers(normalization, tokenization, output_path)
    
    # Calcul de la norme des vecteurs des documents (somme des poids au carré)
    poids = {doc: sum((term_data["poids"])**2 for term_data in terms.values()) for doc, terms in descripteur.items()}
  
    
    # Création de la liste des documents
    docs = [doc.split('.')[0] for doc in os.listdir(path)]
    
    # Initialisation des vecteurs de documents
    doc_vectors = {doc: {} for doc in docs}
    
    # Remplir les vecteurs des documents
    for term in query.split():
        occurrence, _, _ = processing(term, tokenization, normalization, path, output_path="output", methode=file_type)
        for occ in occurrence:
            doc_name = occ[2]  # Nom du document
            poids_terme = occ[4]  # Poids du terme
            doc_vectors[doc_name][term] = poids_terme
    # doc_vectors =>   {'D1': {'effect': 0.0795, 'distribution': 0.0663}, 'D2': {}, 'D3': {'effect': 0.1193, 'distribution': 0.0995}, 'D4': {'distribution': 0.0332}, 'D5': {}, 'D6': {'effect': 0.1193, 'distribution': 0.0995}}
    print(f"doc vectors {doc_vectors}")
    
    # Calcul de la similarité cosinus
    relevance_dict = {}
    for doc_name, doc_vector in doc_vectors.items():
        # doc_name, doc_vector => 'D1': {'effect': 0.0795, 'distribution': 0.0663}       
        dot_product = sum(doc_vector.get(term, 0) for term in query.split())  # Produit scalaire
        norm_query = math.sqrt(len(query.split()))  # Norme de la requête (poids uniformes)
        norm_doc = math.sqrt(poids.get(doc_name, 0))  # Norme du document
        similarity = dot_product / (norm_query * norm_doc) if norm_query > 0 and norm_doc > 0 else 0
        
        relevance_dict[doc_name] = similarity

    return relevance_dict


In [59]:
relevance_dict = calculer_relevance_cosinus('effect distribution','Split', 'None', 'DPT',path)
relevance_dict

doc vectors {'D1': {'effect': 0.0795, 'distribution': 0.0663}, 'D2': {}, 'D3': {'effect': 0.1193, 'distribution': 0.0995}, 'D4': {'distribution': 0.0332}, 'D5': {}, 'D6': {'effect': 0.1193, 'distribution': 0.0995}}


{'D1': 0.06900726067964397,
 'D2': 0.0,
 'D3': 0.12320148746796511,
 'D4': 0.024022928446014405,
 'D5': 0.0,
 'D6': 0.08426610754279582}

In [60]:
def calculer_jaccard_similarity(query, tokenization, normalization, file_type, path):
    
    descripteur, _ = open_descripteur_invers(normalization, tokenization, output_path)
    
    poids = {doc: sum((term_data["poids"])**2 for term_data in terms.values()) for doc, terms in descripteur.items()}
    
    
    docs = [doc.split('.')[0] for doc in os.listdir(path)]
    
    # Initialisation des vecteurs de documents
    doc_vectors = {doc: {} for doc in docs}
    
    # Remplir les vecteurs des documents
    for term in query.split():
        occurrence, _, _ = processing(term, tokenization, normalization, path, output_path="output", methode=file_type)
        for occ in occurrence:
            doc_name = occ[2]  # Nom du document
            poids_terme = occ[4]  # Poids du terme
            doc_vectors[doc_name][term] = poids_terme
            

    # Calcul de la mesure de Jaccard pour chaque document
    relevance_dict = {}
    for doc_name, doc_vector in doc_vectors.items():
        print(f"doc_name, doc_vector : {doc_name, doc_vector}")
        # doc_name, doc_vector => 'D1': {'effect': 0.0795, 'distribution': 0.0663}       
        dot_product = sum(doc_vector.get(term, 0) for term in query.split())  # Produit scalaire
        
        norm_query = len(query.split())  # Norme de la requête (poids uniformes)
        norm_doc = poids.get(doc_name, 0) # Norme du document
        union = norm_doc+norm_query-dot_product
        similarity =  (dot_product)/(union)if union > 0 else 0
        
        relevance_dict[doc_name] = similarity

    return relevance_dict



In [61]:
jaccard_relevance = calculer_jaccard_similarity('effect distribution', 'Split', 'None', 'DPT', path='../Collections')
jaccard_relevance

doc_name, doc_vector : ('D1', {'effect': 0.0795, 'distribution': 0.0663})
doc_name, doc_vector : ('D2', {})
doc_name, doc_vector : ('D3', {'effect': 0.1193, 'distribution': 0.0995})
doc_name, doc_vector : ('D4', {'distribution': 0.0332})
doc_name, doc_vector : ('D5', {})
doc_name, doc_vector : ('D6', {'effect': 0.1193, 'distribution': 0.0995})


{'D1': 0.035681015160415504,
 'D2': 0.0,
 'D3': 0.0651538779902336,
 'D4': 0.011362936292260204,
 'D5': 0.0,
 'D6': 0.0424672942054332}

In [101]:
from pyparsing import infixNotation, opAssoc, Word, alphas, ParseException  # Import des modules nécessaires
# Une bibliothèque pour analyser du texte et vérifier s'il convient à une certaine grammaire (comme ici pour des requêtes logiques).
# infixNotation : Permet de définir une grammaire avec des opérateurs comme AND, OR, NOT.
# opAssoc : Définit l'ordre dans lequel les opérateurs sont évalués.
# Word, alphas : Permet de reconnaître des mots composés uniquement de lettres (comme "chien").

# Définition des mots-clés logiques
AND = "AND"
OR = "OR"
NOT = "NOT"

# Classe représentant un terme simple
class Term:
    def __init__(self, tokens):
        self.term = tokens[0]  # Le terme est le premier élément de la liste des tokens

    def eval(self, doc):
        return self.term in doc  # Le terme doit être présent dans le document


class BooleanOp:
    def __init__(self, tokens):
        self.args = tokens[0][0::2]  # Extrait les arguments
        # Transforme les arguments en instances de `Term` si ce n'est pas déjà fait
        self.args = [arg if isinstance(arg, Term) else Term([arg]) for arg in self.args]

class AndOp(BooleanOp):
    def eval(self, doc):
        return all(arg.eval(doc) for arg in self.args)

class OrOp(BooleanOp):
    def eval(self, doc):
        return any(arg.eval(doc) for arg in self.args)

class NotOp(BooleanOp):
    def eval(self, doc):
        return not self.args[0].eval(doc)

# Définition de la grammaire des termes et des opérateurs
term = Word(alphas).setParseAction(Term)  # Un terme est simplement un mot composé de lettres
    # Word(alphas): Reconnaît des mots composés uniquement de lettres (comme "chien").
    # setParseAction(Term): Chaque mot reconnu est transformé en un objet de la classe Term.
expr = infixNotation(term, [
    (NOT, 1, opAssoc.RIGHT, NotOp),  # Le NOT est évalué de droite à gauche
    (AND, 2, opAssoc.LEFT, AndOp),   # Le AND est évalué de gauche à droite
    (OR, 2, opAssoc.LEFT, OrOp),     # Le OR est évalué de gauche à droite
])

# Fonction pour vérifier la requête booléenne
def verifier_requete_booleenne(query, doc_vectors):
    try:
        # Analyser la requête avec la grammaire définie
        parsed_query = expr.parseString(query, parseAll=True)[0]
    except ParseException as e:
        # Si la requête n'est pas valide, lever une exception
        raise ValueError(f"Requête booléenne invalide : {str(e)}")

    # Dictionnaire pour stocker les résultats de l'évaluation
    pertinence_dict = {}

    # Évaluer la requête pour chaque document
    for doc_name, doc_vector in doc_vectors.items():
        pertinence_dict[doc_name] = parsed_query.eval(doc_vector)

    # Retourner le dictionnaire des résultats
    return pertinence_dict


In [102]:
def model_booleen(query, tokenization, normalization, file_type, path,output_path="output"):
    # query => effect and ditribtion or snow 
    
   
    docs = [doc.split('.')[0] for doc in os.listdir(path)]
    
    # Initialisation des vecteurs de documents
    doc_vectors = {doc: {} for doc in docs}
    
    # Mots-clés logiques
    logical_operators = {"AND", "OR", "NOT"}
    
    # Séparer les termes des opérateurs logiques
    terms = [term for term in query.split() if term not in logical_operators]
    
    # Remplir les vecteurs des documents
    for term in terms:
        occurrence, _, _ = processing(term, tokenization, normalization, path, output_path="output", methode=file_type)
        for occ in occurrence:
            doc_name = occ[2]  # Nom du document
            poids_terme = occ[4]  # Poids du terme
            doc_vectors[doc_name][term] = poids_terme
    print(doc_vectors)
   
    resultats = verifier_requete_booleenne(query, doc_vectors)
    resultats = [(doc , val) for doc , val in resultats.items()]
    return resultats
    
     

In [108]:
res = model_booleen("NOT effect AND NOT distribution", "Split", "None", "DPT", path)

{'D1': {'effect': 0.0795, 'distribution': 0.0663}, 'D2': {}, 'D3': {'effect': 0.1193, 'distribution': 0.0995}, 'D4': {'distribution': 0.0332}, 'D5': {}, 'D6': {'effect': 0.1193, 'distribution': 0.0995}}


In [109]:
print(res)

[('D1', False), ('D2', False), ('D3', False), ('D4', False), ('D5', False), ('D6', False)]


## Interface

In [None]:

class SearchApp(QMainWindow):
    def __init__(self):
        super().__init__()
        self.path = 'Collections'
        self.setWindowTitle("Document Search and Processing")
        self.setGeometry(100, 100, 1170, 800) #8,6 => 900?700 
        self.setWindowIcon(QIcon("./icons/interface_icon.png")) 
        self.setFixedSize(1170, 800) #900:700
        
        
        # Layout principal
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        self.main_layout = QVBoxLayout(central_widget)

        # Barre de recherche
        search_layout = QHBoxLayout()
        query_label = QLabel("Query: ", self)
        search_layout.addWidget(query_label)
        
        self.search_bar = QLineEdit(self)
        self.search_bar.setPlaceholderText("Enter document name...")
        self.search_button = QPushButton("Search", self)
        
        search_layout.addWidget(self.search_bar)
        search_layout.addWidget(self.search_button)
        self.main_layout.addLayout(search_layout)

        # Options de radio
        radio_layout = QHBoxLayout()
        self.raw_text_radio = QRadioButton("Raw Text", self)
        self.processed_text_radio = QRadioButton("Processed Text", self)
        radio_layout.addWidget(self.raw_text_radio)
        radio_layout.addWidget(self.processed_text_radio)
        self.main_layout.addLayout(radio_layout)
        
        # Section Tokenization
        tokenization_box = QGroupBox("Tokenization")
        tokenization_layout = QVBoxLayout()
        self.split_radio = QRadioButton("Split", self)
        self.regex_radio = QRadioButton("Regex", self)
        tokenization_layout.addWidget(self.split_radio)
        tokenization_layout.addWidget(self.regex_radio)
        tokenization_box.setLayout(tokenization_layout)
        
        # Section Normalization
        normalization_box = QGroupBox("Normalization")
        normalization_layout = QVBoxLayout()
        self.no_stem_radio = QRadioButton("No Stem", self)
        self.porter_radio = QRadioButton("Porter", self)
        self.lancaster_radio = QRadioButton("Lancaster", self)
        normalization_layout.addWidget(self.no_stem_radio)
        normalization_layout.addWidget(self.porter_radio)
        normalization_layout.addWidget(self.lancaster_radio)
        normalization_box.setLayout(normalization_layout)
        
        # Section Indexation
        indexation_box = QGroupBox("Indexation")
        indexation_layout = QVBoxLayout()
        self.doc_per_term_radio = QRadioButton("Documents per Term", self)
        self.term_per_doc_radio = QRadioButton("Terms per Document", self)
        indexation_layout.addWidget(self.doc_per_term_radio)
        indexation_layout.addWidget(self.term_per_doc_radio)
        indexation_box.setLayout(indexation_layout)
        
        
        matching_box = QGroupBox("Matching")
        matching_layout = QGridLayout()

        # Boutons radio
        self.vector_space_radio = QRadioButton("Vector Space Model")
        self.probability_model_radio = QRadioButton("Probabilistic Model (BM25)")
        self.boolean_model_radio = QRadioButton("Boolean Model")
        self.data_mining_model_radio = QRadioButton("Data Mining Model")

        # Menu déroulant pour "Vector Space Model"
        self.matching_options = QComboBox()
        self.matching_options.addItems(["Scalar Product", "Cosine Similarity", "Jaccard Index"])

        # Champs de saisie pour k et b sous "Probabilistic Model"
        self.k_input = QLineEdit()
        validator = QDoubleValidator()
        validator.setNotation(QDoubleValidator.StandardNotation)  # Autorise les notations standard (pas scientifiques)
        validator.setRange(-1000.0, 1000.0, 3)  # Plage de valeurs entre -1000 et 1000 avec 3 décimales max
        self.k_input.setValidator(validator)
        self.k_input.setPlaceholderText("K")

        self.b_input = QLineEdit()
        self.b_input.setValidator(validator)  # Utilisez le même validateur pour `b_input`
        self.b_input.setPlaceholderText("B")

        # Ajout des widgets au layout (organisé par colonnes et lignes)
        matching_layout.addWidget(self.vector_space_radio, 0, 0)  # Ligne 0, Colonne 0
        matching_layout.addWidget(self.matching_options, 0, 1)     # Ligne 0, Colonne 1
        matching_layout.addWidget(self.probability_model_radio, 1, 0)  # Ligne 1, Colonne 0
        matching_layout.addWidget(self.k_input, 1, 1)             # Ligne 1, Colonne 1
        matching_layout.addWidget(self.b_input, 1, 2)             # Ligne 1, Colonne 2
        matching_layout.addWidget(self.boolean_model_radio, 2, 0) # Ligne 2, Colonne 0
        matching_layout.addWidget(self.data_mining_model_radio, 3, 0)  # Ligne 3, Colonne 0

        # Appliquer le layout au QGroupBox
        matching_box.setLayout(matching_layout)

       
       
        
        # Disposition des sections
        sections_layout = QHBoxLayout()
        sections_layout.addWidget(tokenization_box)
        sections_layout.addWidget(normalization_box)
        sections_layout.addWidget(indexation_box)
        sections_layout.addWidget(matching_box) 
        self.main_layout.addLayout(sections_layout)
        
        # Zone de résultats (QStackedWidget pour alterner entre texte et tableau)
        self.result_label = QLabel("Result: ", self)
        self.main_layout.addWidget(self.result_label)

        self.result_area = QStackedWidget(self)
        self.result_area.setFixedHeight(400)  # Taille fixe pour éviter d'étendre la mise en page
        self.result_area.setFixedWidth(600)  # Ajustez selon la largeur désirée
        
        
        # Widget pour afficher le texte brut
        self.raw_text_widget = QTextEdit(self)
        self.raw_text_widget.setReadOnly(True)  # Rendre le texte en lecture seule
        self.result_area.addWidget(self.raw_text_widget)
        
        # Widget pour afficher le tableau
        self.table = QTableWidget(0, 5, self)  # 5 colonnes pour N°, N° doc, terme, fréquence, poids
        self.table.setHorizontalHeaderLabels(["N°", "N° doc", "Term", "Frequency", "Weight"])
        self.table.setShowGrid(False)  # Masquer la grille du tableau
        
        # Faire en sorte que les colonnes s'étendent pour couvrir toute la largeur
        header = self.table.horizontalHeader()
        header.setSectionResizeMode(QHeaderView.Stretch)
        self.result_area.addWidget(self.table)
        
        self.main_layout.addWidget(self.result_area)




    #    //////////////////////////////////////
        Total_terms_layout = QHBoxLayout()
        
        # Création et configuration des QLabel
        self.terms_per_doc = QLabel(self)
        self.terms_all_doc = QLabel(self)
        
        # Appliquer les styles pour enlever le fond et les bordures
        style = """
            QLabel {
                margin-left: 20px;
                background-color: transparent;
                border: none;
                font-size: 14px;
                font-family: Arial, sans-serif;
            }
        """
        self.terms_all_doc.setStyleSheet(style)
        self.terms_per_doc.setStyleSheet(style)
        
        
        # Ajout des QLabel au layout horizontal
        Total_terms_layout.addWidget(self.terms_per_doc)
        Total_terms_layout.addWidget(self.terms_all_doc)
        
        # Ajout du layout horizontal dans le layout principal
        self.main_layout.addLayout(Total_terms_layout)
        
        
        
        
        # Ajustements dans le code principal
        self.main_layout.setContentsMargins(2, 2, 2, 2)  # Réduire les marges globales
        self.main_layout.setSpacing(8)  # Diminuer l'espace entre les sections
        self.result_area.setContentsMargins(2, 0, 2, 0)  # Marges gauche et droite de 2px pour le tableau
        self.result_area.setFixedWidth(self.width() - 4) 
                
                
        # events
        self.search_button.clicked.connect(self.process_search)
        self.raw_text_radio.clicked.connect(self.raw_text_radio_process)
        self.processed_text_radio.clicked.connect(self.processed_text_radio_process)
        self.vector_space_radio.toggled.connect(self.toggle_radio_buttons)
        
    def toggle_radio_buttons(self,state):
        self.doc_per_term_radio.setEnabled(not state) 
        self.term_per_doc_radio.setEnabled(not state)
        
    def raw_text_radio_process(self):
        self.vector_space_radio.setEnabled(False)
        self.matching_options.setEnabled(False)
        self.split_radio.setEnabled(False) 
        self.regex_radio.setEnabled(False)
        self.lancaster_radio.setEnabled(False)
        self.porter_radio.setEnabled(False)
        self.doc_per_term_radio.setEnabled(False)
        self.term_per_doc_radio.setEnabled(False)
        self.no_stem_radio.setEnabled(False)
        self.terms_per_doc.setText("")
        self.terms_all_doc.setText("")
       
       
    def processed_text_radio_process(self):
        self.vector_space_radio.setEnabled(True)
        self.matching_options.setEnabled(True)
        self.split_radio.setEnabled(True) 
        self.regex_radio.setEnabled(True)
        self.lancaster_radio.setEnabled(True)
        self.porter_radio.setEnabled(True)
        self.doc_per_term_radio.setEnabled(True)
        self.term_per_doc_radio.setEnabled(True)
        self.no_stem_radio.setEnabled(True)
        
         
    def display_Total_Terms(self, termes_global, nb_termes ,index):
        if nb_termes != 0 and index == 'TPD':
            # Afficher le nombre de termes par document
            self.terms_per_doc.setText(f"Terms per document : {nb_termes}")
        else :
            self.terms_per_doc.setText("")
        self.terms_all_doc.setText(f"Total terms  : {termes_global}")

        
        
    def process_search(self):
        # Obtenir le numéro de document
        document_number = self.search_bar.text()
        
        if not document_number:
            self.show_error("Veuillez entrer un numéro de document valide.")
            return

        # 
        # Vérifier le type de texte sélectionné
        if self.raw_text_radio.isChecked():
            # verification de nom_document
            result = get_text(document_number)
            self.show_raw_text(result)
        
        else:
            # Obtenir les méthodes sélectionnées
            tokenization_method = "Split" if self.split_radio.isChecked() else "Regex"
            if self.porter_radio.isChecked() :
                normalization_method = "Porter" 
            elif self.no_stem_radio.isChecked():
                normalization_method = "None" 
            else :
                normalization_method ="Lancaster"
            indexation_method = "DPT" if self.doc_per_term_radio.isChecked() else "TPD"
            
            if self.vector_space_radio.isChecked():
            # Récupération de la méthode de matching sélectionnée
                matching_method = self.matching_options.currentText()
                # Utilisation
                
                if matching_method == "Scalar Product":
                    print("produit scalaire")
                    relevances = calculer_relevance(document_number, tokenization_method, normalization_method, 'DPT',self.path)
                    print(relevances)

                elif matching_method == "Cosine Similarity":
                    print("produit cosine")
                    relevances = calculer_relevance_cosinus(document_number, tokenization_method, normalization_method, 'DPT',self.path)
                    print(relevances)

                elif matching_method == "Jaccard Index":
                    print("jacard")
                    relevances = calculer_jaccard_similarity(document_number, tokenization_method, normalization_method, 'DPT',self.path)
                    print(relevances)
                filtered_relevances = {doc: score for doc, score in relevances.items() if score > 0}
                sorted_relevances = sorted(filtered_relevances.items(), key=lambda item: item[1], reverse=True)
                print(sorted_relevances)
                # Afficher les relevances triées
                self.display_relevance(sorted_relevances)
            elif self.probability_model_radio.isChecked():
                try:
                    text = self.k_input.text()
                    corrected_text_k = text.replace(",", ".")
                    text = self.b_input.text()
                    corrected_text_b = text.replace(",", ".")
                    k = float(corrected_text_k)  # Convertit en flottant
                    b = float(corrected_text_b)
                except ValueError:
                    QMessageBox.warning(self, "Invalid Input", "Please enter valid numeric values for k and b.")
                    k = 1.5  # Valeur par défaut si erreur
                    b = 0.75  # Valeur par défaut si erreur
                print(k,b)
                relevances = calculer_relevance_BM25(document_number, tokenization_method, normalization_method, self.path ,k ,b,'output')
                # filtered_relevances = {doc: score for doc, score in relevances.items() if score > 0}
                # sorted_relevances = sorted(filtered_relevances.items(), key=lambda item: item[1], reverse=True)
                print("BM25")
                print(relevances)
                relevances = list(relevances.items())
                self.display_relevance(relevances)
            elif self.boolean_model_radio.isChecked():
                try :
                    print(document_number, tokenization_method, normalization_method)
                    relevances = model_booleen(document_number, tokenization_method, normalization_method, "DPT", self.path,output_path="output")
                    self.display_relevance(relevances)
                except Exception as e :
                    print(e)
                    QMessageBox.warning(self, "Invalid Input", "Please enter valid Expression.")
                    
            else :
                # termes_global = nb_termes_glob(tokenization_method, normalization_method)
                # Appeler la fonction pour obtenir les données
                data , nb_termes , termes_global=processing(document_number, tokenization_method,  normalization_method, path='../Collections', output_path="output",methode = indexation_method)
                self.display_results(data)
                self.display_Total_Terms(termes_global , nb_termes , indexation_method)
                
                     

    def reset_table(self, column_headers):
        """
        Réinitialise complètement le tableau avec de nouvelles colonnes et leurs en-têtes.

        Parameters:
            column_headers (list of str): Liste des noms des colonnes.
        """
        self.table.clear()  # Efface tout le contenu (cellules et en-têtes)
        self.table.setRowCount(0)  # Réinitialise le nombre de lignes
        self.table.setColumnCount(len(column_headers))  # Définir le nombre de colonnes
        self.table.setHorizontalHeaderLabels(column_headers)  # Définir les en-têtes de colonnes

        # Ajuster les colonnes pour s'étendre uniformément
        header = self.table.horizontalHeader()
        header.setSectionResizeMode(QHeaderView.Stretch)


    def show_raw_text(self, text):
        self.raw_text_widget.setText(text)
        self.result_area.setCurrentWidget(self.raw_text_widget)  # Afficher le widget de texte brut

        
    def display_results(self, data):
    # Supprimer l'affichage de l'index de ligne
        self.table.verticalHeader().setVisible(False)
        # Réinitialiser le tableau avec les colonnes spécifiques
        column_headers = ["N°", "N° doc", "Term", "Frequency", "Weight"]
        self.reset_table(column_headers)
        
        # Nettoyer le tableau et ajouter les résultats
        self.table.setRowCount(0)
        for index, row_data in enumerate(data):
            row_position = self.table.rowCount()
            self.table.insertRow(row_position)
            for column, value in enumerate(row_data):
                item = QTableWidgetItem(str(value))
                item.setTextAlignment(Qt.AlignCenter)  # Centrer le texte dans chaque cellule
                self.table.setItem(row_position, column, item)
        
        # Afficher le widget de tableau
        self.result_area.setCurrentWidget(self.table)
    


    def display_relevance(self, sorted_relevances):
        """
        Affiche les relevances triées dans le tableau.

        Parameters:
            sorted_relevances (list of tuples): Une liste de tuples (document, score de pertinence).
        """
        # Réinitialiser le tableau avec deux colonnes
        column_headers = ["Document", "Score de Pertinence"]
        self.reset_table(column_headers)

        # Ajouter les données triées dans le tableau
        for doc_id, score in sorted_relevances:
            row_position = self.table.rowCount()
            self.table.insertRow(row_position)

            # Ajouter l'identifiant du document
            doc_item = QTableWidgetItem(doc_id)
            doc_item.setTextAlignment(Qt.AlignCenter)
            self.table.setItem(row_position, 0, doc_item)

            # Ajouter le score de pertinence
            if isinstance(score, bool):
                score_item = QTableWidgetItem("True" if score else "False")
            elif isinstance(score, (int, float)):
                score_item = QTableWidgetItem(f"{score:.4f}")
            else:
                raise ValueError(f"Type non pris en charge : {type(score)}")

            score_item.setTextAlignment(Qt.AlignCenter)
            self.table.setItem(row_position, 1, score_item)

        # Afficher le tableau
        self.result_area.setCurrentWidget(self.table)

        
    
        
        
    def show_error(self, message):
        error_dialog = QMessageBox(self)
        error_dialog.setIcon(QMessageBox.Critical)
        error_dialog.setWindowTitle("Erreur")
        error_dialog.setText(message)
        error_dialog.exec_()


In [67]:
if __name__ == "__main__":
    app = QApplication(sys.argv)
    app.setStyleSheet("""
    QMainWindow {
        background-color: #f5f5f5;
    }
    QLabel {
        color: #333333;
        font-size: 14px;
    }
    QLineEdit {
        border: 1px solid #CCCCCC;
        border-radius: 5px;
        padding: 5px;
    }
    QPushButton {
        background-color: #4CAF50;
        color: white;
        font-size: 14px;
        padding: 5px 10px;
        border-radius: 5px;
    }
    QPushButton:hover {
        background-color: #45a049;
    }
    
    QRadioButton {
        font-size: 13px;
    }
    QGroupBox {
        font-size: 15px;
        color: #333333;
        border: 1px solid #CCCCCC;
        border-radius: 8px;
        margin-top: 10px;
        padding: 10px;
    }
    QTextEdit {
        background-color: #f0f0f0;
        border: 1px solid #CCCCCC;
        border-radius: 5px;
        padding: 5px;
    }
    QTableWidget {
        background-color: #FFFFFF;
        border: 1px solid #CCCCCC;
        border-radius: 5px;
        padding: 2px;
        gridline-color: #E0E0E0;
    }
    QTableWidget::item {
        padding: 5px;
        border-bottom: 1px solid #E0E0E0;
    }
    QHeaderView::section {
        background-color: #f0f0f0;
        padding: 5px;
        border: 1px solid #CCCCCC;
        font-weight: bold;
    }

    QLabel {
        margin-left: 20px;
        background-color: transparent;
        border: none;
        font-size: 14px;
        font-family: Arial, sans-serif;
    }
    
""")

    window = SearchApp()
    window.show()
    sys.exit(app.exec_())


produit scalaire
[(1, 'effect', 'D1', 1, 0.0795), (2, 'effect', 'D3', 1, 0.1193), (3, 'effect', 'D6', 1, 0.1193)]
[(1, 'distribution', 'D1', 1, 0.0663), (2, 'distribution', 'D3', 1, 0.0995), (3, 'distribution', 'D4', 1, 0.0332), (4, 'distribution', 'D6', 1, 0.0995)]
{'D1': 0.14579999999999999, 'D2': 0, 'D3': 0.2188, 'D4': 0.0332, 'D5': 0, 'D6': 0.2188}
[('D3', 0.2188), ('D6', 0.2188), ('D1', 0.14579999999999999), ('D4', 0.0332)]
produit cosine
doc vectors {'D1': {'effect': 0.0795, 'distribution': 0.0663}, 'D2': {}, 'D3': {'effect': 0.1193, 'distribution': 0.0995}, 'D4': {'distribution': 0.0332}, 'D5': {}, 'D6': {'effect': 0.1193, 'distribution': 0.0995}}
{'D1': 0.06900726067964397, 'D2': 0.0, 'D3': 0.12320148746796511, 'D4': 0.024022928446014405, 'D5': 0.0, 'D6': 0.08426610754279582}
[('D3', 0.12320148746796511), ('D6', 0.08426610754279582), ('D1', 0.06900726067964397), ('D4', 0.024022928446014405)]
jacard
doc_name, doc_vector : ('D1', {'effect': 0.0795, 'distribution': 0.0663})
doc_na

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
data = {'D4': -0.08823075947891026, 'D2': -0.09795556690944766, 'D1': -0.2728925083902616, 'D3': -0.27339867077595187, 'D6': -0.3028044304368359}
sorted_relevances = sorted(data.items(), key=lambda item: item[1], reverse=True)
for item in data.items():
    print(item)
print(sorted_relevances)
relevances = list(data.items())
print(relevances)

('D4', -0.08823075947891026)
('D2', -0.09795556690944766)
('D1', -0.2728925083902616)
('D3', -0.27339867077595187)
('D6', -0.3028044304368359)
[('D4', -0.08823075947891026), ('D2', -0.09795556690944766), ('D1', -0.2728925083902616), ('D3', -0.27339867077595187), ('D6', -0.3028044304368359)]
[('D4', -0.08823075947891026), ('D2', -0.09795556690944766), ('D1', -0.2728925083902616), ('D3', -0.27339867077595187), ('D6', -0.3028044304368359)]
