In [34]:
import nltk
import os
import math
from collections import defaultdict
from nltk import FreqDist
import numpy as np
import json
from nltk.probability import FreqDist

# English stop-word set; preprocessing() drops every token whose lower-cased form is in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally — confirm.
STOPWORDS = set(nltk.corpus.stopwords.words('english'))
# Shared stemmer instances, selected through the `normalization` argument ("Porter" / "Lancaster").
PORTER_STEMMER = nltk.PorterStemmer()
LANCASTER_STEMMER = nltk.LancasterStemmer()

In [35]:
# Directory holding the document collection; every entry returned by os.listdir(path)
# is treated as one document by the functions below.
path = '../Collections'


In [36]:
def preprocessing(doc_path, tokenization, normalization):
    """Read one document and return its list of processed tokens.

    Steps: tokenize, drop English stop words, then optionally stem.

    Args:
        doc_path: Path of the text file to read.
        tokenization: "Split" selects whitespace splitting; any other value
            selects the regular-expression tokenizer.
        normalization: "Porter" or "Lancaster" selects the matching stemmer;
            any other value leaves the tokens unstemmed.

    Returns:
        The list of remaining tokens, in document order (duplicates kept).
    """
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used, whereas the JSON index files are written as UTF-8 — confirm the
    # encoding of the collection.
    with open(doc_path, 'r') as handle:
        raw_text = handle.read()

    if tokenization == "Split":
        tokens = raw_text.split()
    else:
        # NOTE(review): alternatives are tried left to right, so the
        # abbreviation pattern and the trailing bare word pattern look
        # unreachable (the hyphenated-word pattern already matches at those
        # positions) — confirm the intended order before changing it, since
        # cached index files depend on the current tokens.
        tokenizer = nltk.RegexpTokenizer(r'\d+(?:\.\d+)?x\d+|\d+(?:\.\d+)|\w+(?:-\w+)*|(?:[A-Z]\.)+|\w+')
        tokens = tokenizer.tokenize(raw_text)

    # Stop-word test is case-insensitive, but the kept token keeps its case.
    tokens = list(filter(lambda token: token.lower() not in STOPWORDS, tokens))

    # Pick the stemmer (if any) once, then apply it to every token.
    stemmer = None
    if normalization == "Porter":
        stemmer = PORTER_STEMMER
    elif normalization == "Lancaster":
        stemmer = LANCASTER_STEMMER
    if stemmer is not None:
        tokens = [stemmer.stem(token) for token in tokens]

    print("------Tokens:------------------",tokens)
    return tokens

In [37]:
def build_global_term_frequencies(tokenization, normalization):
    """Count, for every term, the number of documents that contain it.

    Every entry of the module-level ``path`` directory is preprocessed with
    the given tokenization / normalization options.

    Returns:
        A ``defaultdict(int)`` mapping term -> document count.
    """
    doc_counts = defaultdict(int)

    for entry in os.listdir(path):
        doc_terms = preprocessing(os.path.join(path, entry), tokenization, normalization)
        # A set so that a term counts once per document, however often it occurs.
        for term in set(doc_terms):
            doc_counts[term] += 1

    print("------Global term frequencies:------------------",doc_counts)

    return doc_counts

In [38]:
def TPD_result(query, terms_freq, global_term_frequencies, N):
    """Compute the weight of every term of one document.

    The weight of a term is ``(freq / max_freq) * log10(N / df + 1)`` where
    ``freq`` is the term count in the document, ``max_freq`` the largest
    count in the document and ``df`` the value stored for the term in
    ``global_term_frequencies``.

    Args:
        query: Identifier copied as-is into the third field of each row.
        terms_freq: Mapping term -> count for the document.
        global_term_frequencies: Mapping term -> number of documents
            containing the term; must hold a non-zero value for every term
            of ``terms_freq``.
        N: Total number of documents.

    Returns:
        A list of ``(rank, term, query, freq, weight)`` tuples, ``rank``
        starting at 1 and ``weight`` rounded to 4 decimals. An empty
        ``terms_freq`` yields an empty list.
    """
    # An empty document has no maximum frequency; max() would raise ValueError.
    if not terms_freq:
        return []

    max_freq = max(terms_freq.values())
    results = []
    for idx, (term, freq) in enumerate(terms_freq.items(), start=1):
        poids = (freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1)
        results.append((idx, term, query, freq, round(poids, 4)))

    return results

In [39]:
def nb_termes_glob(tokenization, normalization):
    """Return the number of distinct terms across the whole collection.

    Every entry of the module-level ``path`` directory is preprocessed and
    all tokens are pooled before duplicates are removed.
    """
    # Pool the tokens of every document into one flat list.
    all_tokens = [
        token
        for entry in os.listdir(path)
        for token in preprocessing(os.path.join(path, entry), tokenization, normalization)
    ]

    distinct_terms = np.unique(all_tokens)
    print("Termes uniques : ", distinct_terms)  # Optional: shows the distinct terms

    return len(distinct_terms)
            

In [40]:
def text_processing(query,tokenization, normalization, file_type):
    """Compute term weights either for one document or for one term.

    Args:
        query: With ``file_type == "TPD"``, the document name without the
            ``.txt`` extension; otherwise a single term to look up in every
            document.
        tokenization: Passed through to ``preprocessing``.
        normalization: Passed through to ``preprocessing`` (and applied to
            the query term in the per-term mode).
        file_type: "TPD" lists the terms of one document; any other value
            lists the documents that contain the query term.

    Returns:
        ``(rows, nb_terms)`` where each row is
        ``(rank, term, document, freq, weight)``. ``nb_terms`` is the number
        of distinct terms of the document in "TPD" mode and 0 otherwise.
    """
    global_term_frequencies = build_global_term_frequencies(tokenization, normalization)
    doc_names = os.listdir(path)
    N = len(doc_names)

    if file_type == "TPD":
        doc_path = os.path.join(path, f"{query}.txt")
        tokens = preprocessing(doc_path, tokenization, normalization)
        # An empty document has no terms; max() over its frequencies would raise.
        if not tokens:
            return [], 0
        terms_freq = FreqDist(tokens)
        # One FreqDist key per distinct token, so its length is the unique-term count.
        return TPD_result(query, terms_freq, global_term_frequencies, N), len(terms_freq)

    query = process_input(query, normalization)
    results = []
    for doc_name in doc_names:
        doc_path = os.path.join(path, doc_name)
        terms_freq = FreqDist(preprocessing(doc_path, tokenization, normalization))

        # Direct lookup instead of scanning every term; this also skips empty
        # documents before max() is evaluated.
        if query not in terms_freq:
            continue
        freq = terms_freq[query]
        max_freq = max(terms_freq.values())
        poids = (freq / max_freq) * math.log10((N / global_term_frequencies[query]) + 1)
        results.append((len(results) + 1, query, os.path.splitext(doc_name)[0], freq, round(poids, 4)))

    # NOTE(review): the term count is always 0 in this mode, as in the
    # original code — confirm whether a real count is expected by callers.
    return results, 0
   
                    

In [41]:
def create_descriptor_and_inverse_files_with_weights(path, tokenization, normalization, output_path="output"):
    """Build (or load) the descriptor and inverted-index JSON files.

    File names are derived from the tokenization / normalization pair, so
    each combination has its own files. When both files already exist in
    ``output_path`` they are loaded and returned without touching the
    collection.

    Args:
        path: Directory whose entries are the documents to index.
        tokenization: Passed through to ``preprocessing``.
        normalization: Passed through to ``preprocessing``.
        output_path: Directory where the JSON files are read / written.

    Returns:
        ``(descripteur, inverse_index)`` as plain dicts:
        ``descripteur[doc][term]`` and ``inverse_index[term][doc]`` both hold
        ``{"freq": count, "poids": weight}`` with the weight rounded to
        4 decimals.
    """
    descriptor_path = os.path.join(output_path, f"descripteur_{tokenization}_{normalization}.json")
    inverse_index_path = os.path.join(output_path, f"inverse_index_{tokenization}_{normalization}.json")

    # Reuse the files when both are already present.
    if os.path.exists(descriptor_path) and os.path.exists(inverse_index_path):
        print(f"Les fichiers descripteur et inverse existent déjà : {descriptor_path} et {inverse_index_path}")

        with open(descriptor_path, "r", encoding="utf-8") as desc_file:
            descripteur = json.load(desc_file)
        with open(inverse_index_path, "r", encoding="utf-8") as inverse_file:
            inverse_index = json.load(inverse_file)

        return descripteur, inverse_index

    print("Création des fichiers descripteur et inverse...")

    documents = os.listdir(path)
    N = len(documents)

    # Single pass over the collection: each document is preprocessed once and
    # its term counts are kept for the weighting pass below.
    per_document = []
    global_term_frequencies = defaultdict(int)
    for doc_name in documents:
        tokens = preprocessing(os.path.join(path, doc_name), tokenization, normalization)
        terms_freq = FreqDist(tokens)
        per_document.append((os.path.splitext(doc_name)[0], terms_freq))
        # One key per distinct term, so each term counts once per document.
        for term in terms_freq:
            global_term_frequencies[term] += 1

    # Plain dicts so the freshly built result behaves like the one loaded
    # from JSON (missing keys raise instead of being created silently).
    descripteur = {}
    inverse_index = {}
    for doc_key, terms_freq in per_document:
        descripteur[doc_key] = {}
        # An empty document has no maximum frequency; max() would raise ValueError.
        if not terms_freq:
            continue
        max_freq = max(terms_freq.values())
        for term, freq in terms_freq.items():
            poids = round((freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1), 4)
            descripteur[doc_key][term] = {"freq": freq, "poids": poids}
            inverse_index.setdefault(term, {})[doc_key] = {"freq": freq, "poids": poids}

    os.makedirs(output_path, exist_ok=True)

    with open(descriptor_path, "w", encoding="utf-8") as desc_file:
        json.dump(descripteur, desc_file, indent=4, ensure_ascii=False)

    with open(inverse_index_path, "w", encoding="utf-8") as inverse_file:
        json.dump(inverse_index, inverse_file, indent=4, ensure_ascii=False)

    return descripteur, inverse_index



In [42]:
def process_input(query, normalization):
    """Apply the selected stemmer to a query term.

    "Porter" and "Lancaster" select the matching stemmer; any other value
    returns the term unchanged.
    """
    if normalization == "Porter":
        return PORTER_STEMMER.stem(query)
    if normalization == "Lancaster":
        return LANCASTER_STEMMER.stem(query)
    return query

In [55]:
def modele_booleen1(query , normalization , tokenization ,output_path):
    """Prepare a boolean query and print which documents contain each term.

    The descriptor file matching the tokenization / normalization pair is
    created from the module-level ``path`` collection when it (or the
    inverted-index file) is missing, then loaded.

    Args:
        query: Whitespace-separated terms and the operators ``AND``, ``OR``,
            ``NOT`` (upper case).
        normalization: Stemming option applied to every non-operator term.
        tokenization: Used to select the index files.
        output_path: Directory holding the index files.

    Returns:
        ``(q, cleaned_query)``: the lower-cased rebuilt query string (with a
        leading space, as before) and the list of processed tokens with the
        operators left untouched.
    """
    descriptor_path = os.path.join(output_path, f"descripteur_{tokenization}_{normalization}.json")
    inverse_index_path = os.path.join(output_path, f"inverse_index_{tokenization}_{normalization}.json")

    # Build the index files when either one is missing.
    if not os.path.exists(descriptor_path) or not os.path.exists(inverse_index_path):
        print(f"Les fichiers pour {tokenization} et {normalization} n'existent pas. Création en cours...")
        create_descriptor_and_inverse_files_with_weights(path, tokenization, normalization, output_path)

    with open(descriptor_path, "r", encoding="utf-8") as desc_file:
        descripteur = json.load(desc_file)

    # Normalize the terms, leaving the boolean operators as they are.
    operators = {'NOT', 'OR', 'AND'}
    cleaned_query = []
    query_termes = []
    for terme in query.split():
        if terme in operators:
            cleaned_query.append(terme)
        else:
            processed = process_input(terme, normalization)
            cleaned_query.append(processed)
            query_termes.append(processed)

    # Each token is prefixed by a space, so the string keeps the leading
    # space that earlier versions of this function returned.
    q = ''.join(' ' + terme for terme in cleaned_query).lower()

    # The descriptor is keyed by document, then by term: a term is present
    # in a document when it is a key of that document's entry.
    relevance_dict = {
        doc_name: {terme: terme in data for terme in query_termes}
        for doc_name, data in descripteur.items()
    }

    print (relevance_dict)

    return q, cleaned_query

In [78]:
def modele_booleen(query, normalization, tokenization, output_path, path):
    """Prepare a boolean query and trace which terms occur in the index.

    The index files matching the tokenization / normalization pair are
    created from ``path`` when either one is missing, then loaded. Operators
    ``AND``, ``OR``, ``NOT`` (upper case) are kept as they are; every other
    token goes through ``process_input``.

    Returns:
        ``(q, cleaned_query)``: the lower-cased, space-joined query and the
        list of processed tokens.
    """
    descriptor_filename = f"descripteur_{tokenization}_{normalization}.json"
    inverse_index_filename = f"inverse_index_{tokenization}_{normalization}.json"
    descriptor_path = os.path.join(output_path, descriptor_filename)
    inverse_index_path = os.path.join(output_path, inverse_index_filename)

    # Both files must be present, otherwise they are (re)built.
    index_files_ready = os.path.exists(descriptor_path) and os.path.exists(inverse_index_path)
    if not index_files_ready:
        print(f"Les fichiers pour {tokenization} et {normalization} n'existent pas. Création en cours...")
        create_descriptor_and_inverse_files_with_weights(path, tokenization, normalization, output_path)

    with open(descriptor_path, "r", encoding="utf-8") as desc_file:
        descripteur = json.load(desc_file)
    # The inverted index is loaded but not used yet.
    with open(inverse_index_path, "r", encoding="utf-8") as inverse_file:
        inverse_index = json.load(inverse_file)

    # Split the query into processed tokens, tracking the non-operator terms.
    operators = ('NOT', 'OR', 'AND')
    cleaned_query = []
    query_termes = []
    for token in query.split():
        if token in operators:
            cleaned_query.append(token)
            continue
        processed = process_input(token, normalization)
        cleaned_query.append(processed)
        query_termes.append(processed)

    q = ' '.join(cleaned_query).lower()
    print(q)

    # One empty entry per document.
    # NOTE: the entries are not filled in yet; the loop below only traces
    # the documents whose descriptor contains each term.
    relevance_dict = {}
    for doc_name in descripteur:
        relevance_dict[doc_name] = {}

    for terme in query_termes:
        print("---------------")
        for doc_terms in descripteur.values():
            if terme in doc_terms:
                print(terme)

    print(relevance_dict)

    return q, cleaned_query


In [79]:
# NOTE(review): preprocessing() compares the tokenization option against
# "Split" (capital S), so "split" selects the regular-expression tokenizer
# if the index files have to be built — confirm this is intended.
q, cleaned_query = modele_booleen(
    query="effet AND distribution",
    normalization="None",
    tokenization="split",
    output_path="output",
    path=path,
)


effet and distribution
---------------
---------------
distribution
distribution
distribution
distribution
{'D1': {}, 'D2': {}, 'D3': {}, 'D4': {}, 'D5': {}, 'D6': {}}


In [None]:
# Example boolean variables (they must be defined before the expression is evaluated)
effet = True  # or False
distribution = False  # or True
autre_condition = True  # or False

# Boolean query to evaluate
q = 'effet AND distribution'

# Rewrite the logical operators as Python keywords, in the order AND, OR, NOT
q_python = q
for _operator in ('AND', 'OR', 'NOT'):
    q_python = q_python.replace(_operator, _operator.lower())

# Dynamic evaluation: the terms of the query are resolved as variable names.
# NOTE(review): eval() runs arbitrary code — only acceptable here because the
# expression is a hard-coded literal; do not feed it user input.
try:
    _condition_met = eval(q_python)
except NameError as e:
    print(f"Erreur : une des variables utilisées n'est pas définie. {e}")
else:
    if _condition_met:
        print("Condition remplie")
    else:
        print("Condition non remplie")


Condition non remplie


In [None]:
# Tokens of the condition
elements = ['effet', 'AND', 'distribution']

# Example truth values for the terms
variables = {'effet': True, 'distribution': False}

# Rebuild the condition with the truth values substituted for the terms
expression = " ".join(
    (str(variables['effet']), elements[1].lower(), str(variables['distribution']))
)

# Evaluate the rebuilt expression
result = eval(expression)

# Report the outcome
if result:
    print("Condition remplie")
else:
    print("Condition non remplie")
