## import BIB

In [1]:
from PyQt5.QtWidgets import (
    QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
    QLineEdit, QPushButton, QRadioButton, QLabel, QGroupBox,
    QTableWidget, QTableWidgetItem, QScrollArea, QTextEdit, QStackedWidget, QGridLayout,
    QMessageBox,
)
from PyQt5.QtGui import QIcon
from PyQt5.QtCore import Qt
import sys

In [2]:
import nltk
import os
import math
from collections import defaultdict
from nltk import FreqDist

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
PORTER_STEMMER = nltk.PorterStemmer()
LANCASTER_STEMMER = nltk.LancasterStemmer()

## Fonctions

In [3]:
def get_processing_args():
    tokenization = "Split"
    normalization = "None",
    file_type = "TPD"
    return tokenization, normalization, file_type

In [4]:
def preprocessing(doc_path, tokenization, normalization):
    with open(doc_path, 'r') as file:
        text = file.read()
        
    # Tokenization
    if tokenization == "Split":
        tokens = text.split()
    else:
        exp_reg = nltk.RegexpTokenizer(r'\d+(?:\.\d+)?x\d+|\d+(?:\.\d+)|\w+(?:-\w+)*|(?:[A-Z]\.)+|\w+')
        tokens = exp_reg.tokenize(text)

    # Remove stopwords
    tokens = [term for term in tokens if term.lower() not in STOPWORDS]

    # Normalization
    if normalization == "Porter":
        tokens = [PORTER_STEMMER.stem(term) for term in tokens]
    elif normalization == "Lancaster":
        tokens = [LANCASTER_STEMMER.stem(term) for term in tokens]

    return tokens

In [5]:
def build_global_term_frequencies(tokenization, normalization):
    global_term_frequencies = defaultdict(int)

    for doc_name in os.listdir('Collections'):
        doc_path = os.path.join('Collections', doc_name)
        tokens = preprocessing(doc_path, tokenization, normalization)
        unique_terms = set(tokens)

        for term in unique_terms:
            global_term_frequencies[term] += 1
            
    return global_term_frequencies

In [6]:
def TPD_result(query, terms_freq, global_term_frequencies, N):
    max_freq = max(terms_freq.values())
    results=[]
    for idx, (term, freq) in enumerate(terms_freq.items(), start=1):
        poids = (freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1)
        results.append((idx, term, query, freq, round(poids, 4)))
        
    return results

In [7]:
def text_processing(query,tokenization, normalization, file_type):
    # tokenization, normalization, file_type = get_processing_args()
   
    global_term_frequencies = build_global_term_frequencies(tokenization, normalization)  # Calculate global term frequencies
    N = len(os.listdir('Collections'))
    results =[]
    if file_type == "TPD":
        doc_path = os.path.join('Collections', f"{query}.txt")
        tokens = preprocessing(doc_path, tokenization, normalization)
        terms_freq = FreqDist(tokens)
        
        result = TPD_result(query, terms_freq, global_term_frequencies, N)
        return result
        
    else :
        i=0
        for doc_name in os.listdir('Collections'):
            doc_path = os.path.join('Collections', doc_name)
            Tokens = preprocessing(doc_path, tokenization, normalization)
            terms_freq = FreqDist(Tokens)

            max_freq = max(terms_freq.values())
            for term, freq in terms_freq.items():  
                if term == query:  # Check if the term is the specific query term
                    poids = ((freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1))
                    i+=1
                    results.append((i, term, os.path.splitext(doc_name)[0], freq, round(poids, 4)))
        
        return results
   
                    

In [8]:
def get_text(query):
    # if raw:
    doc_path = os.path.join('Collections', f"{query}.txt")
    with open(doc_path, 'r') as file:
        text = file.read()
    return text
    # elif processed:
    #     text_processing(query)

## Interface

In [9]:

class SearchApp(QMainWindow):
    def __init__(self):
        super().__init__()

        self.setWindowTitle("Document Search and Processing")
        self.setGeometry(100, 100, 900, 700) #8,6
        self.setWindowIcon(QIcon("./icons/interface_icon.png")) 
        self.setFixedSize(900, 700)
        
        
        # Layout principal
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        self.main_layout = QVBoxLayout(central_widget)

        # Barre de recherche
        search_layout = QHBoxLayout()
        query_label = QLabel("Query: ", self)
        search_layout.addWidget(query_label)
        
        self.search_bar = QLineEdit(self)
        self.search_bar.setPlaceholderText("Enter document name...")
        self.search_button = QPushButton("Search", self)
        
        search_layout.addWidget(self.search_bar)
        search_layout.addWidget(self.search_button)
        self.main_layout.addLayout(search_layout)

        # Options de radio
        radio_layout = QHBoxLayout()
        self.raw_text_radio = QRadioButton("Raw Text", self)
        self.processed_text_radio = QRadioButton("Processed Text", self)
        radio_layout.addWidget(self.raw_text_radio)
        radio_layout.addWidget(self.processed_text_radio)
        self.main_layout.addLayout(radio_layout)
        
        # Section Tokenization
        tokenization_box = QGroupBox("Tokenization")
        tokenization_layout = QVBoxLayout()
        self.split_radio = QRadioButton("Split", self)
        self.regex_radio = QRadioButton("Regex", self)
        tokenization_layout.addWidget(self.split_radio)
        tokenization_layout.addWidget(self.regex_radio)
        tokenization_box.setLayout(tokenization_layout)
        
        # Section Normalization
        normalization_box = QGroupBox("Normalization")
        normalization_layout = QVBoxLayout()
        self.no_stem_radio = QRadioButton("No Stem", self)
        self.porter_radio = QRadioButton("Porter", self)
        self.lancaster_radio = QRadioButton("Lancaster", self)
        normalization_layout.addWidget(self.no_stem_radio)
        normalization_layout.addWidget(self.porter_radio)
        normalization_layout.addWidget(self.lancaster_radio)
        normalization_box.setLayout(normalization_layout)
        
        # Section Indexation
        indexation_box = QGroupBox("Indexation")
        indexation_layout = QVBoxLayout()
        self.doc_per_term_radio = QRadioButton("Documents per Term", self)
        self.term_per_doc_radio = QRadioButton("Terms per Document", self)
        indexation_layout.addWidget(self.doc_per_term_radio)
        indexation_layout.addWidget(self.term_per_doc_radio)
        indexation_box.setLayout(indexation_layout)
        
        # Disposition des sections
        sections_layout = QHBoxLayout()
        sections_layout.addWidget(tokenization_box)
        sections_layout.addWidget(normalization_box)
        sections_layout.addWidget(indexation_box)
        self.main_layout.addLayout(sections_layout)
        
        # Zone de résultats (QStackedWidget pour alterner entre texte et tableau)
        self.result_label = QLabel("Result: ", self)
        self.main_layout.addWidget(self.result_label)

        self.result_area = QStackedWidget(self)
        self.result_area.setFixedHeight(500)  # Taille fixe pour éviter d'étendre la mise en page
        self.result_area.setFixedWidth(600)  # Ajustez selon la largeur désirée
        
        # Widget pour afficher le texte brut
        self.raw_text_widget = QTextEdit(self)
        self.raw_text_widget.setReadOnly(True)  # Rendre le texte en lecture seule
        self.result_area.addWidget(self.raw_text_widget)
        
        # Widget pour afficher le tableau
        self.table = QTableWidget(0, 5, self)  # 5 colonnes pour N°, N° doc, terme, fréquence, poids
        self.table.setHorizontalHeaderLabels(["N°", "N° doc", "Term", "Frequency", "Weight"])
        self.table.setShowGrid(False)  # Masquer la grille du tableau
        self.result_area.addWidget(self.table)
        
        self.main_layout.addWidget(self.result_area)


        # Ajustements dans le code principal
        self.main_layout.setContentsMargins(2, 2, 2, 2)  # Réduire les marges globales
        self.main_layout.setSpacing(8)  # Diminuer l'espace entre les sections
        self.result_area.setContentsMargins(2, 0, 2, 0)  # Marges gauche et droite de 2px pour le tableau
        self.result_area.setFixedWidth(self.width() - 4) 
                
        self.search_button.clicked.connect(self.process_search)
        self.raw_text_radio.clicked.connect(self.raw_text_radio_process)
        self.processed_text_radio.clicked.connect(self.processed_text_radio_process)
        

    def raw_text_radio_process(self):
        self.split_radio.setEnabled(False) 
        self.regex_radio.setEnabled(False)
        self.lancaster_radio.setEnabled(False)
        self.porter_radio.setEnabled(False)
        self.doc_per_term_radio.setEnabled(False)
        self.term_per_doc_radio.setEnabled(False)
       
       
    def processed_text_radio_process(self):
        self.split_radio.setEnabled(True) 
        self.regex_radio.setEnabled(True)
        self.lancaster_radio.setEnabled(True)
        self.porter_radio.setEnabled(True)
        self.doc_per_term_radio.setEnabled(True)
        self.term_per_doc_radio.setEnabled(True)
         
    def process_search(self):
        # Obtenir le numéro de document
        document_number = self.search_bar.text()
        
        if not document_number:
            self.show_error("Veuillez entrer un numéro de document valide.")
            return

        # 
        # Vérifier le type de texte sélectionné
        if self.raw_text_radio.isChecked():
            # verification de nom_document
            result = get_text(document_number)
            self.show_raw_text(result)
        else:
            # Obtenir les méthodes sélectionnées
            tokenization_method = "Split" if self.split_radio.isChecked() else "Regex"
            if self.porter_radio.isChecked() :
                normalization_method = "Porter" 
            elif self.no_stem_radio.isChecked():
                normalization_method = "None" 
            else :
                normalization_method ="Lancaster"
            indexation_method = "DPT" if self.doc_per_term_radio.isChecked() else "TPD"
            
            

            # Appeler la fonction pour obtenir les données
            data = text_processing(document_number, tokenization_method, normalization_method, indexation_method)
            print(data)
            self.display_results(data)

 

    # def processed(self, document_number, tokenization_method, normalization_method, indexation_method):
    #     # Exemple de données à retourner en fonction de la logique
    #     if indexation_method == "Documents per Term":
    #         return [
    #             (1, "thought", 5, 12, 0.92),
    #             (2, "run", 5, 10, 0.5)
    #         ]
    #     else:  # "Terms per Document"
    #         return [
    #             (1, 5, "run", 12, 0.92),
    #             (2, 5, "move", 5, 0.2)
    #         ]

    def show_raw_text(self, text):
        self.raw_text_widget.setText(text)
        self.result_area.setCurrentWidget(self.raw_text_widget)  # Afficher le widget de texte brut

    def display_results(self, data):
        
        # Nettoyer le tableau et ajouter les résultats
        self.table.setRowCount(0)
        for index, data in enumerate(data):
            row_position = self.table.rowCount()
            self.table.insertRow(row_position)
            for column, value in enumerate(data):
                self.table.setItem(row_position, column, QTableWidgetItem(str(value)))
        
        self.result_area.setCurrentWidget(self.table)  # Afficher le widget de tableau
    def show_error(self, message):
        error_dialog = QMessageBox(self)
        error_dialog.setIcon(QMessageBox.Critical)
        error_dialog.setWindowTitle("Erreur")
        error_dialog.setText(message)
        error_dialog.exec_()



In [10]:
if __name__ == "__main__":
    app = QApplication(sys.argv)
    app.setStyleSheet("""
    QMainWindow {
        background-color: #f5f5f5;
    }
    QLabel {
        color: #333333;
        font-size: 14px;
    }
    QLineEdit {
        border: 1px solid #CCCCCC;
        border-radius: 5px;
        padding: 5px;
    }
    QPushButton {
        background-color: #4CAF50;
        color: white;
        font-size: 14px;
        padding: 5px 10px;
        border-radius: 5px;
    }
    QPushButton:hover {
        background-color: #45a049;
    }
    
    QRadioButton {
        font-size: 13px;
    }
    QGroupBox {
        font-size: 15px;
        color: #333333;
        border: 1px solid #CCCCCC;
        border-radius: 8px;
        margin-top: 10px;
        padding: 10px;
    }
    QTextEdit {
        background-color: #f0f0f0;
        border: 1px solid #CCCCCC;
        border-radius: 5px;
        padding: 5px;
    }
    QTableWidget {
        background-color: #FFFFFF;
        border: 1px solid #CCCCCC;
        border-radius: 5px;
        padding: 2px;
        gridline-color: #E0E0E0;
    }
    QTableWidget::item {
        padding: 5px;
        border-bottom: 1px solid #E0E0E0;
    }
    QHeaderView::section {
        background-color: #f0f0f0;
        padding: 5px;
        border: 1px solid #CCCCCC;
        font-weight: bold;
    }
""")

    window = SearchApp()
    window.show()
    sys.exit(app.exec_())


[]
[(1, 'secondari', 'D5', 2, 0.169), (2, 'flow', 'D5', 5, 0.1712), (3, 'field', 'D5', 1, 0.0602), (4, 'embed', 'D5', 3, 0.2535), (5, 'hyperson', 'D5', 3, 0.1806), (6, 'shock', 'D5', 5, 0.301), (7, 'layer', 'D5', 3, 0.1194), (8, 'ramp', 'D5', 2, 0.169), (9, 'compress', 'D5', 2, 0.169), (10, 'surfac', 'D5', 3, 0.1806), (11, 'locat', 'D5', 2, 0.169), (12, 'local', 'D5', 2, 0.169), (13, 'superson', 'D5', 1, 0.0845), (14, 'region', 'D5', 1, 0.0602), (15, 'behind', 'D5', 1, 0.0845), (16, 'bow', 'D5', 2, 0.169), (17, 'wave', 'D5', 4, 0.2408), (18, 'gener', 'D5', 1, 0.0602), (19, 'disturb', 'D5', 1, 0.0845), (20, 'may', 'D5', 1, 0.0845), (21, 'view', 'D5', 1, 0.0845), (22, 'newtonian', 'D5', 7, 0.5916), (23, 'impact', 'D5', 1, 0.0845), (24, 'thin', 'D5', 1, 0.0845), (25, 'examin', 'D5', 1, 0.0845), (26, 'applic', 'D5', 1, 0.0845), (27, 'theori', 'D5', 4, 0.2408), (28, 'cone', 'D5', 2, 0.169), (29, 'wedg', 'D5', 1, 0.0845), (30, 'uniform', 'D5', 1, 0.0602), (31, 'stream', 'D5', 2, 0.0954), (32

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
