In [246]:
import nltk
import os
import math
from collections import defaultdict
from nltk import FreqDist

# English stopword list from NLTK, held as a set; preprocessing() tests the
# lowercase form of every token against it.
STOPWORDS = set(nltk.corpus.stopwords.words('english'))
# Stemmer instances are created once here and reused by preprocessing()
# for every token instead of being rebuilt per document.
PORTER_STEMMER = nltk.PorterStemmer()
LANCASTER_STEMMER = nltk.LancasterStemmer()


In [268]:
def get_processing_args():
    """Return the hard-coded processing options used by text_processing().

    Returns:
        tuple[str, str, str]: ``(tokenization, normalization, file_type)``.
        ``tokenization`` is compared against "Split" and ``normalization``
        against "Porter" / "Lancaster" in preprocessing(); ``file_type`` is
        compared against "TPD" in text_processing().
    """
    tokenization = "Split"
    # Plain string on purpose: a trailing comma after the literal would make
    # this the 1-tuple ("None",), which never equals any normalization name.
    normalization = "None"
    file_type = "TPD"
    return tokenization, normalization, file_type

In [269]:
def preprocessing(doc_path, tokenization, normalization):
    """Read a document and return its tokens after stopword removal and stemming.

    Args:
        doc_path: Path of the text file to read.
        tokenization: "Split" to tokenize on whitespace with ``str.split``;
            any other value selects the regular-expression tokenizer.
        normalization: "Porter" or "Lancaster" to stem every remaining token
            with the corresponding stemmer; any other value leaves the
            tokens unchanged.

    Returns:
        list[str]: The tokens in document order, with every token whose
        lowercase form is in STOPWORDS removed.
    """
    with open(doc_path, 'r') as file:
        text = file.read()

    # Tokenization
    if tokenization == "Split":
        tokens = text.split()
    else:
        # Alternatives are tried left to right, so the abbreviation pattern
        # must come before the generic word pattern; otherwise the word
        # pattern always consumes the leading letter and "U.S." can never be
        # kept as one token. The word pattern already covers a bare `\w+`,
        # so no separate fallback alternative is needed.
        exp_reg = nltk.RegexpTokenizer(
            r'(?:[A-Z]\.)+'            # abbreviations such as U.S.A.
            r'|\d+(?:\.\d+)?x\d+'      # dimensions such as 3x4 or 2.5x10
            r'|\d+(?:\.\d+)'           # decimal numbers
            r'|\w+(?:-\w+)*'           # words, optionally hyphenated
        )
        tokens = exp_reg.tokenize(text)

    # Remove stopwords (case-insensitive test, original casing is kept)
    tokens = [term for term in tokens if term.lower() not in STOPWORDS]

    # Normalization
    if normalization == "Porter":
        tokens = [PORTER_STEMMER.stem(term) for term in tokens]
    elif normalization == "Lancaster":
        tokens = [LANCASTER_STEMMER.stem(term) for term in tokens]

    return tokens

In [270]:
def build_global_term_frequencies(tokenization, normalization):
    """Count, for every term, how many files in 'Collections' contain it.

    Each directory entry is run through preprocessing() with the given
    options and contributes at most 1 per distinct term, so the values are
    document counts rather than raw occurrence counts.

    Args:
        tokenization: Tokenization mode forwarded to preprocessing().
        normalization: Normalization mode forwarded to preprocessing().

    Returns:
        defaultdict[str, int]: Mapping term -> number of documents holding it.
    """
    collection_dir = 'Collections'
    document_counts = defaultdict(int)

    for entry in os.listdir(collection_dir):
        entry_tokens = preprocessing(
            os.path.join(collection_dir, entry), tokenization, normalization
        )
        # Deduplicate so a term repeated inside one document counts once.
        for distinct_term in set(entry_tokens):
            document_counts[distinct_term] += 1

    return document_counts

In [271]:
def Display_TPD_result(query, terms_freq, global_term_frequencies, N):
    """Print one numbered, weighted line per term of a document.

    Each line has the form ``<index> <query> <term> <freq> <weight>`` where
    ``weight = (freq / max_freq) * log10(N / df + 1)``, ``max_freq`` is the
    highest frequency in ``terms_freq`` and ``df`` is the term's entry in
    ``global_term_frequencies``. The weight is printed with 4 decimals.

    Args:
        query: Label printed on every line (the document name in
            text_processing()).
        terms_freq: Mapping term -> frequency within the document.
        global_term_frequencies: Mapping term -> number of documents
            containing the term.
        N: Number of documents in the collection.

    Returns:
        None. Nothing is printed when ``terms_freq`` is empty.
    """
    # An empty document (or one made only of stopwords) has no terms;
    # max() would raise ValueError on the empty sequence.
    if not terms_freq:
        return

    max_freq = max(terms_freq.values())

    for idx, (term, freq) in enumerate(terms_freq.items(), start=1):
        poids = (freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1)
        print(idx, query, term, freq, format(poids, '.4f'))

In [272]:
def text_processing(query):
    """Print term weights for one document or for one term across documents.

    The processing options come from get_processing_args(). When the file
    type is "TPD", ``query`` is a document name: 'Collections/<query>.txt'
    is preprocessed and every term is printed through Display_TPD_result().
    Otherwise ``query`` is a term: every file in 'Collections' is scanned and
    one line is printed per document that contains it.

    The weight is ``(freq / max_freq) * log10(N / df + 1)`` with ``N`` the
    number of entries in 'Collections' and ``df`` the number of documents
    containing the term.

    Args:
        query: Document name (without ".txt") or term, depending on the
            configured file type.

    Returns:
        None. Results are printed.
    """
    tokenization, normalization, file_type = get_processing_args()

    # Calculate global term frequencies
    global_term_frequencies = build_global_term_frequencies(tokenization, normalization)
    # List the directory once and reuse it for both N and the per-term scan.
    doc_names = os.listdir('Collections')
    N = len(doc_names)

    if file_type == "TPD":
        doc_path = os.path.join('Collections', f"{query}.txt")
        tokens = preprocessing(doc_path, tokenization, normalization)
        terms_freq = FreqDist(tokens)

        Display_TPD_result(query, terms_freq, global_term_frequencies, N)
        return

    # NOTE(review): the query term is compared verbatim against the processed
    # tokens; with a stemmer selected the tokens are stems, so the caller
    # presumably has to pass the stemmed form - confirm.
    term = query
    i = 0
    for doc_name in doc_names:
        doc_path = os.path.join('Collections', doc_name)
        tokens = preprocessing(doc_path, tokenization, normalization)
        terms_freq = FreqDist(tokens)

        # Direct lookup instead of scanning every term. Documents without the
        # term (including empty ones, where max() would raise) are skipped.
        freq = terms_freq.get(term, 0)
        if not freq:
            continue

        max_freq = max(terms_freq.values())
        poids = ((freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1))
        i += 1
        print(f"{i} Term: {term}, Document: {os.path.splitext(doc_name)[0]}, Frequency: {freq}, Weight: {format(poids, '.4f')}")
                    

In [273]:
def get_text_args(query, raw, processed):
    """Show a document either verbatim or through the processing pipeline.

    Args:
        query: Name forwarded to text_processing(), or the document name
            (without ".txt") inside 'Collections' when ``raw`` is truthy.
        raw: When truthy, print the file contents unchanged; takes
            precedence over ``processed``.
        processed: When truthy (and ``raw`` is falsy), delegate to
            text_processing().

    Returns:
        None. Nothing happens when both flags are falsy.
    """
    if not raw:
        if processed:
            text_processing(query)
        return

    raw_path = os.path.join('Collections', f"{query}.txt")
    with open(raw_path, 'r') as handle:
        contents = handle.read()
    print(contents)

In [275]:
# raw=False, processed=True: run D5 through text_processing() rather than
# printing the file contents verbatim.
get_text_args("D5", False, True)

1 D5 secondary 2 0.1408
2 D5 flow 5 0.1427
3 D5 fields 1 0.0704
4 D5 embedded 3 0.2113
5 D5 hypersonic 3 0.1505
6 D5 shock 5 0.2509
7 D5 layers 1 0.0704
8 D5 . 12 0.3010
9 D5 ramp 2 0.1408
10 D5 compression 2 0.1408
11 D5 surface 3 0.1505
12 D5 located 1 0.0704
13 D5 locally 1 0.0704
14 D5 supersonic 1 0.0704
15 D5 region 1 0.0502
16 D5 behind 1 0.0704
17 D5 bow 2 0.1408
18 D5 wave, 2 0.1408
19 D5 generates 1 0.0704
20 D5 wave 2 0.1003
21 D5 disturbance 1 0.0704
22 D5 may 1 0.0704
23 D5 viewed 1 0.0704
24 D5 newtonian 7 0.4930
25 D5 impact 1 0.0704
26 D5 layer 2 0.0663
27 D5 thin 1 0.0704
28 D5 examination 1 0.0704
29 D5 applicability 1 0.0704
30 D5 theory 4 0.2007
31 D5 cones 1 0.0704
32 D5 wedges 1 0.0704
33 D5 uniform 1 0.0502
34 D5 streams 1 0.0704
35 D5 suggests 1 0.0704
36 D5 expected 1 0.0704
37 D5 give 1 0.0502
38 D5 useful 1 0.0704
39 D5 approximation 1 0.0704
40 D5 pressures 4 0.2817
41 D5 pressure 6 0.3010
42 D5 equation 1 0.0502
43 D5 based 1 0.0704
44 D5 concept 1 0.0704
4