In [1]:
import nltk
import os
import math
from collections import defaultdict
from nltk import FreqDist

# English stopword list from the NLTK corpus, stored as a set for membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
STOPWORDS = set(nltk.corpus.stopwords.words('english'))
# Stemmer instances created once here and reused by preprocessing() for every token.
PORTER_STEMMER = nltk.PorterStemmer()
LANCASTER_STEMMER = nltk.LancasterStemmer()


In [2]:
def get_processing_args():
    """Return the hard-coded processing configuration.

    Returns:
        tuple[str, str, str]: ``(tokenization, normalization, file_type)``.
        ``tokenization`` and ``normalization`` are the values compared by
        ``preprocessing``; ``file_type`` selects the branch taken in
        ``text_processing``.
    """
    tokenization = "Split"
    # Must be a plain string: a trailing comma here would silently turn the
    # value into the 1-tuple ("None",), which never equals any stemmer name.
    normalization = "None"
    file_type = "TPD"
    return tokenization, normalization, file_type

In [3]:
def preprocessing(doc_path, tokenization, normalization):
    """Read a document and return its tokens after stopword removal and optional stemming.

    Args:
        doc_path: Path of the text file to read.
        tokenization: ``"Split"`` splits on whitespace; any other value uses
            the regular-expression tokenizer.
        normalization: ``"Porter"`` or ``"Lancaster"`` stems every token with
            the matching stemmer; any other value leaves tokens unchanged.

    Returns:
        list[str]: The remaining tokens, in document order.
    """
    # No explicit encoding is given, so the file is decoded with the platform default.
    with open(doc_path, 'r') as file:
        text = file.read()

    # Tokenization
    if tokenization == "Split":
        tokens = text.split()
    else:
        # Alternatives are tried left to right and the first match wins, so the
        # abbreviation pattern has to precede the generic word pattern;
        # otherwise "U.S.A." is always consumed as the single word "U".
        # Order: abbreviations, dimensions ("3.5x2"), decimals, (hyphenated) words.
        exp_reg = nltk.RegexpTokenizer(
            r'(?:[A-Z]\.)+|\d+(?:\.\d+)?x\d+|\d+(?:\.\d+)|\w+(?:-\w+)*'
        )
        tokens = exp_reg.tokenize(text)

    # Remove stopwords (case-insensitive test; surviving tokens keep their case)
    tokens = [term for term in tokens if term.lower() not in STOPWORDS]

    # Normalization
    if normalization == "Porter":
        tokens = [PORTER_STEMMER.stem(term) for term in tokens]
    elif normalization == "Lancaster":
        tokens = [LANCASTER_STEMMER.stem(term) for term in tokens]

    return tokens

In [4]:
def build_global_term_frequencies(tokenization, normalization):
    """Count, for every term, how many entries of 'Collections' contain it.

    Each entry of the 'Collections' directory is run through
    ``preprocessing`` with the given options; a term is counted once per
    entry regardless of how often it occurs there.

    Returns:
        defaultdict(int): term -> number of entries containing the term.
    """
    collection_dir = 'Collections'
    containing_docs = defaultdict(int)

    for entry in os.listdir(collection_dir):
        entry_tokens = preprocessing(
            os.path.join(collection_dir, entry), tokenization, normalization
        )
        # The set collapses repeats so one entry adds at most 1 per term.
        for distinct_term in set(entry_tokens):
            containing_docs[distinct_term] += 1

    return containing_docs

In [5]:
def Display_TPD_result(query, terms_freq, global_term_frequencies, N):
    """Print one weighted line per term of a single document.

    Each printed line is ``index query term freq weight`` where
    ``weight = (freq / max_freq) * log10(N / global_term_frequencies[term] + 1)``
    formatted with four decimals, and ``max_freq`` is the largest frequency
    in ``terms_freq``.

    Args:
        query: Label printed on every line (the document name in the caller).
        terms_freq: Mapping term -> frequency within the document.
        global_term_frequencies: Mapping term -> count used as the divisor
            of ``N`` in the weight.
        N: Numerator of the logarithmic factor (collection size in the caller).

    Returns:
        None. Nothing is printed when ``terms_freq`` is empty.
    """
    # A document left with no tokens has nothing to display, and max() would
    # raise ValueError on the empty sequence.
    if not terms_freq:
        return

    max_freq = max(terms_freq.values())

    for idx, (term, freq) in enumerate(terms_freq.items(), start=1):
        poids = (freq / max_freq) * math.log10((N / global_term_frequencies[term]) + 1)
        print(idx, query, term, freq, format(poids, '.4f'))

In [6]:
def text_processing(query):
    """Print term weights for one document, or for one term across all documents.

    The mode comes from ``get_processing_args()``:

    * ``file_type == "TPD"``: ``query`` names a document; ``Collections/<query>.txt``
      is preprocessed and every term is printed via ``Display_TPD_result``.
    * otherwise: ``query`` is a term; every entry of ``Collections`` is
      preprocessed and one line is printed for each entry containing the term.

    In both modes the weight is
    ``(freq / max_freq) * log10(N / global_term_frequencies[term] + 1)``
    with ``N`` the number of entries in ``Collections``.

    Args:
        query: Document name (without ``.txt``) or term, depending on the mode.

    Returns:
        None.
    """
    tokenization, normalization, file_type = get_processing_args()

    global_term_frequencies = build_global_term_frequencies(tokenization, normalization)
    # List the collection once so N and the per-document loop see the same entries.
    doc_names = os.listdir('Collections')
    N = len(doc_names)

    if file_type == "TPD":
        doc_path = os.path.join('Collections', f"{query}.txt")
        tokens = preprocessing(doc_path, tokenization, normalization)
        terms_freq = FreqDist(tokens)

        # Nothing to display for a document with no remaining tokens.
        if terms_freq:
            Display_TPD_result(query, terms_freq, global_term_frequencies, N)
        return

    match_count = 0
    for doc_name in doc_names:
        doc_path = os.path.join('Collections', doc_name)
        terms_freq = FreqDist(preprocessing(doc_path, tokenization, normalization))

        # Direct membership test instead of scanning every term. It also skips
        # empty documents, on which max() below would raise ValueError.
        if query not in terms_freq:
            continue

        freq = terms_freq[query]
        max_freq = max(terms_freq.values())
        poids = ((freq / max_freq) * math.log10((N / global_term_frequencies[query]) + 1))
        match_count += 1
        print(f"{match_count} Term: {query}, Document: {os.path.splitext(doc_name)[0]}, Frequency: {freq}, Weight: {format(poids, '.4f')}")
                    

In [7]:
def get_text_args(query, raw, processed):
    """Show a document either verbatim or through the processing pipeline.

    Args:
        query: Name passed on to ``text_processing``, or the document name
            (without ``.txt``) under ``Collections`` when ``raw`` is truthy.
        raw: When truthy, print the file contents unchanged; takes priority
            over ``processed``.
        processed: When truthy (and ``raw`` is falsy), call ``text_processing(query)``.

    Returns:
        None. Does nothing when both flags are falsy.
    """
    if not raw:
        # The processed view is only considered when the raw view was not requested.
        if processed:
            text_processing(query)
        return

    source_path = os.path.join('Collections', f"{query}.txt")
    with open(source_path, 'r') as handle:
        contents = handle.read()
    print(contents)

In [8]:
# Run the processed view for "D1" (raw=False, processed=True), which goes through text_processing.
get_text_args("D1", False, True)

1 D1 experimental 2 0.2817
2 D1 investigation 1 0.1408
3 D1 aerodynamics 1 0.1408
4 D1 wing 3 0.3010
5 D1 slipstream 5 0.7042
6 D1 . 6 0.3010
7 D1 study 1 0.1003
8 D1 propeller 1 0.1408
9 D1 made 2 0.2007
10 D1 order 1 0.1408
11 D1 determine 1 0.1408
12 D1 spanwise 1 0.1408
13 D1 distribution 1 0.0663
14 D1 lift 3 0.4225
15 D1 increase 1 0.1408
16 D1 due 2 0.2817
17 D1 different 3 0.3010
18 D1 angles 1 0.1408
19 D1 attack 1 0.1408
20 D1 free 1 0.1003
21 D1 stream 1 0.1003
22 D1 velocity 1 0.1003
23 D1 ratios 1 0.1408
24 D1 results 1 0.1003
25 D1 intended 1 0.1408
26 D1 part 2 0.2817
27 D1 evaluation 2 0.2817
28 D1 basis 1 0.1003
29 D1 theoretical 1 0.1408
30 D1 treatments 1 0.1408
31 D1 problem 1 0.0663
32 D1 comparative 1 0.1408
33 D1 span 1 0.1408
34 D1 loading 1 0.1408
35 D1 curves, 1 0.1408
36 D1 together 1 0.1408
37 D1 supporting 1 0.1408
38 D1 evidence, 1 0.1408
39 D1 showed 1 0.1408
40 D1 substantial 1 0.1408
41 D1 increment 1 0.1408
42 D1 produced 1 0.1408
43 D1 /destalling/ 1 