In [1]:
import linecache
from symspellpy import SymSpell, Verbosity
from rich import print
import inspect
import os
from collections import Counter
import re
import textstat


In [2]:
# Location of the resource files (dictionary and stopword lists) and their
# file names inside that directory.
CONFIG_DIR = '/home/silvio/miniconda3/envs/classy3/prg/config/'
STOPWORD_ES = 'stopwords_es.txt'
STOPWORD_RED = 'stopwords_reddit.txt'
DICTIONARY = 'new_dic.txt'

# Full path of the frequency dictionary.
dictionary = os.path.join(CONFIG_DIR, DICTIONARY)

# Full paths of every stopword list, in the order they are loaded.
stopwords_files = [
    os.path.join(CONFIG_DIR, file_name)
    for file_name in (STOPWORD_ES, STOPWORD_RED)
]


In [3]:
class Textmetrics:
    """Gather readability and vocabulary statistics for a piece of text.

    The object bundles three things:

    * the ``textstat`` module, configured for ``lang``, reachable through
      :meth:`stat`;
    * a set of stopwords loaded from one or more files
      (:attr:`stopwords_files`);
    * a frequency dictionary file (:attr:`dictionary_file`) that is split
      into ``steps`` equally sized blocks; the word frequency found at each
      block boundary is stored as a threshold.

    Args:
        lang: Language code passed to ``textstat.set_lang``.
        steps: Number of blocks the dictionary is divided into.
    """

    def __init__(self, lang="es", steps=10):
        self._lang = lang
        self._textstats = textstat
        self._textstats.set_lang(lang)
        self._dictionary_file = None
        self._dictionary = None
        # NOTE: the attribute name is misspelled; kept as-is so any code
        # that already reads it keeps working.
        self._dictionay_terms = 0
        self._dictionary_steps = steps
        self._dictionary_block_size = 0
        self._dictionary_thresholds = []
        self._stopwords = Counter()
        self._stopwords_files = []
        self._misspellings = []
        self._num_words = 0
        self._num_stopwords = 0
        self._text = ""

    def get_frequency(self, line_num):
        """Return the frequency stored on line ``line_num`` of the dictionary.

        Each dictionary line is expected to hold at least two
        whitespace-separated fields, the second one being an integer
        frequency. The value is used to build the block thresholds when
        the dictionary is split into steps.

        Args:
            line_num: 1-based line number inside the dictionary file.

        Returns:
            The frequency as an ``int``, or ``None`` when no dictionary file
            is set, the line does not exist, it has fewer than two fields,
            or the second field is not an integer.
        """
        if self._dictionary_file is None:
            return None
        line = linecache.getline(self._dictionary_file, line_num)
        if not line:
            # linecache returns '' for out-of-range lines and unreadable files.
            return None
        fields = line.split()
        if len(fields) < 2:
            print("Line doesn't contain at least two words.")
            return None
        try:
            return int(fields[1])
        except ValueError:
            print("Second word is not an integer.")
            return None

    def stat(self, function_name):
        """Apply the textstat routine called ``function_name`` to the text.

        Equivalent to ``self._textstats.<function_name>(self._text)`` but
        keeps callers decoupled from the textstat object.

        Args:
            function_name: Name of a callable attribute of textstat.

        Returns:
            The routine's result, or the string ``"Function not found"``
            when no callable with that name exists.
        """
        func = getattr(self._textstats, function_name, None)
        if not callable(func):
            return "Function not found"
        return func(self._text)

    @property
    def stopwords_files(self):
        """List of files the stopwords were loaded from."""
        return self._stopwords_files

    @stopwords_files.setter
    def stopwords_files(self, files_list):
        """Load stopwords from every file contained in ``files_list``.

        Several files are accepted so that field-specific stopwords can be
        added on top of the standard ones. For example, a medical
        supplement could contain words such as hospital, IV, transfusion,
        ER, OR, nurse, injection, etc.: words that are taken for granted
        when reading a medical text, although they could still matter when
        analysing its content.

        Each non-empty line is stripped, lower-cased and counted. Files
        that cannot be read are reported and skipped. Previously loaded
        stopwords are discarded.
        """
        self._stopwords_files = files_list
        self._stopwords = Counter()
        for file_path in self._stopwords_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    stripped = (line.strip() for line in file)
                    self._stopwords.update(
                        word.lower() for word in stripped if word
                    )
            except FileNotFoundError:
                print(f"File not found: {file_path}")
            except (OSError, UnicodeError) as e:
                print(f"An error occurred while reading {file_path}: {str(e)}")

    @property
    def text(self):
        """The string for which statistics are computed."""
        return self._text

    @text.setter
    def text(self, txt):
        self._text = txt

    @property
    def dictionary_steps(self):
        """Number of blocks into which the dictionary is split.

        As an example, a dictionary may be divided into 10 parts in order
        to assess students from first to tenth grade: first-grade students
        should understand words taken from the first part, fifth-grade
        students words taken from the first five parts, and so on.
        """
        return self._dictionary_steps

    @dictionary_steps.setter
    def dictionary_steps(self, number_of_steps):
        self._dictionary_steps = number_of_steps

    # TODO: add the possibility to merge two dictionaries
    # TODO: add the possibility to insert specialized dictionaries at a given position
    @property
    def dictionary_file(self):
        """Path of the frequency dictionary currently in use."""
        return self._dictionary_file

    @dictionary_file.setter
    def dictionary_file(self, file):
        """Register the dictionary file and compute the block thresholds.

        The file's lines are counted, the count is divided by
        ``dictionary_steps`` to get the block size, and the frequency found
        on the last line of every block except the final one is stored as
        a threshold. The thresholds are rebuilt from scratch on every
        assignment.

        Raises:
            OSError: If the file cannot be opened.
            ZeroDivisionError: If ``dictionary_steps`` is 0.
        """
        # Plain binary mode: the "U" flag is no longer accepted by open().
        with open(file, "rb") as f:
            self._dictionay_terms = sum(1 for _ in f)
        self._dictionary_file = file
        self._dictionary_block_size = (
            self._dictionay_terms // self._dictionary_steps
        )

        # Drop any stale cached copy of this file before reading lines.
        linecache.checkcache(file)
        thresholds = [
            self.get_frequency(self._dictionary_block_size * step)
            for step in range(1, self._dictionary_steps)
        ]
        # NOTE(review): the closing entry is the number of terms, not a
        # frequency; kept from the original code - confirm it is intended.
        thresholds.append(self._dictionay_terms)
        self._dictionary_thresholds = thresholds

In [4]:
# Create the metrics object with lang='es'; the default of 10 dictionary steps is used.
tm = Textmetrics(lang='es')

In [5]:
# Assigning goes through the property setter, which reads every listed file
# and rebuilds the stopword counter.
tm.stopwords_files = stopwords_files

In [6]:
# Show the list of stopword file paths stored on the object
# (print here is rich's print, imported at the top of the notebook).
print(tm.stopwords_files)

In [7]:

# Set the text on which the statistics will be computed.
tm.text = "Nunca jamás"

In [8]:
# Read the text back through the property to confirm it was stored.
print(tm.text)