# AI MODEL DESCRIPTION PREPROCESSING

### IMPORTS

In [9]:
from nltk.stem import WordNetLemmatizer
from rapidfuzz import process, fuzz
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import nltk
import json
import os
import re

### LOAD DATA

In [10]:
# Directory that contains the exported invoice JSON files
json_directory_path = '../../Invoice_Downloader'

# Accumulates one record per usable invoice item
data_list = []

# os.listdir() returns entries in arbitrary order; sorting the names keeps the
# row order of `df` reproducible between runs and machines.
for filename in sorted(os.listdir(json_directory_path)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(json_directory_path, filename)

    # Parse the JSON file
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Keep only items that carry both a classification and an initial
    # description. A JSON null (None) is treated as missing, exactly like '',
    # so it cannot reach the text-cleaning step as a non-string value.
    for item in data:
        if item.get('accounting_classification') in (None, ''):
            continue
        if item.get('initial_description') in (None, ''):
            continue
        data_list.append({
            'initial_description': item['initial_description'],
            'final_description': item['final_description'],
            'classification': item['accounting_classification'],
            'unit_total': item['unit_total'],
            'company_tid': item['company_tid'],
            'establishment_id': item['establishment_id']
        })

# Build a DataFrame from the collected records
df = pd.DataFrame(data_list)

### PREPROCESSING RESOURCES

In [11]:
# Download the NLTK resources this notebook relies on
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Lemmatizer instance and the set of Spanish stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('spanish'))

# Load the (short) Spanish word list: one entry per line of the file
with open('spanish_words.txt', 'r', encoding='utf-8') as file:
    spanish_words = {entry for entry in file.read().splitlines()}

# Words for which get_closest_word found no close enough match
invalid_words = []

# Cumulative word counts, updated by step_4
global_word_frequencies = Counter()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juanc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\juanc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Find the closest dictionary word using rapidfuzz
def get_closest_word(word, dictionary, score_cutoff=80):
    """Return the entry of ``dictionary`` most similar to ``word``.

    Similarity is scored with rapidfuzz's ``fuzz.ratio``. When no entry
    reaches ``score_cutoff``, ``word`` is appended to the module-level
    ``invalid_words`` list and an empty string is returned.

    Args:
        word: Word to look up.
        dictionary: Collection of candidate words to match against.
        score_cutoff: Minimum similarity score a candidate must reach to be
            accepted. Defaults to 80, the threshold that used to be
            hard-coded here.

    Returns:
        The best-matching candidate, or ``''`` when nothing is close enough.
    """
    closest_match = process.extractOne(
        word, dictionary, scorer=fuzz.ratio, score_cutoff=score_cutoff
    )
    if closest_match is None:
        # No candidate reached the cutoff: remember the word for later review
        invalid_words.append(word)
        return ''
    # The first element of the result is the matched word itself
    return closest_match[0]
    

def save_frequencies_to_file(filename, frequencies):
    """Write word frequencies to a human-readable text file.

    One ``word: frequency`` line is written per entry. The file is written as
    UTF-8 explicitly instead of with the platform default encoding, so
    accented words are stored identically on every operating system and match
    the UTF-8 used when the inputs are read.

    Args:
        filename: Path of the file to create or overwrite.
        frequencies: Mapping of word -> count (for example a ``Counter``).
    """
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(
            f"{word}: {frequency}\n" for word, frequency in frequencies.items()
        )

def save_invalid_words_to_file(filename, invalid_words):
    """Write the rejected words to a human-readable text file, one per line.

    The file is written as UTF-8 explicitly instead of with the platform
    default encoding, so accented words are stored identically on every
    operating system and match the UTF-8 used when the inputs are read.

    Args:
        filename: Path of the file to create or overwrite.
        invalid_words: Iterable of words to write.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(f"{word}\n" for word in invalid_words)

### DESCRIPTION PREPROCESSING

In [13]:
# Step 1
# Clean unexpected characters and lowercase the description
def step_1(description):
    """Normalize a raw description into lowercase text without symbols or digits.

    The text is lowercased; every character that is not a word character,
    whitespace or '?' is replaced by a space; question marks are deleted;
    digits are deleted; and runs of whitespace are collapsed to single
    spaces with the ends trimmed. Stop words are not removed by this step.

    Args:
        description: Raw description. Any non-string value (``None``, NaN,
            ...) is treated as an empty description.

    Returns:
        The cleaned description, or ``''`` when there is nothing to clean.
    """
    # Missing values would otherwise fail on .lower(); they have no text to clean
    if not isinstance(description, str):
        return ''

    description = description.lower()
    # Symbols become spaces so the words around them stay separated.
    # Note that \w also matches '_', so underscores are kept.
    description = re.sub(r'[^\w\s\?]', ' ', description)
    # Question marks are deleted without inserting a space, so the letters on
    # either side of a '?' end up joined
    description = description.replace('?', '')
    # Drop digits
    description = re.sub(r'\d+', '', description)
    # Collapse the gaps left by the removals above
    description = re.sub(r'\s+', ' ', description).strip()
    return description

# Step 2
# Tokenize and drop short words
def step_2(description):
    """Split ``description`` on whitespace and keep words longer than three characters.

    Args:
        description: Text to tokenize.

    Returns:
        List of the words with more than three characters, in original order.
    """
    long_tokens = []
    for token in description.split():
        # Words with three or fewer characters are discarded
        if len(token) <= 3:
            continue
        long_tokens.append(token)
    return long_tokens

# Step 3
# Orthographic corrections
def step_3(words):
    """Map every word to its closest match in ``spanish_words``.

    Each word is passed to ``get_closest_word`` together with the
    module-level ``spanish_words`` collection, and whatever that call
    returns is kept at the same position in the result.

    Args:
        words: Iterable of words to correct.

    Returns:
        List with one ``get_closest_word`` result per input word.
    """
    corrected = []
    for candidate in words:
        corrected.append(get_closest_word(candidate, spanish_words))
    return corrected

def step_4(words):
    """Add ``words`` to the global frequency counter and return a snapshot.

    Args:
        words: Words to count; passed as-is to
            ``global_word_frequencies.update``.

    Returns:
        A plain ``dict`` copy of the cumulative counts held in
        ``global_word_frequencies`` after the update.
    """
    global_word_frequencies.update(words)
    return {word: count for word, count in global_word_frequencies.items()}

In [14]:
def process_description(description):
    """Run the description-cleaning pipeline on a single description.

    Only ``step_1`` is applied at the moment; ``step_2``, ``step_3`` and
    ``step_4`` are not invoked here.

    Args:
        description: Raw description text.

    Returns:
        The value returned by ``step_1`` for ``description``.
    """
    return step_1(description)

In [15]:
# Clean every value of the 'initial_description' column and store the result
# in a new 'cleaned_initial_description' column
raw_descriptions = df['initial_description']
df['cleaned_initial_description'] = raw_descriptions.apply(process_description)
print(df.shape)

(3458, 7)


In [16]:
# Exporting the word frequencies and the rejected words is currently disabled
#save_frequencies_to_file('word_frequencies.txt', global_word_frequencies)
#save_invalid_words_to_file('invalid_words.txt', invalid_words)

# Write the DataFrame to a CSV file encoded as UTF-8-SIG, without the index
csv_file_path = 'cleaned_descriptions.csv'
df.to_csv(
    path_or_buf=csv_file_path,
    index=False,
    encoding='utf-8-sig',
)

print(f"DataFrame guardado en el archivo CSV: {csv_file_path}")

DataFrame guardado en el archivo CSV: cleaned_descriptions.csv
