In [1]:
import re
import pandas as pd
import numpy as np
import os
import json
from typing import List, Dict

In [2]:
def fix_negation_contractions(text: str) -> str:
    """
    Corrige contrações negativas no formato "n ' t" para "n't" e padroniza variações.
    
    Exemplos:
    - "would n ' t" → "wouldn't"
    - "do n't" → "don't"
    - "can not" → "cannot"
    """
    if not isinstance(text, str):
        return text
    
    # Lista de verbos auxiliares comuns que formam contrações negativas
    aux_verbs = r"(?:is|are|was|were|have|has|had|do|does|did|can|could|will|would|shall|should|may|might|must)"
    
    # Padrão regex para capturar diferentes formatos de contrações negativas
    patterns = [
        # Caso 1: "n ' t" (com espaços)
        (fr"(\w+)\s+n\s*'\s*t\b", r"\1n't"),
        # Caso 2: " n't" (sem apóstrofo)
        (fr"(\w+)\s+n't\b", r"\1n't"),
        # Caso 3: "not" após verbo (ex: "can not" → "cannot")
        (fr"({aux_verbs})\s+not\b", r"\1not"),
        # Caso especial para "can not"
        (r"\bcan not\b", "cannot")
    ]
    
    # Aplicar todas as transformações
    for pattern, replacement in patterns:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    
    return text

def apply_text_normalization(df: pd.DataFrame, text_column: str = 'comment_cleaned') -> pd.DataFrame:
    """
    Aplica todas as normalizações de texto no DataFrame.
    
    Args:
        df: DataFrame contendo os textos
        text_column: Nome da coluna com os textos a serem processados
    
    Returns:
        DataFrame com os textos normalizados
    """
    df = df.copy()
    
    # Aplicar correção de contrações negativas
    df[text_column] = df[text_column].apply(fix_negation_contractions)
    
    return df

In [3]:
def load_data(parquet_path: str) -> pd.DataFrame:
        """Carrega os dados do arquivo parquet"""
        min_length = 10  # Comentários muito curtos serão ignorados    
        df = pd.read_parquet(parquet_path)

        # Pré-filtro básico
        df = df.dropna(subset=['comment_cleaned'])
        df = df[df['comment_cleaned'].str.len() >= min_length]

        return df.reset_index(drop=True)

def process(input_path: str, output_path: str) -> pd.DataFrame:
    """Pipeline completo de processamento"""
    # 1. Carregar dados
    df = load_data(input_path)
    print(f"Total de comentários inicial: {len(df)}")
    
    # 2. Aplicar normalização de texto (NOVA ETAPA)
    df = apply_text_normalization(df)
    
    # 5. Salvar resultados
    df.to_parquet(output_path, index=False)
    print(f"Dados processados salvos em {output_path}")
    
    return df

# INPUTS

In [4]:
input_parquet = "../data/dataset_valid_with_sentiment.parquet"
output_parquet = "../data/dataset_valid_with_sentiment_fix_negative.parquet"
df = process(input_path=input_parquet, output_path=output_parquet)


Total de comentários inicial: 199
Dados processados salvos em ../data/dataset_valid_with_sentiment_fix_negative.parquet


In [5]:
pd.concat([df['comment'], df['comment_cleaned']], axis=1).head(20)


Unnamed: 0,comment,comment_cleaned
0,The pizza was really good .,the pizza was really good .
1,Knowledge of the chef and the waitress are bel...,knowledge of the chef and the waitress are bel...
2,The service was ok .,the service was ok .
3,I 'm happy to have Nosh in the neighborhood an...,i 'm happy to have nosh in the neighborhood an...
4,Indoor was very cozy and cute .,indoor was very cozy and cute .
5,Ballato 's is consistently delicious authentic...,ballato 's is consistently delicious authentic...
6,While the room is not particularly comfortable...,while the room isnot particularly comfortable ...
7,The dim sum however was very good .,the dim sum however was very good .
8,We have never had any problems with charging t...,we have never had any problems with charging t...
9,Spreads and toppings are great - though a bit ...,spreads and toppings are great - though a bit ...
