In [43]:
import pandas as pd
import yaml
import unicodedata
import numpy as np
import re
import Levenshtein
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from jiwer import wer, cer
from tqdm import tqdm

# Silence every warning for the rest of the session. This keeps cell output
# short but also hides deprecation notices, so disable it when debugging.
warnings.filterwarnings('ignore')
# Set plotting style: matplotlib defaults, seaborn "husl" palette and a
# default figure size of (12, 8)
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

In [44]:
# Jaccard variant used for the similarity columns: "char", "word" or "ngram"
# (the three values handled by calculate_metrics).
JACCARD_MODE = "ngram"

# 1. Carregando os Dados


In [45]:

# Read the run configuration. The context manager closes the file handle,
# which the bare open() call left open until garbage collection.
with open("confs.yaml") as conf_file:
    confs = yaml.safe_load(conf_file)
predictors = confs["predictors"]  # Important: only these features may be used to build/improve the model
text_target = confs["text_target"]
cols_to_keep = predictors + text_target
# Keep only the configured columns and dump them to CSV for inspection
df = pd.read_parquet("dados/train.parquet")[cols_to_keep]
df.to_csv("data.csv")
print("\nMissing values per column:")
print(df.isnull().sum())
print(f"\nTotal rows with any missing values: {df.isnull().any(axis=1).sum()}")



Missing values per column:
user_input       0
uf               0
razaosocial      0
nome_fantasia    0
dtype: int64

Total rows with any missing values: 0


# 2. Limpando os Dados

Iremos remover palavras como "S.A.", "S.A", "S/A", "S/A.", "LTDA", "LTDA.", "Ltda" e "Ltda." dos nomes reais das empresas a serem previstos, usando a seguinte suposição:

- Suposição 1: usuários têm o hábito de pesquisar por nomes de empresas sem essas palavras, então elas não devem ser consideradas na previsão.


In [46]:
def comprehensive_text_cleaning(text,
                                remove_accents=True,
                                remove_stop_words=True,
                                remove_company_suffixes=True,
                                custom_stop_words=None,
                                to_lowercase=True):
    """
    Normalize a company name or search query for text comparison.

    Steps run in this order: accent stripping, lowercasing, company-suffix
    removal, stop-word removal, then punctuation and whitespace clean-up.

    Parameters:
    text (str): Input text
    remove_accents (bool): Remove accents and normalize characters
    remove_stop_words (bool): Remove Portuguese stop words
    remove_company_suffixes (bool): Remove common company suffixes
    custom_stop_words (set): Additional stop words to remove
    to_lowercase (bool): Convert to lowercase

    Returns:
    str: Cleaned text. Missing values (as detected by pd.isna) are returned unchanged.
    """

    if pd.isna(text):
        return text

    text = str(text)

    # 1. Remove accents and normalize characters
    if remove_accents:
        # NFD splits an accented letter into its base letter plus combining
        # marks (category 'Mn'); dropping the marks keeps the base letter.
        # This already turns 'ç' into 'c', so no special case is needed.
        text = unicodedata.normalize('NFD', text)
        text = ''.join(char for char in text if unicodedata.category(char) != 'Mn')

    # 2. Convert to lowercase
    if to_lowercase:
        text = text.lower()

    # 3. Remove company suffixes
    if remove_company_suffixes:
        # Every pattern is anchored with \b on both sides so that only whole
        # words are removed (e.g. "coadministradora" is left untouched).
        patterns_to_remove = [
            r'\bS\.?A\.?\b',           # S.A, SA, S.A., SA.
            r'\bS/A\.?\b',             # S/A, S/A.
            r'\bLTDA\.?\b',            # LTDA, LTDA.
            r'\bLIMITADA\b',           # LIMITADA
            r'\bCIA\.?\b',             # CIA, CIA.
            r'\bCOMPANHIA\b',          # COMPANHIA
            r'\bEMPRESA\b',            # EMPRESA
            r'\bCOMERCIO\b',           # COMERCIO
            r'\bSERVICOS?\b',          # SERVICO, SERVICOS
            r'\bME\b',                 # ME (Microempresa)
            r'\bEPP\b',                # EPP (Empresa de Pequeno Porte)
            r'\bEIRELI\b',             # EIRELI
            r'\bSOCIEDADE\b',          # SOCIEDADE
            r'\bADMINISTRADORA\b',     # ADMINISTRADORA
            r'\bGERAL\b',              # GERAL
        ]

        for pattern in patterns_to_remove:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # 4. Remove stop words
    if remove_stop_words:
        portuguese_stop_words = {
            'a', 'ao', 'aos', 'as', 'da', 'das', 'de', 'do', 'dos', 'e', 'em', 'na',
            'nas', 'no', 'nos', 'o', 'os', 'para', 'por', 'com', 'um', 'uma', 'uns',
            'umas', 'se', 'que', 'ou', 'mas', 'como', 'mais', 'muito', 'sua', 'seu',
            'seus', 'suas', 'este', 'esta', 'estes', 'estas', 'esse', 'essa', 'esses',
            'essas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'isso', 'aquilo'
        }

        if custom_stop_words:
            portuguese_stop_words.update(custom_stop_words)

        # Tokens are compared before punctuation is stripped, so a token such
        # as "a." (an initial) is not treated as the stop word "a".
        words = text.split()
        words = [word for word in words if word.lower() not in portuguese_stop_words]
        text = ' '.join(words)

    # 5. Clean up extra whitespace and special characters
    text = re.sub(r'[^\w\s]', ' ', text)  # Replace punctuation with spaces
    text = re.sub(r'\s+', ' ', text)      # Multiple spaces to single space
    text = text.strip()                   # Remove leading/trailing spaces

    return text

# Usage: run the same cleaning over both reference-name columns and the query column
for _text_col in ('razaosocial', 'nome_fantasia', 'user_input'):
    df[_text_col] = df[_text_col].apply(comprehensive_text_cleaning)

# 2. Análise de Métricas de Character Error Rate (CER), Word Error Rate (WER) e Distancia de Levenshtein

- **Word Error Rate (WER)**: fórmula para calcular a taxa de erro a nível de palavras: 
  $$WER = \frac{S + D + I}{N}$$
  onde:
  - $S$ é o número de substituições. Por exemplo, se o usuário digitou "Empresa X" e a referência é "Empresa Y", então há uma substituição.
  - $D$ é o número de deleções. Por exemplo, se o usuário digitou "Empresa" e a referência é "Empresa X", então há uma deleção.
  - $I$ é o número de inserções. Por exemplo, se o usuário digitou "Empresa X Y" e a referência é "Empresa X", então há uma inserção.
  - $N$ é o número total de palavras na referência. Por exemplo, se a referência é "Empresa X", então $N$ é 2.

- **Character Error Rate (CER)**: fórmula para calcular a taxa de erro a nível de caracteres:
  $$CER = \frac{S + D + I}{N}$$
  onde:
  - $S$ é o número de substituições. Por exemplo, se o usuário digitou "EmpresaXY" e a referência é "EmpresaXZ", então há uma substituição.
  - $D$ é o número de deleções. Por exemplo, se o usuário digitou "Empresa" e a referência é "EmpresaX", então há uma deleção.
  - $I$ é o número de inserções. Por exemplo, se o usuário digitou "Empresa XY" e a referência é "Empresa X", então há uma inserção.
  - $N$ é o número total de caracteres na referência. Por exemplo, se a referência é "Empresa X", então $N$ é 9 (contando espaços).

- **Distância de Levenshtein**: é uma métrica que mede a diferença entre duas sequências. É definida como o número mínimo de operações de edição (inserções, deleções ou substituições) necessárias para transformar uma sequência em outra.

- **Similaridade de Jaccard**: é uma métrica que mede a similaridade entre dois conjuntos. É definida como o tamanho da interseção dividido pelo tamanho da união dos conjuntos.
  $$J(A, B) = \frac{|A \cap B|}{|A \cup B|}$$


Essas métricas serão úteis para avaliar o quão diferentes os inputs de usuário (`user_input`) são dos outputs esperados `razaosocial` e `nome_fantasia` e também dos outputs não esperados, i.e., de todas as empresas que não correspondem ao input do usuário.

Caso o `CER` e/ou `WER` entre o `user_input` e os outputs não esperados seja significativamente maior do que o `CER` e/ou `WER` entre o `user_input` e os outputs esperados, podemos concluir que o input do usuário é mais próximo dos outputs esperados do que dos outputs não esperados e utilizar a minimização de `CER` e `WER` como critério para selecionar a empresa correta.



## 2.1 Calculando o CER, WER e a Distância de Levenshtein

Vamos usar a implementação do pacote já importado `jiwer` para calcular CER e WER. Para a Distância de Levenshtein, vamos usar a função `distance` do pacote `Levenshtein`.


In [47]:
from typing import Callable, Union, Set, Dict
import pandas as pd
import numpy as np
import Levenshtein
from functools import wraps

def validate_inputs(func: Callable) -> Callable:
    """
    Decorator that shields a text comparison metric from unusable inputs.

    The wrapped metric returns ``np.nan`` when either text is missing
    (``pd.isna``), when either text is empty after ``str()`` and ``strip()``,
    or when the metric itself raises (the error is printed, not propagated).
    Otherwise the metric receives the stripped strings plus any extra arguments.
    """
    @wraps(func)
    def wrapper(reference: Union[str, float], hypothesis: Union[str, float], *args, **kwargs) -> float:
        try:
            # Missing values short-circuit before any string conversion
            if pd.isna(reference) or pd.isna(hypothesis):
                return np.nan

            # Coerce to str and trim surrounding whitespace
            ref_text, hyp_text = (str(value).strip() for value in (reference, hypothesis))

            # Nothing to compare when either side is blank
            if not ref_text or not hyp_text:
                return np.nan

            return func(ref_text, hyp_text, *args, **kwargs)

        except Exception as e:
            # Best effort: report the failure and keep going with NaN
            print(f"Error calculating {func.__name__}: {e}")
            return np.nan

    return wrapper

class TextMetrics:
    """
    Text comparison metrics between a reference string and a hypothesis string.

    The ``calculate_*`` methods are wrapped with ``validate_inputs``. The
    n-gram Jaccard methods are not wrapped, so
    ``multi_ngram_jaccard_similarity`` checks for missing values itself.
    """

    @staticmethod
    @validate_inputs
    def calculate_cer(reference: str, hypothesis: str) -> float:
        """Calculate Character Error Rate."""
        return cer(reference, hypothesis)

    @staticmethod
    @validate_inputs
    def calculate_wer(reference: str, hypothesis: str) -> float:
        """Calculate Word Error Rate."""
        return wer(reference, hypothesis)

    @staticmethod
    @validate_inputs
    def calculate_normalized_levenshtein(reference: str, hypothesis: str) -> float:
        """
        Calculate normalized Levenshtein distance (0-1).

        The edit distance is divided by the length of the longer string.

        Returns:
            float: Normalized Levenshtein distance between 0 and 1
        """
        max_len = max(len(reference), len(hypothesis))
        # Defensive guard against division by zero for two empty strings
        if max_len == 0:
            return 0.0

        distance = Levenshtein.distance(reference, hypothesis)
        return distance / max_len

    @staticmethod
    def _get_character_set(text: str) -> Set[str]:
        """
        Convert text to a set of characters.

        Args:
            text (str): Input text

        Returns:
            Set[str]: Set of characters from the input text
        """
        return set(text)

    @staticmethod
    def _get_word_set(text: str) -> Set[str]:
        """
        Convert text to a set of lowercased, whitespace-separated words.

        Args:
            text (str): Input text

        Returns:
            Set[str]: Set of words from the input text
        """
        return set(text.lower().split())

    @staticmethod
    def _calculate_jaccard_similarity(set1: Set[str], set2: Set[str]) -> float:
        """
        Calculate Jaccard similarity between two sets.

        Args:
            set1 (Set[str]): First set
            set2 (Set[str]): Second set

        Returns:
            float: Jaccard similarity score between 0 and 1
        """
        if not set1 and not set2:  # Both sets are empty
            return 1.0
        if not set1 or not set2:   # One set is empty
            return 0.0

        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return intersection / union

    @staticmethod
    @validate_inputs
    def calculate_jaccard_similarity_chars(reference: str, hypothesis: str) -> float:
        """
        Calculate Jaccard similarity based on character sets.

        Args:
            reference (str): Reference text
            hypothesis (str): Hypothesis text

        Returns:
            float: Jaccard similarity score between 0 and 1
        """
        ref_chars = TextMetrics._get_character_set(reference)
        hyp_chars = TextMetrics._get_character_set(hypothesis)
        return TextMetrics._calculate_jaccard_similarity(ref_chars, hyp_chars)

    @staticmethod
    @validate_inputs
    def calculate_jaccard_similarity_words(reference: str, hypothesis: str) -> float:
        """
        Calculate Jaccard similarity based on word sets.

        Args:
            reference (str): Reference text
            hypothesis (str): Hypothesis text

        Returns:
            float: Jaccard similarity score between 0 and 1
        """
        ref_words = TextMetrics._get_word_set(reference)
        hyp_words = TextMetrics._get_word_set(hypothesis)
        return TextMetrics._calculate_jaccard_similarity(ref_words, hyp_words)

    @staticmethod
    def _ngram_jaccard_similarity(reference: str, hypothesis: str, n=2) -> float:
        """
        Calculate Jaccard similarity using character n-grams.
        This handles inversions and some misspellings well.

        Args:
            reference: Reference text
            hypothesis: Hypothesis text
            n: N-gram size (2=bigrams, 3=trigrams, etc.)

        Returns:
            float: Jaccard similarity of the two n-gram sets, between 0 and 1
        """
        def get_ngrams(text, n):
            """Generate n-grams from text with padding."""
            # Add padding to capture beginning/end patterns
            padded = '#' * (n-1) + text.lower() + '#' * (n-1)
            return set(padded[i:i+n] for i in range(len(padded) - n + 1))

        ngrams1 = get_ngrams(reference, n)
        ngrams2 = get_ngrams(hypothesis, n)

        intersection = len(ngrams1 & ngrams2)
        union = len(ngrams1 | ngrams2)

        if union == 0:
            # Neither text produced any n-gram: identical only if both are empty
            return 1.0 if len(reference) == len(hypothesis) == 0 else 0.0
        return intersection / union

    @staticmethod
    def multi_ngram_jaccard_similarity(reference: str, hypothesis: str, ngram_sizes=(2, 3), weights=None) -> float:
        """
        Combine multiple n-gram sizes for better robustness.

        Args:
            reference: Reference text
            hypothesis: Hypothesis text
            ngram_sizes: N-gram sizes to combine (default: bigrams and trigrams)
            weights: One weight per n-gram size; equal weights when None

        Returns:
            float: Weighted mean of the per-size n-gram Jaccard similarities,
                or NaN when either input is missing

        Raises:
            ValueError: If the number of weights differs from the number of n-gram sizes
        """
        if weights is None:
            weights = [1.0] * len(ngram_sizes)

        if len(weights) != len(ngram_sizes):
            raise ValueError("Number of weights must match number of n-gram sizes")

        # A missing value would otherwise fail on .lower(); return NaN as the
        # validate_inputs-wrapped metrics do for missing inputs.
        if pd.isna(reference) or pd.isna(hypothesis):
            return np.nan

        total_weight = sum(weights)
        total_score = sum(
            TextMetrics._ngram_jaccard_similarity(reference, hypothesis, size) * weight
            for size, weight in zip(ngram_sizes, weights)
        )

        return total_score / total_weight

## 2.2 CER, WER e Distancia de Levenshtein: `user_input` vs Ground Truth (`razaosocial` e `nome_fantasia`)

Vamos calcular o CER, o WER  e a distancia de Levenshtein entre o `user_input` e as colunas `razaosocial` e `nome_fantasia` do DataFrame e adicionar essas métricas como novas colunas no DataFrame. 


- `cer_razaosocial`: CER entre `user_input` e `razaosocial`
- `wer_razaosocial`: WER entre `user_input` e `razaosocial`
- `lev_dist_razaosocial`: Distância de Levenshtein normalizada entre `user_input` e `razaosocial`
- `cer_nome_fantasia`: CER entre `user_input` e `nome_fantasia`
- `wer_nome_fantasia`: WER entre `user_input` e `nome_fantasia`
- `lev_dist_nome_fantasia`: Distância de Levenshtein normalizada entre `user_input` e `nome_fantasia`
- `jacc_sim_{JACCARD_MODE}_razaosocial` (por exemplo, `jacc_sim_ngram_razaosocial`): Similaridade de Jaccard entre `user_input` e `razaosocial`
- `jacc_sim_{JACCARD_MODE}_nome_fantasia` (por exemplo, `jacc_sim_ngram_nome_fantasia`): Similaridade de Jaccard entre `user_input` e `nome_fantasia`

In [None]:
import pandas as pd
from typing import List, Tuple


def calculate_metrics(row: pd.Series,
                      reference_col: str,
                      hypothesis_col: str,
                      metrics: "TextMetrics",
                      jacc_mode: str) -> Tuple[float, float, float, float]:
    """
    Calculate CER, WER, Levenshtein distance, and Jaccard similarity for a single row.

    Args:
        row (pd.Series): A row from the DataFrame
        reference_col (str): Name of the column containing the reference text
        hypothesis_col (str): Name of the column containing the hypothesis text
        metrics (TextMetrics): An instance of the TextMetrics class
        jacc_mode (str): Jaccard variant to compute: "char", "word" or "ngram"

    Returns:
        Tuple[float, float, float, float]: CER, WER, normalized Levenshtein distance,
        and Jaccard similarity

    Raises:
        ValueError: If jacc_mode is not one of "char", "word" or "ngram"
    """
    # Fail fast with a clear message instead of an unbound-variable error
    # after the other metrics have already been computed.
    if jacc_mode not in ("char", "word", "ngram"):
        raise ValueError(
            f"Unsupported jacc_mode: {jacc_mode!r} (expected 'char', 'word' or 'ngram')"
        )

    reference = row[reference_col]
    hypothesis = row[hypothesis_col]

    # Local names avoid shadowing the cer/wer functions imported from jiwer
    cer_value = metrics.calculate_cer(reference, hypothesis)
    wer_value = metrics.calculate_wer(reference, hypothesis)
    levenshtein = metrics.calculate_normalized_levenshtein(reference, hypothesis)

    if jacc_mode == "char":
        jaccard = metrics.calculate_jaccard_similarity_chars(reference, hypothesis)
    elif jacc_mode == "word":
        jaccard = metrics.calculate_jaccard_similarity_words(reference, hypothesis)
    else:  # "ngram": equal-weight mix of bigrams and trigrams
        jaccard = metrics.multi_ngram_jaccard_similarity(reference, hypothesis, ngram_sizes=[2, 3], weights=[0.5, 0.5])

    return cer_value, wer_value, levenshtein, jaccard

def apply_metrics(df: pd.DataFrame,
                  reference_cols: List[str],
                  hypothesis_col: str,
                  metrics: TextMetrics,
                  jacc_mode: str) -> pd.DataFrame:
    """
    Score one hypothesis column against each reference column, row by row.

    For every reference column four columns are written to ``df``:
    ``cer_<ref>``, ``wer_<ref>``, ``lev_dist_<ref>`` and
    ``jacc_sim_<jacc_mode>_<ref>``. ``df`` is modified in place and returned.

    Args:
        df (pd.DataFrame): Input DataFrame
        reference_cols (List[str]): List of column names to use as reference texts
        hypothesis_col (str): Column name to use as hypothesis text
        metrics (TextMetrics): An instance of the TextMetrics class
        jacc_mode (str): Jaccard variant forwarded to calculate_metrics

    Returns:
        pd.DataFrame: The same DataFrame with the metric columns added
    """
    for ref_col in reference_cols:
        print(f"Calculating metrics for {hypothesis_col} vs {ref_col}...")

        # Output columns, in the order calculate_metrics returns its values
        target_columns = [
            f'cer_{ref_col}',
            f'wer_{ref_col}',
            f'lev_dist_{ref_col}',
            f'jacc_sim_{jacc_mode}_{ref_col}',
        ]

        def score_row(row):
            return calculate_metrics(row, ref_col, hypothesis_col, metrics, jacc_mode)

        # result_type='expand' spreads each returned tuple over four columns
        expanded_scores = df.apply(score_row, axis=1, result_type='expand')
        df[target_columns] = expanded_scores

    return df

def calculate_error_rates(df: pd.DataFrame,
                          jacc_mode: str = "word") -> pd.DataFrame:
    """
    Add CER, WER, Levenshtein and Jaccard columns comparing ``user_input``
    (hypothesis) with ``razaosocial`` and ``nome_fantasia`` (references).

    Args:
        df (pd.DataFrame): Input DataFrame
        jacc_mode (str): Jaccard variant forwarded to apply_metrics

    Returns:
        pd.DataFrame: DataFrame with added metric columns
    """
    print("\nCalculating CER, WER, Levenshtein distance, and Jaccard similarity...")

    scored_df = apply_metrics(
        df,
        ['razaosocial', 'nome_fantasia'],
        'user_input',
        TextMetrics(),
        jacc_mode=jacc_mode,
    )

    print("Calculations completed!")
    return scored_df

# Usage: add the metric columns to df, using the Jaccard variant chosen in JACCARD_MODE
df = calculate_error_rates(df, jacc_mode=JACCARD_MODE)

In [None]:
def analyze_metrics_statistics(df: pd.DataFrame, jacc_mode: Union[str, None] = None) -> Dict[str, float]:
    """
    Calculate and display statistical summaries of the metric columns.

    Args:
        df (pd.DataFrame): DataFrame holding the cer_*, wer_*, lev_dist_* and
            jacc_sim_<mode>_* columns for razaosocial and nome_fantasia
        jacc_mode (str, optional): Jaccard variant whose columns are summarized.
            Defaults to the notebook-level JACCARD_MODE when None.

    Returns:
        Dict[str, float]: Mean of each metric column, keyed by 'avg_<metric>_<column>'
    """
    # Fall back to the global setting so existing calls keep working
    if jacc_mode is None:
        jacc_mode = JACCARD_MODE

    jacc_razaosocial_col = f'jacc_sim_{jacc_mode}_razaosocial'
    jacc_nome_fantasia_col = f'jacc_sim_{jacc_mode}_nome_fantasia'

    # Calculate average metrics
    avg_cer_razaosocial = df['cer_razaosocial'].mean()
    avg_wer_razaosocial = df['wer_razaosocial'].mean()
    avg_lev_dist_razaosocial = df['lev_dist_razaosocial'].mean()
    avg_cer_nome_fantasia = df['cer_nome_fantasia'].mean()
    avg_wer_nome_fantasia = df['wer_nome_fantasia'].mean()
    avg_lev_dist_nome_fantasia = df['lev_dist_nome_fantasia'].mean()
    avg_jac_sim_razaosocial = df[jacc_razaosocial_col].mean()
    avg_jac_sim_nome_fantasia = df[jacc_nome_fantasia_col].mean()

    print("\n" + "="*50)
    print("AVERAGE ERROR RATES")
    print("="*50)
    print(f"Average CER (user_input vs razaosocial): {avg_cer_razaosocial:.4f}")
    print(f"Average WER (user_input vs razaosocial): {avg_wer_razaosocial:.4f}")
    print(f"Average Levenshtein (user_input vs razaosocial): {avg_lev_dist_razaosocial:.4f}")
    print(f"Average CER (user_input vs nome_fantasia): {avg_cer_nome_fantasia:.4f}")
    print(f"Average WER (user_input vs nome_fantasia): {avg_wer_nome_fantasia:.4f}")
    print(f"Average Levenshtein (user_input vs nome_fantasia): {avg_lev_dist_nome_fantasia:.4f}")
    print(f"Average Jaccard Similarity ({jacc_mode} level) (user_input vs razaosocial): {avg_jac_sim_razaosocial:.4f}")
    print(f"Average Jaccard Similarity ({jacc_mode} level) (user_input vs nome_fantasia): {avg_jac_sim_nome_fantasia:.4f}")

    print("\n" + "="*50)
    print("DETAILED STATISTICS")
    print("="*50)
    print("\nCER Statistics:")
    print(df[['cer_razaosocial', 'cer_nome_fantasia']].describe())

    print("\nWER Statistics:")
    print(df[['wer_razaosocial', 'wer_nome_fantasia']].describe())

    print("\nLevenshtein Statistics:")
    print(df[['lev_dist_razaosocial', 'lev_dist_nome_fantasia']].describe())

    print("\nJaccard Similarity Statistics:")
    print(df[[jacc_razaosocial_col, jacc_nome_fantasia_col]].describe())

    return {
        'avg_cer_razaosocial': avg_cer_razaosocial,
        'avg_wer_razaosocial': avg_wer_razaosocial,
        'avg_cer_nome_fantasia': avg_cer_nome_fantasia,
        'avg_wer_nome_fantasia': avg_wer_nome_fantasia,
        'avg_lev_dist_razaosocial': avg_lev_dist_razaosocial,
        'avg_lev_dist_nome_fantasia': avg_lev_dist_nome_fantasia,
        'avg_jac_sim_razaosocial': avg_jac_sim_razaosocial,
        'avg_jac_sim_nome_fantasia': avg_jac_sim_nome_fantasia
    }

# Print the summaries and keep the returned averages in metrics_stats
metrics_stats = analyze_metrics_statistics(df)

## 2.3 CER, WER e Distância de Levenshtein: `user_input` vs Outras Empresas (Outputs Não Esperados)

Vamos calcular CER, WER e a Distância de Levenshtein entre o `user_input` e as colunas `razaosocial` e `nome_fantasia` de todas as outras empresas (outputs não esperados) e adicionar essas métricas como novas colunas no DataFrame.


In [48]:
from typing import List, Tuple, Callable
from enum import Enum
import pandas as pd

class SimilarityMetric(Enum):
    """Enum for available similarity metrics.

    Each member is a key of ``TextRetrieval._METRIC_CONFIG``, which maps it
    to a metric function and a sort direction.
    """
    # Distance metrics (lower is better)
    CER = "cer"
    WER = "wer"
    LEVENSHTEIN = "levenshtein"
    # Similarity metrics (higher is better)
    JACCARD_CHARS = "jaccard_chars"
    JACCARD_WORDS = "jaccard_words"
    NGRAM_JACCARD = "ngram_jaccard"

class TextRetrieval:
    """
    A modular class for text retrieval using various similarity metrics.
    All methods return matches for both 'razaosocial' and 'nome_fantasia' columns.

    Each candidate row is ranked by the better of its two scores (razaosocial
    or nome_fantasia against the user input). Rows with equal scores keep
    their original DataFrame order.
    """

    # Define metric configurations: (metric_function, reverse_sort)
    # reverse_sort=False for distance metrics (lower is better)
    # reverse_sort=True for similarity metrics (higher is better)
    _METRIC_CONFIG = {
        SimilarityMetric.CER: (TextMetrics.calculate_cer, False),
        SimilarityMetric.WER: (TextMetrics.calculate_wer, False),
        SimilarityMetric.LEVENSHTEIN: (TextMetrics.calculate_normalized_levenshtein, False),
        SimilarityMetric.JACCARD_CHARS: (TextMetrics.calculate_jaccard_similarity_chars, True),
        SimilarityMetric.JACCARD_WORDS: (TextMetrics.calculate_jaccard_similarity_words, True),
        SimilarityMetric.NGRAM_JACCARD: (
            lambda text1, text2: TextMetrics.multi_ngram_jaccard_similarity(
                text1, text2, ngram_sizes=[2, 3], weights=[0.5, 0.5]
            ),
            True
        ),
    }

    @staticmethod
    def find_best_matches(
        user_input: str,
        df: pd.DataFrame,
        metric: SimilarityMetric,
        top_k: int = 1
    ) -> Tuple[List[str], List[str], List[float], List[float]]:
        """
        Find the top-k best matches using the specified similarity metric.

        Args:
            user_input (str): Input text to match against
            df (pd.DataFrame): DataFrame containing 'razaosocial' and 'nome_fantasia' columns
            metric (SimilarityMetric): The similarity metric to use
            top_k (int): Number of top matches to return

        Returns:
            Tuple[List[str], List[str], List[float], List[float]]:
                razaosocial matches, nome_fantasia matches, razaosocial scores, nome_fantasia scores

        Raises:
            ValueError: If the metric is not supported
        """
        if metric not in TextRetrieval._METRIC_CONFIG:
            raise ValueError(f"Unsupported metric: {metric}")

        metric_func, reverse_sort = TextRetrieval._METRIC_CONFIG[metric]

        return TextRetrieval._find_matches_with_metric_vectorized(
            user_input, df, metric_func, reverse_sort, top_k
        )


    @staticmethod
    def _find_matches_with_metric_vectorized(
        user_input: str,
        df: pd.DataFrame,
        metric_func: Callable[[str, str], float],
        reverse_sort: bool,
        top_k: int
    ) -> Tuple[List[str], List[str], List[float], List[float]]:
        """
        Rank all candidate rows with pandas/NumPy operations and return the top-k.

        Args:
            user_input (str): Input text to match against
            df (pd.DataFrame): DataFrame containing 'razaosocial' and 'nome_fantasia' columns
            metric_func (Callable): Function to calculate similarity/distance
            reverse_sort (bool): True for similarity metrics (higher is better),
                False for distance metrics (lower is better)
            top_k (int): Number of top matches to return

        Returns:
            Tuple[List[str], List[str], List[float], List[float]]:
                razaosocial matches, nome_fantasia matches, razaosocial scores,
                nome_fantasia scores. A score is NaN when it could not be
                computed; such rows are ranked last rather than dropped.
        """
        # Filter out rows where both columns are NaN
        valid_mask = ~(df['razaosocial'].isna() & df['nome_fantasia'].isna())
        if not valid_mask.any():
            return [], [], [], []

        # Boolean indexing already yields a new frame and it is only read
        # below, so no extra per-query copy is needed.
        df_valid = df[valid_mask]

        def score_candidate(candidate):
            return metric_func(candidate, user_input) if pd.notna(candidate) else np.nan

        razao_scores = df_valid['razaosocial'].apply(score_candidate).astype(float)
        nome_scores = df_valid['nome_fantasia'].apply(score_candidate).astype(float)

        # Best score per row; a missing score is replaced by the worst possible
        # value so it can never win the comparison.
        if reverse_sort:  # Similarity metrics (higher is better)
            best_scores = np.maximum(
                razao_scores.fillna(-np.inf).to_numpy(),
                nome_scores.fillna(-np.inf).to_numpy()
            )
            sort_keys = -best_scores  # negate so an ascending sort ranks the highest first
        else:  # Distance metrics (lower is better)
            best_scores = np.minimum(
                razao_scores.fillna(np.inf).to_numpy(),
                nome_scores.fillna(np.inf).to_numpy()
            )
            sort_keys = best_scores

        # A stable sort keeps rows with equal scores in their original order
        # for both sort directions (reversing an ascending sort would flip ties).
        top_positions = np.argsort(sort_keys, kind='stable')[:top_k]

        top_rows = df_valid.iloc[top_positions]
        best_razao_matches = top_rows['razaosocial'].tolist()
        best_nome_fantasia_matches = top_rows['nome_fantasia'].tolist()
        best_razao_scores = razao_scores.iloc[top_positions].tolist()
        best_nome_fantasia_scores = nome_scores.iloc[top_positions].tolist()

        return best_razao_matches, best_nome_fantasia_matches, best_razao_scores, best_nome_fantasia_scores

    @staticmethod
    def _find_matches_with_metric(
        user_input: str,
        df: pd.DataFrame,
        metric_func: Callable[[str, str], float],
        reverse_sort: bool,
        top_k: int
    ) -> Tuple[List[str], List[str], List[float], List[float]]:
        """
        Row-by-row reference implementation of the match search.

        Unlike the vectorized variant, rows for which neither score could be
        computed are skipped instead of being ranked last.

        Args:
            user_input (str): Input text to match against
            df (pd.DataFrame): DataFrame containing 'razaosocial' and 'nome_fantasia' columns
            metric_func (Callable): Function to calculate similarity/distance
            reverse_sort (bool): Whether to sort in descending order (True for similarity, False for distance)
            top_k (int): Number of top matches to return

        Returns:
            Tuple[List[str], List[str], List[float], List[float]]:
                razaosocial matches, nome_fantasia matches, razaosocial scores, nome_fantasia scores
        """
        razao_social_cands = []
        nome_fantasia_cands = []
        razao_social_cand_sims = []
        nome_fantasia_cand_sims = []
        max_sims = []

        for _, row in df.iterrows():
            razaosocial_cand = row['razaosocial']
            nome_fantasia_cand = row['nome_fantasia']

            if pd.isna(razaosocial_cand) and pd.isna(nome_fantasia_cand):
                continue

            # Calculate scores for both columns
            razaosocial_cand_sim = None
            nome_fantasia_cand_sim = None

            if not pd.isna(razaosocial_cand):
                razaosocial_cand_sim = metric_func(razaosocial_cand, user_input)

            if not pd.isna(nome_fantasia_cand):
                nome_fantasia_cand_sim = metric_func(nome_fantasia_cand, user_input)

            # Keep only the scores that could actually be computed
            valid_scores = [
                score for score in [razaosocial_cand_sim, nome_fantasia_cand_sim]
                if score is not None and not pd.isna(score)
            ]

            if valid_scores:
                if reverse_sort:  # For similarity metrics (higher is better)
                    max_similarity = max(valid_scores)
                else:  # For distance metrics (lower is better)
                    max_similarity = min(valid_scores)

                razao_social_cands.append(razaosocial_cand)
                nome_fantasia_cands.append(nome_fantasia_cand)
                razao_social_cand_sims.append(razaosocial_cand_sim)
                nome_fantasia_cand_sims.append(nome_fantasia_cand_sim)
                max_sims.append(max_similarity)

        if not razao_social_cands:
            return [], [], [], []

        # Create DataFrame for sorting
        matches_scores = pd.DataFrame({
            'razaosocial_cand': razao_social_cands,
            'nome_fantasia_cand': nome_fantasia_cands,
            'razaosocial_cand_sim': razao_social_cand_sims,
            'nome_fantasia_cand_sim': nome_fantasia_cand_sims,
            'max_similarity': max_sims
        })

        # Sort by best score; the stable sort keeps tied rows in input order
        matches_scores = matches_scores.sort_values(
            by='max_similarity',
            ascending=not reverse_sort,
            kind='stable'
        )

        # Return top-k matches
        top_matches = matches_scores.head(top_k)
        best_razao_matches = top_matches['razaosocial_cand'].tolist()
        best_nome_fantasia_matches = top_matches['nome_fantasia_cand'].tolist()
        best_razao_scores = top_matches['razaosocial_cand_sim'].tolist()
        best_nome_fantasia_scores = top_matches['nome_fantasia_cand_sim'].tolist()

        return best_razao_matches, best_nome_fantasia_matches, best_razao_scores, best_nome_fantasia_scores

    # All metric-specific methods now return the same 4-element tuple
    @staticmethod
    def find_best_matches_cer(user_input: str, df: pd.DataFrame, top_k: int = 1) -> Tuple[List[str], List[str], List[float], List[float]]:
        """Find the top-k best matches using Character Error Rate (CER)."""
        return TextRetrieval.find_best_matches(user_input, df, SimilarityMetric.CER, top_k)

    @staticmethod
    def find_best_matches_wer(user_input: str, df: pd.DataFrame, top_k: int = 1) -> Tuple[List[str], List[str], List[float], List[float]]:
        """Find the top-k best matches using Word Error Rate (WER)."""
        return TextRetrieval.find_best_matches(user_input, df, SimilarityMetric.WER, top_k)

    @staticmethod
    def find_best_matches_levenshtein(user_input: str, df: pd.DataFrame, top_k: int = 1) -> Tuple[List[str], List[str], List[float], List[float]]:
        """Find the top-k best matches using normalized Levenshtein distance."""
        return TextRetrieval.find_best_matches(user_input, df, SimilarityMetric.LEVENSHTEIN, top_k)

    @staticmethod
    def find_best_matches_jaccard_chars(user_input: str, df: pd.DataFrame, top_k: int = 1) -> Tuple[List[str], List[str], List[float], List[float]]:
        """Find the top-k best matches using character-level Jaccard similarity."""
        return TextRetrieval.find_best_matches(user_input, df, SimilarityMetric.JACCARD_CHARS, top_k)

    @staticmethod
    def find_best_matches_jaccard_words(user_input: str, df: pd.DataFrame, top_k: int = 1) -> Tuple[List[str], List[str], List[float], List[float]]:
        """Find the top-k best matches using word-level Jaccard similarity."""
        return TextRetrieval.find_best_matches(user_input, df, SimilarityMetric.JACCARD_WORDS, top_k)

    @staticmethod
    def find_best_matches_ngram_jaccard(user_input: str, df: pd.DataFrame, top_k: int = 1) -> Tuple[List[str], List[str], List[float], List[float]]:
        """Find the top-k best matches using n-gram Jaccard similarity."""
        return TextRetrieval.find_best_matches(user_input, df, SimilarityMetric.NGRAM_JACCARD, top_k)


In [49]:
def _extract_metric_name(column):
    """Return the metric embedded in a ``{field}_top_{k}_{metric}_pred`` column name, or None."""
    if not isinstance(column, str):
        return None
    if '_top_' not in column or not column.endswith('_pred'):
        return None
    parts = column.split('_')
    # '_top_' being present guarantees a standalone 'top' token exists.
    top_idx = parts.index('top')
    # Skip 'top' and the token after it (the k value); drop the trailing 'pred'.
    metric = '_'.join(parts[top_idx + 2:-1])
    # A column such as 'field_top_5_pred' carries no metric name at all.
    return metric or None


def _mean_found_rank(ranks: pd.Series) -> float:
    """Mean of the strictly positive ranks; ``inf`` when the target was never found (rank 0)."""
    found = ranks[ranks > 0]
    return found.mean() if len(found) > 0 else float('inf')


def analyze_combined_metric_performance(results_df: pd.DataFrame, 
                                        top_k: int = 1) -> pd.DataFrame:
    """
    Summarize retrieval performance for every metric present in an evaluation frame.

    Metrics are discovered from column names of the form
    ``{field}_top_{k}_{metric}_pred``. For each metric the following columns
    must exist for both ``razaosocial`` and ``nome_fantasia``:
    ``{field}_top_{top_k}_{metric}_pred``, ``{field}_top_1_{metric}_pred`` and
    ``{field}_rank_{metric}``. Metrics with missing columns are reported and skipped.

    Args:
        results_df: DataFrame returned from evaluate_retrieval_accuracy_top_k
        top_k: Which top-k accuracy to analyze

    Returns:
        DataFrame with one row per metric (top-k accuracy, top-1 accuracy, mean
        rank and retrieval rate per field and overall), sorted by
        ``overall_top_1_accuracy`` descending and indexed 0..n-1 in that order.
        An empty DataFrame is returned when no metric could be analyzed.
    """
    # Sorted so that processing order (and therefore tie order) is deterministic.
    available_metrics = sorted(
        {name for name in map(_extract_metric_name, results_df.columns) if name}
    )
    print(f"Found metrics: {available_metrics}")

    performance_results = []

    for metric in available_metrics:
        # Construct column names
        razaosocial_top_k_col = f'razaosocial_top_{top_k}_{metric}_pred'
        nome_fantasia_top_k_col = f'nome_fantasia_top_{top_k}_{metric}_pred'
        razaosocial_top_1_col = f'razaosocial_top_1_{metric}_pred'
        nome_fantasia_top_1_col = f'nome_fantasia_top_1_{metric}_pred'
        razaosocial_rank_col = f'razaosocial_rank_{metric}'
        nome_fantasia_rank_col = f'nome_fantasia_rank_{metric}'

        required_cols = [razaosocial_top_k_col, nome_fantasia_top_k_col,
                         razaosocial_top_1_col, nome_fantasia_top_1_col,
                         razaosocial_rank_col, nome_fantasia_rank_col]

        missing_cols = [col for col in required_cols if col not in results_df.columns]
        if missing_cols:
            print(f"Warning: Missing columns for metric '{metric}': {missing_cols}")
            continue

        try:
            # Accuracy is the mean of the boolean "hit" columns.
            razaosocial_top_k_accuracy = results_df[razaosocial_top_k_col].mean()
            nome_fantasia_top_k_accuracy = results_df[nome_fantasia_top_k_col].mean()
            overall_top_k_accuracy = (razaosocial_top_k_accuracy + nome_fantasia_top_k_accuracy) / 2

            razaosocial_top_1_accuracy = results_df[razaosocial_top_1_col].mean()
            nome_fantasia_top_1_accuracy = results_df[nome_fantasia_top_1_col].mean()
            overall_top_1_accuracy = (razaosocial_top_1_accuracy + nome_fantasia_top_1_accuracy) / 2

            # Rank 0 means "not found"; those rows are excluded from the mean rank.
            razaosocial_ranks = results_df[razaosocial_rank_col]
            nome_fantasia_ranks = results_df[nome_fantasia_rank_col]
            mean_razaosocial_rank = _mean_found_rank(razaosocial_ranks)
            mean_nome_fantasia_rank = _mean_found_rank(nome_fantasia_ranks)

            # Overall mean rank averages only the fields that were found at least once.
            finite_ranks = [r for r in (mean_razaosocial_rank, mean_nome_fantasia_rank)
                            if r != float('inf')]
            overall_mean_rank = (sum(finite_ranks) / len(finite_ranks)
                                 if finite_ranks else float('inf'))

            # Retrieval rate: share of rows where the target appeared in the returned matches.
            razaosocial_retrieval_rate = (razaosocial_ranks > 0).mean()
            nome_fantasia_retrieval_rate = (nome_fantasia_ranks > 0).mean()
            overall_retrieval_rate = (razaosocial_retrieval_rate + nome_fantasia_retrieval_rate) / 2
        except (TypeError, ValueError) as e:
            # Non-numeric / non-boolean data in one metric must not abort the others.
            print(f"Error processing metric '{metric}': {e}")
            continue

        performance_results.append({
            'metric': metric,
            f'razaosocial_top_{top_k}_accuracy': razaosocial_top_k_accuracy,
            f'nome_fantasia_top_{top_k}_accuracy': nome_fantasia_top_k_accuracy,
            f'overall_top_{top_k}_accuracy': overall_top_k_accuracy,
            'razaosocial_top_1_accuracy': razaosocial_top_1_accuracy,
            'nome_fantasia_top_1_accuracy': nome_fantasia_top_1_accuracy,
            'overall_top_1_accuracy': overall_top_1_accuracy,
            'mean_razaosocial_rank': mean_razaosocial_rank,
            'mean_nome_fantasia_rank': mean_nome_fantasia_rank,
            'overall_mean_rank': overall_mean_rank,
            'razaosocial_retrieval_rate': razaosocial_retrieval_rate,
            'nome_fantasia_retrieval_rate': nome_fantasia_retrieval_rate,
            'overall_retrieval_rate': overall_retrieval_rate
        })

    if not performance_results:
        print("No valid metrics found for analysis")
        return pd.DataFrame()

    performance_df = pd.DataFrame(performance_results)

    # Best metric first; a stable sort keeps ties in alphabetical metric order, and the
    # index is reset so that it reflects the ranking position.
    performance_df = (
        performance_df
        .sort_values('overall_top_1_accuracy', ascending=False, kind='mergesort')
        .reset_index(drop=True)
    )

    # Round numeric columns for better readability
    numeric_columns = performance_df.select_dtypes(include=[np.number]).columns
    performance_df[numeric_columns] = performance_df[numeric_columns].round(4)

    return performance_df

def print_performance_summary(performance_df: pd.DataFrame, top_k: int = 1):
    """
    Print a formatted summary of the performance analysis.

    The ranking section numbers the rows by their position in ``performance_df``
    (which is expected to be sorted best-first), not by the DataFrame index.
    The detailed breakdown covers at most the first five rows.

    Args:
        performance_df: DataFrame from analyze_combined_metric_performance
        top_k: The top-k value used in analysis
    """
    if performance_df.empty:
        print("No performance data to display")
        return

    separator = '=' * 80
    print(f"\n{separator}")
    print(f"PERFORMANCE SUMMARY - TOP-{top_k} RETRIEVAL ANALYSIS")
    print(separator)

    print("\n📊 RANKING BY OVERALL TOP-1 ACCURACY:")
    print("-" * 50)
    # enumerate() gives the ranking position; the index may still hold pre-sort labels.
    for position, (_, row) in enumerate(performance_df.iterrows(), start=1):
        print(f"{position:2d}. {row['metric']:<15} | Top-1: {row['overall_top_1_accuracy']:.3f} | "
              f"Top-{top_k}: {row[f'overall_top_{top_k}_accuracy']:.3f} | "
              f"Avg Rank: {row['overall_mean_rank']:.2f}")

    print("\n🎯 DETAILED BREAKDOWN:")
    print("-" * 80)

    for _, row in performance_df.head(5).iterrows():  # Show top 5 metrics
        print(f"\n{row['metric'].upper()}:")
        print(f"  • Razão Social    - Top-1: {row['razaosocial_top_1_accuracy']:.3f}, "
              f"Top-{top_k}: {row[f'razaosocial_top_{top_k}_accuracy']:.3f}, "
              f"Retrieval Rate: {row['razaosocial_retrieval_rate']:.3f}")
        print(f"  • Nome Fantasia   - Top-1: {row['nome_fantasia_top_1_accuracy']:.3f}, "
              f"Top-{top_k}: {row[f'nome_fantasia_top_{top_k}_accuracy']:.3f}, "
              f"Retrieval Rate: {row['nome_fantasia_retrieval_rate']:.3f}")

In [69]:
def evaluate_retrieval_accuracy_top_k(df: pd.DataFrame,
                                      top_k: int=1,
                                      sample_size: int=None,
                                      metrics: List[str]=None) -> Tuple[pd.DataFrame, List[str]]:
    """
    Evaluate top-k retrieval of ``razaosocial`` / ``nome_fantasia`` from ``user_input``.

    Every evaluated row is matched against the rows of the evaluated sample that
    share its ``uf``. For each metric and each field the result row receives:
    ``{field}_top_{top_k}_{metric}_pred`` (true value among the returned matches),
    ``{field}_top_1_{metric}_pred`` (true value is the first match),
    ``{field}_rank_{metric}`` (1-based position of the true value, 0 if absent) and
    ``best_{metric}_score_{field}`` (score of the first match, None if there is none).

    Args:
        df: DataFrame with columns ``user_input``, ``uf``, ``razaosocial``, ``nome_fantasia``.
        top_k: Number of matches requested from each metric.
        sample_size: If set and smaller than ``len(df)``, evaluate a random sample
            of that size (fixed seed 42); otherwise evaluate all rows.
        metrics: Names of the metrics to evaluate. Defaults to ``['jaccard_ngram']``.
            Valid names: 'cer', 'wer', 'levenshtein', 'jaccard_chars',
            'jaccard_words', 'jaccard_ngram'.

    Returns:
        Tuple ``(results_df, last_matches)``. ``results_df`` has one row per
        evaluated record. ``last_matches`` is the ``nome_fantasia`` match list of
        the last processed record for the last metric (an empty list when nothing
        was processed); it is kept only for backward compatibility with callers
        that unpack two values.

    Raises:
        ValueError: If ``metrics`` contains an unknown metric name.
    """
    # Sample data if specified
    if sample_size and sample_size < len(df):
        df_sample = df.sample(n=sample_size, random_state=42)
        print(f"Using sample of {sample_size} records for evaluation")
    else:
        df_sample = df
        print(f"Using all {len(df_sample)} records for evaluation")

    metrics_funcs = {
        'cer': TextRetrieval.find_best_matches_cer,
        'wer': TextRetrieval.find_best_matches_wer,
        'levenshtein': TextRetrieval.find_best_matches_levenshtein,
        'jaccard_chars': TextRetrieval.find_best_matches_jaccard_chars,
        'jaccard_words': TextRetrieval.find_best_matches_jaccard_words,
        'jaccard_ngram': TextRetrieval.find_best_matches_ngram_jaccard
    }
    metrics_to_evaluate = ['jaccard_ngram'] if metrics is None else list(metrics)
    unknown_metrics = [name for name in metrics_to_evaluate if name not in metrics_funcs]
    if unknown_metrics:
        raise ValueError(
            f"Unknown metrics {unknown_metrics}; available metrics: {sorted(metrics_funcs)}"
        )
    print(f"Evaluating retrieval accuracy with top-{top_k} results using metrics: {metrics_to_evaluate}")

    results = []
    # Candidate frame per UF, computed on first use instead of re-filtering for every row.
    candidates_by_uf = {}
    # Defined up-front so that an empty input does not raise NameError at return time.
    last_matches = []

    for _, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Processing"):
        user_input = row['user_input']
        uf = row['uf']
        true_values = {
            'razaosocial': row['razaosocial'],
            'nome_fantasia': row['nome_fantasia'],
        }
        result_row = {
            'user_input': user_input,
            'uf': uf,
            'true_razaosocial': true_values['razaosocial'],
            'true_nome_fantasia': true_values['nome_fantasia'],
        }

        df_uf = candidates_by_uf.get(uf)
        if df_uf is None:
            df_uf = df_sample[df_sample['uf'] == uf]
            candidates_by_uf[uf] = df_uf

        for metric in metrics_to_evaluate:
            razao_matches, fantasia_matches, razao_scores, fantasia_scores = (
                metrics_funcs[metric](user_input, df_uf, top_k)
            )
            per_field = [
                ('razaosocial', razao_matches, razao_scores),
                ('nome_fantasia', fantasia_matches, fantasia_scores),
            ]
            for field, matches, scores in per_field:
                true_value = true_values[field]
                found = true_value in matches

                # Top-k accuracy
                result_row[f'{field}_top_{top_k}_{metric}_pred'] = found
                # Top-1 accuracy
                result_row[f'{field}_top_1_{metric}_pred'] = matches[0] == true_value if matches else False
                # Ranking (1-based, 0 = not among the matches)
                result_row[f'{field}_rank_{metric}'] = matches.index(true_value) + 1 if found else 0
                # Score of the best match; always written so every row has the same keys.
                result_row[f'best_{metric}_score_{field}'] = scores[0] if (matches and scores) else None

                last_matches = matches

        results.append(result_row)

    return pd.DataFrame(results), last_matches


import json, os
def evaluate_retrieval_accuracy_top_k_with_checkpoint(df: pd.DataFrame,
                                                     top_k: int=1,
                                                     sample_size: int=None,
                                                     output_file: str="retrieval_results.json",
                                                     checkpoint_every: int=10) -> Tuple[pd.DataFrame, Dict]:
    """
    Top-k retrieval evaluation that checkpoints to JSON and can resume.

    Records are processed UF by UF; each record is matched against all records of
    the evaluated sample that share its ``uf``. Progress (result rows and the
    detailed matches) is written to ``output_file`` every ``checkpoint_every``
    newly processed records, on interruption/error, and at the end. If
    ``output_file`` already exists, records whose ``test_index`` is stored there
    are skipped. ``test_index`` is the row position after ``reset_index``, so a
    checkpoint is only meaningful for the same ``df`` / ``sample_size``.

    NOTE: every checkpoint rewrites the complete JSON document, so checkpoint cost
    grows with the number of processed records; raise ``checkpoint_every`` for
    large runs.

    Args:
        df: DataFrame with columns ``user_input``, ``uf``, ``razaosocial``, ``nome_fantasia``
        top_k: Number of top matches to return
        sample_size: Number of samples to use (None for all)
        output_file: Path to JSON file for storing results
        checkpoint_every: Save to file every N records (must be >= 1)

    Returns:
        Tuple ``(results_df, all_matches)``: one result row per processed record
        (including rows loaded from the checkpoint) and a dict keyed by
        ``"test_{index}"`` with the detailed matches per metric.

    Raises:
        ValueError: If ``checkpoint_every`` is smaller than 1.
    """
    if checkpoint_every < 1:
        raise ValueError("checkpoint_every must be >= 1")

    # Sample data if specified
    if sample_size and sample_size < len(df):
        df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
        print(f"Using sample of {sample_size} records for evaluation")
    else:
        df_sample = df.reset_index(drop=True)
        print(f"Using all {len(df_sample)} records for evaluation")

    # Load existing results if file exists
    processed_indices = set()
    existing_results = []
    existing_matches = {}

    if os.path.exists(output_file):
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            existing_results = data.get('results', [])
            existing_matches = data.get('matches', {})
            processed_indices = {item['test_index'] for item in existing_results}
            print(f"Loaded {len(existing_results)} existing results from {output_file}")
            # Processing is grouped by UF, not sequential by index, so report a count
            # instead of a misleading "resume index".
            print(f"{len(processed_indices)} records already processed and will be skipped")
            saved_top_k = data.get('metadata', {}).get('top_k')
            if saved_top_k is not None and saved_top_k != top_k:
                print(f"Warning: checkpoint was written with top_k={saved_top_k}, "
                      f"but top_k={top_k} was requested")
        except Exception as e:
            print(f"Error loading existing file: {e}. Starting fresh.")
            processed_indices = set()
            existing_results = []
            existing_matches = {}

    # Group by UF once to avoid repeated filtering
    uf_groups = df_sample.groupby('uf')

    results = list(existing_results)
    all_matches = dict(existing_matches)

    metrics_to_evaluate = ['jaccard_ngram']
    metrics_funcs = {
        'jaccard_ngram': TextRetrieval.find_best_matches_ngram_jaccard
    }

    print(f"Evaluating retrieval accuracy with top-{top_k} results using metrics: {metrics_to_evaluate}")

    records_processed = 0

    # Count records still to process (only rows that belong to a UF group).
    total_new_records = sum(
        1
        for _, uf_group in uf_groups
        for idx in uf_group.index
        if idx not in processed_indices
    )
    print(f"Total new records to process: {total_new_records}")

    def _json_default(value):
        """Convert NumPy scalars/arrays, which ``json`` cannot serialize natively."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    def save_checkpoint():
        """Save current progress to JSON file"""
        checkpoint_data = {
            'metadata': {
                'total_records': len(df_sample),
                'processed_records': len(results),
                'top_k': top_k,
                'metrics': metrics_to_evaluate,
                'last_updated': pd.Timestamp.now().isoformat()
            },
            'results': results,
            'matches': all_matches
        }

        # Write to a temporary file first, then swap it in. os.replace overwrites an
        # existing destination on every platform (os.rename fails on Windows).
        temp_file = output_file + '.tmp'
        with open(temp_file, 'w', encoding='utf-8') as f:
            json.dump(checkpoint_data, f, ensure_ascii=False, indent=2, default=_json_default)
        os.replace(temp_file, output_file)

    # Process each UF group separately
    try:
        with tqdm(total=total_new_records, desc="Processing records") as pbar:
            for uf, uf_group in uf_groups:
                for original_idx in uf_group.index:
                    # Skip if already processed
                    if original_idx in processed_indices:
                        continue

                    row = df_sample.loc[original_idx]
                    user_input = row['user_input']
                    true_razaosocial = row['razaosocial']
                    true_nome_fantasia = row['nome_fantasia']

                    # Unique key for this record inside all_matches
                    record_key = f"test_{original_idx}"

                    result_row = {
                        'test_index': original_idx,
                        'record_key': record_key,
                        'user_input': user_input,
                        'uf': uf,
                        'true_razaosocial': true_razaosocial,
                        'true_nome_fantasia': true_nome_fantasia,
                        'processed_at': pd.Timestamp.now().isoformat()
                    }

                    all_matches[record_key] = {}

                    for metric in metrics_to_evaluate:
                        metric_func = metrics_funcs[metric]

                        try:
                            best_razao_matches, best_nome_fantasia_matches, best_razao_scores, best_nome_fantasia_scores = metric_func(user_input, uf_group, top_k)

                            all_matches[record_key][metric] = {
                                'razaosocial_matches': best_razao_matches,
                                'nome_fantasia_matches': best_nome_fantasia_matches,
                                'razaosocial_scores': best_razao_scores,
                                'nome_fantasia_scores': best_nome_fantasia_scores,
                                'uf': uf
                            }

                            for field, matches, scores, true_value in [
                                ('razaosocial', best_razao_matches, best_razao_scores, true_razaosocial),
                                ('nome_fantasia', best_nome_fantasia_matches, best_nome_fantasia_scores, true_nome_fantasia)
                            ]:
                                # Top-k accuracy
                                top_k_pred = true_value in matches
                                result_row[f'{field}_top_{top_k}_{metric}_pred'] = top_k_pred

                                # Top-1 accuracy
                                top_1_pred = matches[0] == true_value if matches else False
                                result_row[f'{field}_top_1_{metric}_pred'] = top_1_pred

                                # Ranking (1-based, 0 = not among the matches)
                                rank = matches.index(true_value) + 1 if top_k_pred else 0
                                result_row[f'{field}_rank_{metric}'] = rank

                                # Score of the best match
                                best_score = scores[0] if scores else None
                                result_row[f'best_{metric}_score_{field}'] = best_score

                                # In the detailed record "not found" is None rather than 0.
                                found_rank = rank if rank > 0 else None

                                all_matches[record_key]['user_input'] = user_input
                                all_matches[record_key][metric][f'{field}_evaluation'] = {
                                    'true_value': true_value,
                                    'top_1': {
                                        'predicted': matches[0] if matches else None,
                                        'score': best_score,
                                        'is_correct': top_1_pred,
                                        'rank': found_rank
                                    },
                                    f'top_{top_k}': {
                                        'predicted_list': matches[:top_k] if matches else [],
                                        'scores_list': scores[:top_k] if scores else [],
                                        'is_correct': top_k_pred,
                                        'rank': found_rank,
                                        'found_at_position': found_rank
                                    },
                                    'all_matches': {
                                        'candidates': matches,
                                        'scores': scores,
                                        'total_candidates': len(matches) if matches else 0
                                    }
                                }

                        except Exception as e:
                            # Best effort: record the failure for this metric and keep going.
                            print(f"Error processing record {original_idx} with metric {metric}: {e}")
                            result_row[f'error_{metric}'] = str(e)
                            all_matches[record_key][metric] = {'error': str(e)}

                    results.append(result_row)
                    records_processed += 1
                    pbar.update(1)

                    # Save checkpoint periodically
                    if records_processed % checkpoint_every == 0:
                        save_checkpoint()

    except KeyboardInterrupt:
        print("\nProcess interrupted. Saving current progress...")
        save_checkpoint()
        print(f"Progress saved. Processed {records_processed} records.")
        raise

    except Exception as e:
        print(f"Error during processing: {e}")
        save_checkpoint()
        print("Progress saved before error.")
        raise

    # Final save
    save_checkpoint()
    print(f"Processing complete. Total records processed: {len(results)}")

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    return results_df, all_matches


In [None]:
# Unique legal names (razaosocial) per state (UF)
unique_razaosocial_dict = {
    uf: df.loc[df['uf'] == uf, 'razaosocial'].dropna().unique().tolist()
    for uf in df['uf'].unique()
}
# Unique trade names (nome_fantasia) per state (UF)
unique_nome_fantasia_dict = {
    uf: df.loc[df['uf'] == uf, 'nome_fantasia'].dropna().unique().tolist()
    for uf in df['uf'].unique()
}
top_k = 5
sample_size = None  # None evaluates every row; set an integer to evaluate a random sample
retrieved, matches = evaluate_retrieval_accuracy_top_k_with_checkpoint(
    df=df,
    top_k=top_k,
    sample_size=sample_size,
)

Using all 255471 records for evaluation
Loaded 5581 existing results from retrieval_results.json
Resuming from index 255459
Evaluating retrieval accuracy with top-5 results using metrics: ['jaccard_ngram']
Total new records to process: 249890


Processing records:  18%|█▊        | 44680/249890 [5:12:07<21:45:38,  2.62it/s]    

In [52]:
# Display the per-record evaluation results
retrieved

Unnamed: 0,user_input,uf,true_razaosocial,true_nome_fantasia,record_key,razaosocial_top_5_jaccard_ngram_pred,razaosocial_top_1_jaccard_ngram_pred,razaosocial_rank_jaccard_ngram,best_jaccard_ngram_score_razaosocial,nome_fantasia_top_5_jaccard_ngram_pred,nome_fantasia_top_1_jaccard_ngram_pred,nome_fantasia_rank_jaccard_ngram,best_jaccard_ngram_score_nome_fantasia
0,kadrangular ebanjeio,AC,igreja evangelho quadrangular,cruzada nacional evangelizacao,AC_0_kadrangular ebanjeio,True,True,1,0.222488,True,True,1,0.090659
1,cruzada ebanjelica,AC,igreja evangelho quadrangular,cruzada nacional evangelizacao,AC_1_cruzada ebanjelica,True,True,1,0.069728,True,True,1,0.295828
2,maceio auto,AL,m i s barbosa auto repasse,maceio auto repasse,AL_0_maceio auto,True,True,1,0.144963,True,True,1,0.514130
3,cr quatro quatro cinco zero,AL,sapore,cr 4450,AL_1_cr quatro quatro cinco zero,True,True,1,0.000000,True,True,1,0.114224
4,cruzada ebanjelizacao,AL,igreja evangelho quadrangular,cruzada nacional evangelizacao,AL_2_cruzada ebanjelizacao,True,True,1,0.066434,True,True,1,0.470753
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,mcdonalds alimentos,SP,arcos dourados alimentos,mcdonalds,SP_302_mcdonalds alimentos,True,True,1,0.371429,True,False,2,1.000000
996,teodoro business,SP,allpark empreendimentos participacoes,ed teodoro business,SP_303_teodoro business,True,True,1,0.029499,True,True,1,0.728778
997,wms,TO,wms supermercados brasil,atacadao,TO_0_wms,True,True,1,0.116071,True,True,1,0.000000
998,serrado jalapao,TO,pousada cerrado beach jalapao,cerrado beach,TO_1_serrado jalapao,True,True,1,0.411521,True,True,1,0.217593


In [42]:
# Show the first 10 records whose true razaosocial was not among the top-5 jaccard_ngram matches
retrieved[retrieved['razaosocial_top_5_jaccard_ngram_pred'] == False].head(10)

Unnamed: 0,user_input,uf,true_razaosocial,true_nome_fantasia,record_key,razaosocial_top_5_jaccard_ngram_pred,razaosocial_top_1_jaccard_ngram_pred,razaosocial_rank_jaccard_ngram,best_jaccard_ngram_score_razaosocial,nome_fantasia_top_5_jaccard_ngram_pred,nome_fantasia_top_1_jaccard_ngram_pred,nome_fantasia_rank_jaccard_ngram,best_jaccard_ngram_score_nome_fantasia
53,bajir,BA,viacao aerea rio grandense falida,varig,BA_29_bajir,False,False,0,0.103175,False,False,0,0.027526
360,latino farmacia,PR,latino americana medicamentos,farmacia preco popular,PR_60_latino farmacia,False,False,0,0.0,False,False,0,0.375652
452,c,RJ,lojas cem,lojas cem,RJ_16_c,False,False,0,0.062745,False,False,0,0.0
496,sia latino farmacia,RS,latino americana medicamentos,farmacia preco popular,RS_1_sia latino farmacia,False,False,0,0.0,False,False,0,0.337469
561,seeg,RS,cpfl transmissao,ceee gt,RS_66_seeg,False,False,0,0.0,False,False,0,0.1125
592,lojas,RS,solar agroindustria,lojas solar,RS_97_lojas,False,False,0,0.077381,False,False,0,0.464286
726,goban industria,SP,saint gobain brasil produtos industriais const...,sekurit,SP_33_goban industria,False,False,0,0.318182,False,False,0,0.0
733,social,SP,sest social transporte,sao paulo sp unidade n 01,SP_40_social,False,False,0,0.231111,False,False,0,0.222527
815,oab sao paulo,SP,ordem advogados brasil seccao sao paulo,sub seccao promissao,SP_122_oab sao paulo,False,False,0,0.417391,False,False,0,0.417391
831,mitra diosesana,SP,mitra diocesana campo limpo,paroquia santa luzia,SP_138_mitra diosesana,False,False,0,0.532468,False,False,0,0.029337


In [39]:
# Aggregate the per-record results into one summary row per metric, then print the report
performance_analysis = analyze_combined_metric_performance(retrieved, top_k=top_k)
print_performance_summary(performance_analysis, top_k=top_k)

Found metrics: ['jaccard_ngram']

PERFORMANCE SUMMARY - TOP-5 RETRIEVAL ANALYSIS

📊 RANKING BY OVERALL TOP-1 ACCURACY:
--------------------------------------------------
 1. jaccard_ngram   | Top-1: 0.890 | Top-5: 0.977 | Avg Rank: 1.15

🎯 DETAILED BREAKDOWN:
--------------------------------------------------------------------------------

JACCARD_NGRAM:
  • Razão Social    - Top-1: 0.938, Top-5: 0.989, Retrieval Rate: 0.989
  • Nome Fantasia   - Top-1: 0.842, Top-5: 0.964, 


In [None]:
# Write `df` to cleaned_data.csv without the index column
df.to_csv("cleaned_data.csv", index=False)

In [None]:
#retrieved.to_csv("10000_rand_42_retrieved.csv", index=False)

In [None]:
def calculate_classification_metrics(evaluation_df):
    """
    Calculate accuracy, precision, recall, and F1 score from evaluation results.

    Scenarios are taken from two sources:
      * the legacy fixed column names (``{field}_top_k_{cer|wer|lev_dist}_pred`` and
        the ``top_1`` equivalents), used only when the column is present;
      * any other column matching ``*_top_*_pred`` (e.g.
        ``razaosocial_top_5_jaccard_ngram_pred``), keyed by the column name
        without the ``_pred`` suffix.

    Every row is treated as a positive ground-truth case (``y_true`` is all ones),
    so recall equals accuracy and precision is 1.0 whenever at least one
    prediction is correct (0.0 otherwise, via ``zero_division=0``).
    Missing values in a prediction column count as incorrect.

    Parameters:
    evaluation_df (pd.DataFrame): Output from evaluate_retrieval_accuracy function

    Returns:
    dict: Dictionary containing all metrics for different scenarios (empty when the
          frame has no prediction columns)
    """

    metrics_results = {}

    # Legacy scenario names and the columns they map to
    legacy_scenarios = [
        ('razaosocial_top_k_cer', 'razaosocial_top_k_cer_pred'),
        ('nome_fantasia_top_k_cer', 'nome_fantasia_top_k_cer_pred'),
        ('razaosocial_top_k_wer', 'razaosocial_top_k_wer_pred'),
        ('nome_fantasia_top_k_wer', 'nome_fantasia_top_k_wer_pred'),
        ('razaosocial_top_k_levenshtein', 'razaosocial_top_k_lev_dist_pred'),
        ('nome_fantasia_top_k_levenshtein', 'nome_fantasia_top_k_lev_dist_pred'),

        ('razaosocial_top_1_cer', 'razaosocial_top_1_cer_pred'),
        ('nome_fantasia_top_1_cer', 'nome_fantasia_top_1_cer_pred'),
        ('razaosocial_top_1_wer', 'razaosocial_top_1_wer_pred'),
        ('nome_fantasia_top_1_wer', 'nome_fantasia_top_1_wer_pred'),
        ('razaosocial_top_1_levenshtein', 'razaosocial_top_1_lev_dist_pred'),
        ('nome_fantasia_top_1_levenshtein', 'nome_fantasia_top_1_lev_dist_pred')
    ]

    # Keep only legacy scenarios whose column exists instead of failing with KeyError.
    scenarios = [(name, column) for name, column in legacy_scenarios
                 if column in evaluation_df.columns]
    covered_columns = {column for _, column in scenarios}

    # Pick up the prediction columns actually present in the frame.
    for column in evaluation_df.columns:
        if (isinstance(column, str)
                and '_top_' in column
                and column.endswith('_pred')
                and column not in covered_columns):
            scenarios.append((column[:-len('_pred')], column))

    print("Classification Metrics Summary")
    print("=" * 50)

    if not scenarios:
        print("No prediction columns found for evaluation")
        return metrics_results

    for scenario_name, correct_column in scenarios:
        print(f"\n{scenario_name.upper()} Results:")
        print("-" * 30)

        # Get true labels (1 for correct, 0 for incorrect)
        y_true = [1] * len(evaluation_df)
        # Predicted labels: 1 where the retrieval was correct; missing values become 0
        y_pred = evaluation_df[correct_column].eq(True).astype(int).tolist()

        # Calculate basic metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        correct_predictions = sum(y_pred)
        total_predictions = len(y_pred)

        metrics_results[scenario_name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'correct_predictions': correct_predictions,
            'total_predictions': total_predictions
        }

        print(f"Accuracy:  {accuracy:.4f} ({correct_predictions}/{total_predictions})")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1 Score:  {f1:.4f}")

    return metrics_results

In [None]:

# Compute accuracy, precision, recall and F1 for the evaluation results in `retrieved`
comprehensive_metrics = calculate_classification_metrics(retrieved)