In [90]:
import pandas as pd
import yaml
import unicodedata
import numpy as np
import re
import Levenshtein
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from jiwer import wer, cer
from tqdm import tqdm

warnings.filterwarnings('ignore')
# Set plotting style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

In [91]:
JACCARD_MODE = "ngram"

# 1. Carregando os Dados


In [92]:

confs = yaml.safe_load(open("confs.yaml"))
predictors = confs["predictors"] ### Importante! O cientista poderá usar apenas estas features para criar/aperfeiçoar o modelo
text_target = confs["text_target"]
cols_to_keep = predictors + text_target
df = pd.read_parquet("dados/train.parquet")[cols_to_keep]
df.to_csv("data.csv")
print("\nMissing values per column:")
print(df.isnull().sum())
print(f"\nTotal rows with any missing values: {df.isnull().any(axis=1).sum()}")



Missing values per column:
user_input       0
uf               0
razaosocial      0
nome_fantasia    0
dtype: int64

Total rows with any missing values: 0


# 2. Limpando os Dados

Iremos remover palavras como "S.A.", "LTDA", "LTDA.", "S/A", "S.A", "Ltda", "Ltda.", "S/A.", "S.A.", "S.A", "Ltda" e "Ltda" dos nomes reais das empresas a serem previstos, usando a seguinte suposicao:

- Suposicao 1: usuários tem o hábito de pesquisar por nomes de empresas sem essas palavras, então elas não devem ser consideradas na previsão.


In [93]:
def comprehensive_text_cleaning(text, 
                               remove_accents=True,
                               remove_stop_words=True, 
                               remove_company_suffixes=True,
                               custom_stop_words=None,
                               to_lowercase=True):
    """
    Comprehensive text cleaning function
    
    Parameters:
    text (str): Input text
    remove_accents (bool): Remove accents and normalize characters
    remove_stop_words (bool): Remove Portuguese stop words
    remove_company_suffixes (bool): Remove common company suffixes
    custom_stop_words (set): Additional stop words to remove
    to_lowercase (bool): Convert to lowercase
    
    Returns:
    str: Cleaned text
    """
    
    if pd.isna(text):
        return text
    
    text = str(text)
    
    # 1. Remove accents and normalize characters
    if remove_accents:
        # Normalize unicode
        text = unicodedata.normalize('NFD', text)
        text = ''.join(char for char in text if unicodedata.category(char) != 'Mn')
        
        # Handle specific cases
        text = text.replace('ç', 'c').replace('Ç', 'C')
    
    # 2. Convert to lowercase
    if to_lowercase:
        text = text.lower()
    
    # 3. Remove company suffixes
    if remove_company_suffixes:
        patterns_to_remove = [
        r'\bS\.?A\.?\b',           # S.A, SA, S.A., SA.
        r'\bS/A\.?\b',             # S/A, S/A.
        r'\bLTDA\.?\b',            # LTDA, LTDA.
        r'\bLIMITADA\b',           # LIMITADA
        r'\bCIA\.?\b',             # CIA, CIA.
        r'\bCOMPANHIA\b',          # COMPANHIA
        r'\bEMPRESA\b',            # EMPRESA
        r'\bCOMERCIO\b',           # COMERCIO
        r'\bSERVICOS?\b',          # SERVICO, SERVICOS
        r'\bME\b',                 # ME (Microempresa)
        r'\bEPP\b',                # EPP (Empresa de Pequeno Porte)
        r'\bEIRELI\b',             # EIRELI
        r'\bSOCIEDADE\b',          # SOCIEDADE
        r'ADMINISTRADORA\b',       # ADMINISTRADORA
        r'GERAL\b',                # GERAL
    ]
        
        for pattern in patterns_to_remove:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    
    # 4. Remove stop words
    if remove_stop_words:
        portuguese_stop_words = {
            'a', 'ao', 'aos', 'as', 'da', 'das', 'de', 'do', 'dos', 'e', 'em', 'na', 
            'nas', 'no', 'nos', 'o', 'os', 'para', 'por', 'com', 'um', 'uma', 'uns', 
            'umas', 'se', 'que', 'ou', 'mas', 'como', 'mais', 'muito', 'sua', 'seu',
            'seus', 'suas', 'este', 'esta', 'estes', 'estas', 'esse', 'essa', 'esses',
            'essas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'isso', 'aquilo'
        }
        
        if custom_stop_words:
            portuguese_stop_words.update(custom_stop_words)
        
        words = text.split()
        words = [word for word in words if word.lower() not in portuguese_stop_words]
        text = ' '.join(words)
    
    # 5. Clean up extra whitespace and special characters
    text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)      # Multiple spaces to single space
    text = text.strip()                   # Remove leading/trailing spaces
    
    return text

# Usage
df['razaosocial'] = df['razaosocial'].apply(comprehensive_text_cleaning)
df['nome_fantasia'] = df['nome_fantasia'].apply(comprehensive_text_cleaning)
df['user_input'] = df['user_input'].apply(comprehensive_text_cleaning)

# 2. Análise de Métricas de Character Error Rate (CER), Word Error Rate (WER) e Distancia de Levenshtein

- **Word Error Rate (WER)**: fórmula para calcular a taxa de erro a nível de palavras: 
  $$WER = \frac{S + D + I}{N}$$
  onde:
  - $S$ é o número de substituições. Por exemplo, se o usuário digitou "Empresa X" e a referência é "Empresa Y", então há uma substituição.
  - $D$ é o número de deleções. Por exemplo, se o usuário digitou "Empresa" e a referência é "Empresa X", então há uma deleção.
  - $I$ é o número de inserções. Por exemplo, se o usuário digitou "Empresa X Y" e a referência é "Empresa X", então há uma inserção.
  - $N$ é o número total de palavras na referência. Por exemplo, se a referência é "Empresa X", então $N$ é 2.

- **Character Error Rate (CER)**: fórmula para calcular a taxa de erro a nível de caracteres:
  $$CER = \frac{S + D + I}{N}$$
  onde:
  - $S$ é o número de substituições. Por exemplo, se o usuário digitou "EmpresaXY" e a referência é "EmpresaXZ", então há uma substituição.
  - $D$ é o número de deleções. Por exemplo, se o usuário digitou "Empresa" e a referência é "EmpresaX", então há uma deleção.
  - $I$ é o número de inserções. Por exemplo, se o usuário digitou "Empresa XY" e a referência é "Empresa X", então há uma inserção.
  - $N$ é o número total de caracteres na referência. Por exemplo, se a referência é "Empresa X", então $N$ é 9 (contando espaços).

- **Distância de Levenshtein**: é uma métrica que mede a diferença entre duas sequências. É definida como o número mínimo de operações de edição (inserções, deleções ou substituições) necessárias para transformar uma sequência em outra.

- **Similaridade de Jaccard**: é uma métrica que mede a similaridade entre dois conjuntos. É definida como o tamanho da interseção dividido pelo tamanho da união dos conjuntos.
  $$J(A, B) = \frac{|A \cap B|}{|A \cup B|}$$


Essas métricas serao úteis para avaliar o quão diferente os inputs de usuário (`user_input`) são dos outputs esperados `razaosocial` e `nome_fantasia` e também dos outputs nao esperados, i.e., de todas as empresas que nao correspondem ao input do usuário. 

Caso o `CER` e/ou  `WER` entre o `user_input` e dos outputs nao esperados seja significativamente maior do que o `CER` e/ou `WER` entre o `user_input` e dos outputs esperados, podemos concluir que o input do usuário é mais próximo dos outputs esperados do que dos outputs não esperados e utilizar a minimização de `CER` e `WER` como critério para selecionar a empresa correta.



## 2.1 Calculando o CER, WER e a Distância de Levenshtein

Vamos usar a implementação do pacote já importado `jiwer` para calcular CER e WER. Para a Distância de Levenshtein, vamos usar a função `distance` do pacote `Levenshtein`.


In [94]:
from typing import Callable, Union, Set, Dict
import pandas as pd
import numpy as np
import Levenshtein
from functools import wraps

def validate_inputs(func: Callable) -> Callable:
    """
    Decorator to handle input validation and error handling for text comparison metrics.
    """
    @wraps(func)
    def wrapper(reference: Union[str, float], hypothesis: Union[str, float], *args, **kwargs) -> float:
        try:
            # Handle NaN values
            if pd.isna(reference) or pd.isna(hypothesis):
                return np.nan
            
            # Convert to string and clean
            reference = str(reference).strip()
            hypothesis = str(hypothesis).strip()
            
            # Handle empty strings
            if len(reference) == 0 or len(hypothesis) == 0:
                return np.nan
                
            return func(reference, hypothesis, *args, **kwargs)
            
        except Exception as e:
            print(f"Error calculating {func.__name__}: {e}")
            return np.nan
            
    return wrapper

class TextMetrics:
    """
    A class containing various text comparison metrics with input validation.
    """
    
    @staticmethod
    @validate_inputs
    def calculate_cer(reference: str, hypothesis: str) -> float:
        """Calculate Character Error Rate."""
        return cer(reference, hypothesis)
    
    @staticmethod
    @validate_inputs
    def calculate_wer(reference: str, hypothesis: str) -> float:
        """Calculate Word Error Rate."""
        return wer(reference, hypothesis)
    
    @staticmethod
    @validate_inputs
    def calculate_normalized_levenshtein(reference: str, hypothesis: str) -> float:
        """
        Calculate normalized Levenshtein distance (0-1).
        Returns:
            float: Normalized Levenshtein distance between 0 and 1
        """
        max_len = max(len(reference), len(hypothesis))
        if max_len == 0:
            return 0.0
        
        distance = Levenshtein.distance(reference, hypothesis)
        return distance / max_len

    @staticmethod
    def _get_character_set(text: str) -> Set[str]:
        """
        Convert text to a set of characters.
        
        Args:
            text (str): Input text
            
        Returns:
            Set[str]: Set of characters from the input text
        """
        return set(text)

    @staticmethod
    def _get_word_set(text: str) -> Set[str]:
        """
        Convert text to a set of words.
        
        Args:
            text (str): Input text
            
        Returns:
            Set[str]: Set of words from the input text
        """
        return set(text.lower().split())

    @staticmethod
    def _calculate_jaccard_similarity(set1: Set[str], set2: Set[str]) -> float:
        """
        Calculate Jaccard similarity between two sets.
        
        Args:
            set1 (Set[str]): First set
            set2 (Set[str]): Second set
            
        Returns:
            float: Jaccard similarity score between 0 and 1
        """
        if not set1 and not set2:  # Both sets are empty
            return 1.0
        if not set1 or not set2:   # One set is empty
            return 0.0
            
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return intersection / union

    @staticmethod
    @validate_inputs
    def calculate_jaccard_similarity_chars(reference: str, hypothesis: str) -> float:
        """
        Calculate Jaccard similarity based on character sets.
        
        Args:
            reference (str): Reference text
            hypothesis (str): Hypothesis text
            
        Returns:
            float: Jaccard similarity score between 0 and 1
        """
        ref_chars = TextMetrics._get_character_set(reference)
        hyp_chars = TextMetrics._get_character_set(hypothesis)
        return TextMetrics._calculate_jaccard_similarity(ref_chars, hyp_chars)

    @staticmethod
    @validate_inputs
    def calculate_jaccard_similarity_words(reference: str, hypothesis: str) -> float:
        """
        Calculate Jaccard similarity based on word sets.
        
        Args:
            reference (str): Reference text
            hypothesis (str): Hypothesis text
            
        Returns:
            float: Jaccard similarity score between 0 and 1
        """
        ref_words = TextMetrics._get_word_set(reference)
        hyp_words = TextMetrics._get_word_set(hypothesis)
        return TextMetrics._calculate_jaccard_similarity(ref_words, hyp_words)

    @staticmethod 
    def _ngram_jaccard_similarity(reference: str, hypothesis: str, n=2):
        """
        Calculate Jaccard similarity using character n-grams.
        This handles inversions and some misspellings well.
        
        Args:
            str1, str2: Input strings
            n: N-gram size (2=bigrams, 3=trigrams, etc.)
        """
        def get_ngrams(text, n):
            """Generate n-grams from text with padding."""
            # Add padding to capture beginning/end patterns
            padded = '#' * (n-1) + text.lower() + '#' * (n-1)
            return set(padded[i:i+n] for i in range(len(padded) - n + 1))
        
        ngrams1 = get_ngrams(reference, n)
        ngrams2 = get_ngrams(hypothesis, n)
        
        intersection = len(ngrams1 & ngrams2)
        union = len(ngrams1 | ngrams2)
        
        return intersection / union if union > 0 else 1.0 if len(reference) == len(hypothesis) == 0 else 0.0

    
    @staticmethod
    def multi_ngram_jaccard_similarity(reference: str, hypothesis: str, ngram_sizes=[2, 3], weights=None):
        """
        Combine multiple n-gram sizes for better robustness.
        """
        if weights is None:
            weights = [1.0] * len(ngram_sizes)
        
        if len(weights) != len(ngram_sizes):
            raise ValueError("Number of weights must match number of n-gram sizes")
        
        total_score = 0
        total_weight = sum(weights)
        
        for size, weight in zip(ngram_sizes, weights):
            score = TextMetrics._ngram_jaccard_similarity(reference, hypothesis, size)
            total_score += score * weight
        
        return total_score / total_weight

## 2.2 CER, WER e Distancia de Levenshtein: `user_input` vs Ground Truth (`razaosocial` e `nome_fantasia`)

Vamos calcular o CER, o WER  e a distancia de Levenshtein entre o `user_input` e as colunas `razaosocial` e `nome_fantasia` do DataFrame e adicionar essas métricas como novas colunas no DataFrame. 


- `cer_razaosocial`: CER entre `user_input` e `razaosocial`
- `wer_razaosocial`: WER entre `user_input` e `razaosocial`
- `lev_dist__razaosocial`: Distância de Levenshtein entre `user_input` e `razaosocial`
- `cer_nome_fantasia`: CER entre `user_input` e `nome_fantasia`
- `wer_nome_fantasia`: WER entre `user_input` e `nome_fantasia`
- `lev_dist__nome_fantasia`: Distância de Levenshtein entre `user_input` e `nome_fantasia`
- `jac_sim_razaosocial`: Similaridade de Jaccard entre `user_input` e `razaosocial`
- `jac_sim_nome_fantasia`: Similaridade de Jaccard entre `user_input` e `nome_fantasia`

In [95]:
import pandas as pd
from typing import List, Tuple


def calculate_metrics(row: pd.Series,
                      reference_col: str,
                      hypothesis_col: str,
                      metrics: TextMetrics,
                      jacc_mode: str) -> Tuple[float, float, float, float]:
    """
    Calculate CER, WER, Levenshtein distance, and Jaccard similarity for a single row.
    
    Args:
        row (pd.Series): A row from the DataFrame
        reference_col (str): Name of the column containing the reference text
        hypothesis_col (str): Name of the column containing the hypothesis text
        metrics (TextMetrics): An instance of the TextMetrics class
    
    Returns:
        Tuple[float, float, float, float]: CER, WER, Levenshtein distance, and Jaccard similarity
    """
    reference = row[reference_col]
    hypothesis = row[hypothesis_col]
    
    cer = metrics.calculate_cer(reference, hypothesis)
    wer = metrics.calculate_wer(reference, hypothesis)
    levenshtein = metrics.calculate_normalized_levenshtein(reference, hypothesis)
    if jacc_mode == "char":
        jaccard = metrics.calculate_jaccard_similarity_chars(reference, hypothesis)
    elif jacc_mode == "word":
        jaccard = metrics.calculate_jaccard_similarity_words(reference, hypothesis)
    elif jacc_mode == "ngram":
        jaccard = metrics.multi_ngram_jaccard_similarity(reference, hypothesis, ngram_sizes=[2, 3], weights=[0.5, 0.5])
    
    return cer, wer, levenshtein, jaccard

def apply_metrics(df: pd.DataFrame,
                  reference_cols: List[str],
                  hypothesis_col: str,
                  metrics: TextMetrics,
                  jacc_mode: str) -> pd.DataFrame:
    """
    Apply text metrics to multiple reference columns against a single hypothesis column.
    
    Args:
        df (pd.DataFrame): Input DataFrame
        reference_cols (List[str]): List of column names to use as reference texts
        hypothesis_col (str): Column name to use as hypothesis text
        metrics (TextMetrics): An instance of the TextMetrics class
    
    Returns:
        pd.DataFrame: DataFrame with added metric columns
    """
    for ref_col in reference_cols:
        print(f"Calculating metrics for {hypothesis_col} vs {ref_col}...")
        
        cer_col = f'cer_{ref_col}'
        wer_col = f'wer_{ref_col}'
        lev_dist_col = f'lev_dist_{ref_col}'
        jaccard_col = f'jacc_sim_{jacc_mode}_{ref_col}'
        
        df[[cer_col, wer_col, lev_dist_col, jaccard_col]] = df.apply(
            lambda row: calculate_metrics(row, ref_col, hypothesis_col, metrics, jacc_mode),
            axis=1,
            result_type='expand'
        )
    
    return df

def calculate_error_rates(df: pd.DataFrame,
                          jacc_mode: str = "word") -> pd.DataFrame:
    """
    Calculate CER, WER, Levenshtein distance, and Jaccard 
    similarity for user_input vs razaosocial and nome_fantasia.
    
    Args:
        df (pd.DataFrame): Input DataFrame
    
    Returns:
        pd.DataFrame: DataFrame with added metric columns
    """
    print("\nCalculating CER, WER, Levenshtein distance, and Jaccard similarity...")
    
    metrics = TextMetrics()
    reference_columns = ['razaosocial', 'nome_fantasia']
    hypothesis_column = 'user_input'
    
    df = apply_metrics(df, reference_columns, hypothesis_column, metrics, jacc_mode=jacc_mode)
    
    print("Calculations completed!")
    return df

# Usage
df = calculate_error_rates(df, jacc_mode=JACCARD_MODE)


Calculating CER, WER, Levenshtein distance, and Jaccard similarity...
Calculating metrics for user_input vs razaosocial...
Calculating metrics for user_input vs nome_fantasia...
Calculations completed!


In [41]:
def analyze_metrics_statistics(df: pd.DataFrame):
    """Calculate and display statistical summaries"""
    # Calculate average metrics
    avg_cer_razaosocial = df['cer_razaosocial'].mean()
    avg_wer_razaosocial = df['wer_razaosocial'].mean()
    avg_lev_dist_razaosocial = df['lev_dist_razaosocial'].mean()
    avg_cer_nome_fantasia = df['cer_nome_fantasia'].mean()
    avg_wer_nome_fantasia = df['wer_nome_fantasia'].mean()
    avg_lev_dist_nome_fantasia = df['lev_dist_nome_fantasia'].mean()
    avg_jac_sim_razaosocial = df[f'jacc_sim_{JACCARD_MODE}_razaosocial'].mean()
    avg_jac_sim_nome_fantasia = df[f'jacc_sim_{JACCARD_MODE}_nome_fantasia'].mean()
    
    print("\n" + "="*50)
    print("AVERAGE ERROR RATES")
    print("="*50)
    print(f"Average CER (user_input vs razaosocial): {avg_cer_razaosocial:.4f}")
    print(f"Average WER (user_input vs razaosocial): {avg_wer_razaosocial:.4f}")
    print(f"Average Levenshtein (user_input vs razaosocial): {avg_lev_dist_razaosocial:.4f}")
    print(f"Average CER (user_input vs nome_fantasia): {avg_cer_nome_fantasia:.4f}")
    print(f"Average WER (user_input vs nome_fantasia): {avg_wer_nome_fantasia:.4f}")
    print(f"Average Levenshtein (user_input vs nome_fantasia): {avg_lev_dist_nome_fantasia:.4f}")
    print(f"Average Jaccard Similarity ({JACCARD_MODE} level) (user_input vs razaosocial): {avg_jac_sim_razaosocial:.4f}")
    print(f"Average Jaccard Similarity ({JACCARD_MODE} level) (user_input vs nome_fantasia): {avg_jac_sim_nome_fantasia:.4f}")
    
    print("\n" + "="*50)
    print("DETAILED STATISTICS")
    print("="*50)
    print("\nCER Statistics:")
    print(df[['cer_razaosocial', 'cer_nome_fantasia']].describe())
    
    print("\nWER Statistics:")
    print(df[['wer_razaosocial', 'wer_nome_fantasia']].describe())

    print("\nLevenshtein Statistics:")
    print(df[['lev_dist_razaosocial', 'lev_dist_nome_fantasia']].describe())

    print("\nJaccard Similarity Statistics:")
    print(df[[f'jacc_sim_{JACCARD_MODE}_razaosocial', f'jacc_sim_{JACCARD_MODE}_nome_fantasia']].describe())

    
    return {
        'avg_cer_razaosocial': avg_cer_razaosocial,
        'avg_wer_razaosocial': avg_wer_razaosocial,
        'avg_cer_nome_fantasia': avg_cer_nome_fantasia,
        'avg_wer_nome_fantasia': avg_wer_nome_fantasia,
        'avg_lev_dist_razaosocial': avg_lev_dist_razaosocial,
        'avg_lev_dist_nome_fantasia': avg_lev_dist_nome_fantasia,
        'avg_jac_sim_razaosocial': avg_jac_sim_razaosocial,
        'avg_jac_sim_nome_fantasia': avg_jac_sim_nome_fantasia
    }

metrics_stats = analyze_metrics_statistics(df)


AVERAGE ERROR RATES
Average CER (user_input vs razaosocial): 0.6507
Average WER (user_input vs razaosocial): 0.8398
Average Levenshtein (user_input vs razaosocial): 0.6018
Average CER (user_input vs nome_fantasia): 0.7442
Average WER (user_input vs nome_fantasia): 0.8768
Average Levenshtein (user_input vs nome_fantasia): 0.5012
Average Jaccard Similarity (ngram level) (user_input vs razaosocial): 0.3157
Average Jaccard Similarity (ngram level) (user_input vs nome_fantasia): 0.4107

DETAILED STATISTICS

CER Statistics:
       cer_razaosocial  cer_nome_fantasia
count    255450.000000      255081.000000
mean          0.650650           0.744236
std           3.815269          15.901897
min           0.000000           0.000000
25%           0.470588           0.333333
50%           0.666667           0.588235
75%           0.794872           0.791667
max        1771.071429        7819.333333

WER Statistics:
       wer_razaosocial  wer_nome_fantasia
count    255450.000000      255081.000

## 2.3 CER, WER e Distancia de Levshenstein: `user_input` vs Outras Empresas (Outputs Não Esperados)

Vamos calcular CER, WER e a Distancia de Levshenstein entre o `user_input` e as colunas `razaosocial` e `nome_fantasia` de todas as outras empresas (outputs não esperados) e adicionar essas métricas como novas colunas no DataFrame.


In [96]:
class TextRetrieval:
    """
    A class for text retrieval using various similarity metrics.
    """
    
    @staticmethod
    def find_best_matches_cer(user_input: str, candidates: List[str], top_k: int = 1) -> Tuple[List[str], List[float]]:
        """
        Find the top-k best matches using Character Error Rate (CER).
        
        Args:
            user_input (str): Input text to match against
            candidates (List[str]): List of candidate texts
            top_k (int): Number of top matches to return
            
        Returns:
            Tuple[List[str], List[float]]: Best matches and their CER scores
        """
        matches_scores = []
        
        for candidate in candidates:
            if pd.isna(candidate):
                continue
                
            cer_score = TextMetrics.calculate_cer(candidate, user_input)
            if not pd.isna(cer_score):
                matches_scores.append((candidate, cer_score))
        
        # Sort by CER score (ascending - lower is better)
        matches_scores.sort(key=lambda x: x[1])
        
        # Return top-k matches
        top_matches = matches_scores[:top_k]
        best_matches = [match for match, _ in top_matches]
        best_scores = [score for _, score in top_matches]
        
        return best_matches, best_scores

    @staticmethod
    def find_best_matches_wer(user_input: str, candidates: List[str], top_k: int = 1) -> Tuple[List[str], List[float]]:
        """
        Find the top-k best matches using Word Error Rate (WER).
        
        Args:
            user_input (str): Input text to match against
            candidates (List[str]): List of candidate texts
            top_k (int): Number of top matches to return
            
        Returns:
            Tuple[List[str], List[float]]: Best matches and their WER scores
        """
        matches_scores = []
        
        for candidate in candidates:
            if pd.isna(candidate):
                continue
                
            wer_score = TextMetrics.calculate_wer(candidate, user_input)
            if not pd.isna(wer_score):
                matches_scores.append((candidate, wer_score))
        
        # Sort by WER score (ascending - lower is better)
        matches_scores.sort(key=lambda x: x[1])
        
        # Return top-k matches
        top_matches = matches_scores[:top_k]
        best_matches = [match for match, _ in top_matches]
        best_scores = [score for _, score in top_matches]
        
        return best_matches, best_scores

    @staticmethod
    def find_best_matches_levenshtein(user_input: str, candidates: List[str], top_k: int = 1) -> Tuple[List[str], List[float]]:
        """
        Find the top-k best matches using normalized Levenshtein distance.
        
        Args:
            user_input (str): Input text to match against
            candidates (List[str]): List of candidate texts
            top_k (int): Number of top matches to return
            
        Returns:
            Tuple[List[str], List[float]]: Best matches and their Levenshtein distances
        """
        matches_scores = []
        
        for candidate in candidates:
            if pd.isna(candidate):
                continue
                
            distance = TextMetrics.calculate_normalized_levenshtein(candidate, user_input)
            if not pd.isna(distance):
                matches_scores.append((candidate, distance))
        
        # Sort by distance (ascending - lower is better)
        matches_scores.sort(key=lambda x: x[1])
        
        # Return top-k matches
        top_matches = matches_scores[:top_k]
        best_matches = [match for match, _ in top_matches]
        best_scores = [score for _, score in top_matches]
        
        return best_matches, best_scores

    @staticmethod
    def find_best_matches_jaccard_chars(user_input: str, candidates: List[str], top_k: int = 1) -> Tuple[List[str], List[float]]:
        """
        Find the top-k best matches using character-level Jaccard similarity.
        
        Args:
            user_input (str): Input text to match against
            candidates (List[str]): List of candidate texts
            top_k (int): Number of top matches to return
            
        Returns:
            Tuple[List[str], List[float]]: Best matches and their Jaccard similarity scores
        """
        matches_scores = []
        
        for candidate in candidates:
            if pd.isna(candidate):
                continue
                
            similarity = TextMetrics.calculate_jaccard_similarity_chars(candidate, user_input)
            if not pd.isna(similarity):
                matches_scores.append((candidate, similarity))
        
        # Sort by similarity (descending - higher is better)
        matches_scores.sort(key=lambda x: x[1], reverse=True)
        
        # Return top-k matches
        top_matches = matches_scores[:top_k]
        best_matches = [match for match, _ in top_matches]
        best_scores = [score for _, score in top_matches]
        
        return best_matches, best_scores

    @staticmethod
    def find_best_matches_jaccard_words(user_input: str, candidates: List[str], top_k: int = 1) -> Tuple[List[str], List[float]]:
        """
        Find the top-k best matches using word-level Jaccard similarity.
        
        Args:
            user_input (str): Input text to match against
            candidates (List[str]): List of candidate texts
            top_k (int): Number of top matches to return
            
        Returns:
            Tuple[List[str], List[float]]: Best matches and their Jaccard similarity scores
        """
        matches_scores = []
        
        for candidate in candidates:
            if pd.isna(candidate):
                continue
                
            similarity = TextMetrics.calculate_jaccard_similarity_words(candidate, user_input)
            if not pd.isna(similarity):
                matches_scores.append((candidate, similarity))
        
        # Sort by similarity (descending - higher is better)
        matches_scores.sort(key=lambda x: x[1], reverse=True)
        
        # Return top-k matches
        top_matches = matches_scores[:top_k]
        best_matches = [match for match, _ in top_matches]
        best_scores = [score for _, score in top_matches]
        
        return best_matches, best_scores

    @staticmethod
    def find_best_matches_ngram_jaccard(user_input: str, df: pd.DataFrame, top_k: int = 1) -> Tuple[List[str], List[float]]:
        """
        Find the top-k best matches using n-gram Jaccard similarity.
        
        Args:
            user_input (str): Input text to match against
            candidates (List[str]): List of candidate texts
            top_k (int): Number of top matches to return
            
        Returns:
            Tuple[List[str], List[float]]: Best matches and their Jaccard similarity scores
        """
        
        razao_social_cands = []
        nome_fantasia_cands = []
        razao_social_cand_sims = []
        nome_fantasia_cand_sims = []
        max_sims = []
        for _, row in df.iterrows():
            razaosocial_cand = row['razaosocial']
            nome_fantasia_cand = row['nome_fantasia']
            if pd.isna(razaosocial_cand) and pd.isna(nome_fantasia_cand):
                continue
                
            razaosocial_cand_sim = TextMetrics.multi_ngram_jaccard_similarity(razaosocial_cand, user_input, ngram_sizes=[2, 3], weights=[0.5, 0.5])
            nome_fantasia_cand_sim = TextMetrics.multi_ngram_jaccard_similarity(nome_fantasia_cand, user_input, ngram_sizes=[2, 3], weights=[0.5, 0.5])
            max_similarity = max(razaosocial_cand_sim, nome_fantasia_cand_sim)
            if razaosocial_cand_sim is not None and nome_fantasia_cand_sim is not None:
                razao_social_cands.append(razaosocial_cand)
                nome_fantasia_cands.append(nome_fantasia_cand)
                razao_social_cand_sims.append(razaosocial_cand_sim)
                nome_fantasia_cand_sims.append(nome_fantasia_cand_sim)
                max_sims.append(max_similarity)     
        
        matches_scores = pd.DataFrame({
            'razaosocial_cand': razao_social_cands,
            'nome_fantasia_cand': nome_fantasia_cands,
            'razaosocial_cand_sim': razao_social_cand_sims,
            'nome_fantasia_cand_sim': nome_fantasia_cand_sims,
            'max_similarity': max_sims
        })
        # Sort by similarity (descending - higher is better)
        matches_scores.sort_values(by='max_similarity', ascending=False, inplace=True)
        # Return top-k matches
        top_matches = matches_scores.head(top_k)
        best_razao_matches = top_matches['razaosocial_cand'].tolist()
        best_nome_fantasia_matches = top_matches['nome_fantasia_cand'].tolist()
        best_razao_scores = top_matches['razaosocial_cand_sim'].tolist()
        best_nome_fantasia_scores = top_matches['nome_fantasia_cand_sim'].tolist()
        
        return best_razao_matches, best_nome_fantasia_matches, best_razao_scores, best_nome_fantasia_scores
    @staticmethod
    def find_best_matches_combined(user_input: str, candidates: List[str], top_k: int = 1, 
                                 weights: dict = None) -> Tuple[List[str], List[float]]:
        """
        Find the top-k best matches using a combination of multiple metrics.
        
        Args:
            user_input (str): Input text to match against
            candidates (List[str]): List of candidate texts
            top_k (int): Number of top matches to return
            weights (dict): Weights for different metrics. Default uses equal weights.
            
        Returns:
            Tuple[List[str], List[float]]: Best matches and their combined scores
        """
        if weights is None:
            weights = {
                'cer': 0.0,
                'wer': 0.0,
                'levenshtein': 0.5,
                'jaccard_chars': 0.0,
                'jaccard_words': 0.0,  # Set to 0 if not used
                'ngram_jaccard': 0.5    # Set to 0 if not used
            }
        
        matches_scores = []
        
        for candidate in candidates:
            if pd.isna(candidate):
                continue
            
            # Calculate all metrics
            cer_score = TextMetrics.calculate_cer(candidate, user_input)
            wer_score = TextMetrics.calculate_wer(candidate, user_input)
            lev_score = TextMetrics.calculate_normalized_levenshtein(candidate, user_input)
            jac_score = TextMetrics.calculate_jaccard_similarity_chars(candidate, user_input)
            
            # Skip if any metric failed
            if any(pd.isna(score) for score in [cer_score, wer_score, lev_score, jac_score]):
                continue
            
            # Normalize scores (convert distance metrics to similarity)
            # For CER, WER, and Levenshtein: lower is better, so we use (1 - score)
            # For Jaccard: higher is better, so we use score directly
            normalized_cer = max(0, 1 - cer_score)
            normalized_wer = max(0, 1 - wer_score)
            normalized_lev = max(0, 1 - lev_score)
            normalized_jac = jac_score
            
            # Calculate weighted combined score
            combined_score = (
                weights.get('cer', 0) * normalized_cer +
                weights.get('wer', 0) * normalized_wer +
                weights.get('levenshtein', 0) * normalized_lev +
                weights.get('jaccard_chars', 0) * normalized_jac
            )
            
            matches_scores.append((candidate, combined_score))
        
        # Sort by combined score (descending - higher is better)
        matches_scores.sort(key=lambda x: x[1], reverse=True)
        
        # Return top-k matches
        top_matches = matches_scores[:top_k]
        best_matches = [match for match, _ in top_matches]
        best_scores = [score for _, score in top_matches]
        
        return best_matches, best_scores


In [97]:
def evaluate_retrieval_accuracy_top_k(df: pd.DataFrame,
                                      unique_razaosocial_dict: Dict[str, List[str]],
                                      unique_nome_fantasia_dict: Dict[str, List[str]],
                                      top_k: int=1,
                                      sample_size: int=None,
                                      include_jaccard: bool=True,
                                      include_combined: bool=True,
                                      combined_weights: dict = None) -> pd.DataFrame:
    """Enhanced evaluation with all available metrics including Jaccard similarity and combined metrics"""
    
    # Sample data if specified
    if sample_size and sample_size < len(df):
        df_sample = df.sample(n=sample_size, random_state=42)
        print(f"Using sample of {sample_size} records for evaluation")
    else:
        df_sample = df
        print(f"Using all {len(df_sample)} records for evaluation")
    
    results = []
    
    # Define base metrics
    metrics_to_evaluate = ['cer', 'wer', 'levenshtein']
    
    # Add Jaccard metrics if requested
    if include_jaccard:
        metrics_to_evaluate.extend(['jaccard_chars', 'jaccard_words', 'jaccard_ngram'])
    
    # Add combined metric if requested
    if include_combined:
        metrics_to_evaluate.append('combined')
    
    # Set default weights for combined metric if not provided
    if combined_weights is None:
        if include_jaccard:
            combined_weights = {
                'cer': 0.2,
                'wer': 0.2,
                'levenshtein': 0.2,
                'jaccard_chars': 0.2,
                'jaccard_words': 0.2
            }
        else:
            combined_weights = {
                'cer': 0.33,
                'wer': 0.33,
                'levenshtein': 0.34
            }
    
    print(f"Evaluating retrieval accuracy with top-{top_k} results using metrics: {metrics_to_evaluate}")
    if include_combined:
        print(f"Combined metric weights: {combined_weights}")
    
    for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Processing"):
        user_input = row['user_input']
        uf = row['uf']
        true_razaosocial = row['razaosocial']
        true_nome_fantasia = row['nome_fantasia']
        unique_razaosocial = unique_razaosocial_dict.get(uf, [])
        unique_nome_fantasia = unique_nome_fantasia_dict.get(uf, [])
        unique_nome_fantasia = df_sample[df_sample['uf'] == uf]['nome_fantasia'].unique().tolist()
        result_row = {
            'user_input': user_input,
            'uf': uf,
            'true_razaosocial': true_razaosocial,
            'true_nome_fantasia': true_nome_fantasia,
        }
        
        # Dictionary to store all matches for each metric
        all_matches = {}
        
        # Find matches using individual metrics
        all_matches['cer'] = {
            'razaosocial': TextRetrieval.find_best_matches_cer(user_input, unique_razaosocial, top_k),
            'nome_fantasia': TextRetrieval.find_best_matches_cer(user_input, unique_nome_fantasia, top_k)
        }
        
        all_matches['wer'] = {
            'razaosocial': TextRetrieval.find_best_matches_wer(user_input, unique_razaosocial, top_k),
            'nome_fantasia': TextRetrieval.find_best_matches_wer(user_input, unique_nome_fantasia, top_k)
        }
        
        all_matches['levenshtein'] = {
            'razaosocial': TextRetrieval.find_best_matches_levenshtein(user_input, unique_razaosocial, top_k),
            'nome_fantasia': TextRetrieval.find_best_matches_levenshtein(user_input, unique_nome_fantasia, top_k)
        }
        
        if include_jaccard:
            all_matches['jaccard_chars'] = {
                'razaosocial': TextRetrieval.find_best_matches_jaccard_chars(user_input, unique_razaosocial, top_k),
                'nome_fantasia': TextRetrieval.find_best_matches_jaccard_chars(user_input, unique_nome_fantasia, top_k)
            }
            
            all_matches['jaccard_words'] = {
                'razaosocial': TextRetrieval.find_best_matches_jaccard_words(user_input, unique_razaosocial, top_k),
                'nome_fantasia': TextRetrieval.find_best_matches_jaccard_words(user_input, unique_nome_fantasia, top_k)
            }
            all_matches['jaccard_ngram'] = {
                'razaosocial': TextRetrieval.find_best_matches_ngram_jaccard(user_input, unique_razaosocial, top_k),
                'nome_fantasia': TextRetrieval.find_best_matches_ngram_jaccard(user_input, unique_nome_fantasia, top_k)
            }
        
        # Find matches using combined metric
        if include_combined:
            all_matches['combined'] = {
                'razaosocial': TextRetrieval.find_best_matches_combined(user_input, unique_razaosocial, top_k, combined_weights),
                'nome_fantasia': TextRetrieval.find_best_matches_combined(user_input, unique_nome_fantasia, top_k, combined_weights)
            }
        
        # Process results for each metric
        for metric in metrics_to_evaluate:
            for field in ['razaosocial', 'nome_fantasia']:
                matches, scores = all_matches[metric][field]
                true_value = true_razaosocial if field == 'razaosocial' else true_nome_fantasia
                
                # Top-k accuracy
                top_k_pred = true_value in matches
                metric_col = f'{field}_top_{top_k}_{metric}_pred'
                result_row[metric_col] = top_k_pred

                # Top-1 accuracy
                top_1_pred = matches[0] == true_value if matches else False
                metric_col = f'{field}_top_1_{metric}_pred'
                result_row[metric_col] = top_1_pred
                
                # Ranking
                rank = matches.index(true_value) + 1 if true_value in matches else 0
                result_row[f'{field}_rank_{metric}'] = rank
                
                # Store the actual scores for analysis
                if matches:
                    result_row[f'best_{metric}_score_{field}'] = scores[0] if scores else None
        
        results.append(result_row)
    
    return pd.DataFrame(results)

def analyze_combined_metric_performance(results_df: pd.DataFrame, 
                                        top_k: int = 1) -> pd.DataFrame:
    """
    Analyze the performance of different metrics including the combined metric
    
    Args:
        results_df: DataFrame returned from evaluate_retrieval_accuracy_top_k
        top_k: Which top-k accuracy to analyze
        
    Returns:
        DataFrame with performance metrics for each method
    """
    
    # Extract available metrics from column names
    available_metrics = set()
    
    # Look for columns that match the pattern: {field}_top_{k}_{metric}_pred
    for col in results_df.columns:
        if '_top_' in col and col.endswith('_pred'):
            # Split the column name to extract the metric
            parts = col.split('_')
            if len(parts) >= 4:
                # Find the position of 'top' in the parts
                try:
                    top_idx = parts.index('top')
                    if top_idx + 2 < len(parts):  # Ensure we have enough parts after 'top'
                        # Extract metric name (everything between the number and 'pred')
                        metric_parts = parts[top_idx + 2:-1]  # Skip 'top', number, and 'pred'
                        metric = '_'.join(metric_parts)
                        available_metrics.add(metric)
                except ValueError:
                    continue
    
    print(f"Found metrics: {sorted(available_metrics)}")
    
    performance_results = []
    
    for metric in available_metrics:
        # Construct column names
        razaosocial_top_k_col = f'razaosocial_top_{top_k}_{metric}_pred'
        nome_fantasia_top_k_col = f'nome_fantasia_top_{top_k}_{metric}_pred'
        razaosocial_top_1_col = f'razaosocial_top_1_{metric}_pred'
        nome_fantasia_top_1_col = f'nome_fantasia_top_1_{metric}_pred'
        razaosocial_rank_col = f'razaosocial_rank_{metric}'
        nome_fantasia_rank_col = f'nome_fantasia_rank_{metric}'
        
        # Check if required columns exist
        required_cols = [razaosocial_top_k_col, nome_fantasia_top_k_col, 
                        razaosocial_top_1_col, nome_fantasia_top_1_col,
                        razaosocial_rank_col, nome_fantasia_rank_col]
        
        missing_cols = [col for col in required_cols if col not in results_df.columns]
        if missing_cols:
            print(f"Warning: Missing columns for metric '{metric}': {missing_cols}")
            continue
        
        try:
            # Calculate top-k accuracy
            razaosocial_top_k_accuracy = results_df[razaosocial_top_k_col].mean()
            nome_fantasia_top_k_accuracy = results_df[nome_fantasia_top_k_col].mean()
            overall_top_k_accuracy = (razaosocial_top_k_accuracy + nome_fantasia_top_k_accuracy) / 2
            
            # Calculate top-1 accuracy
            razaosocial_top_1_accuracy = results_df[razaosocial_top_1_col].mean()
            nome_fantasia_top_1_accuracy = results_df[nome_fantasia_top_1_col].mean()
            overall_top_1_accuracy = (razaosocial_top_1_accuracy + nome_fantasia_top_1_accuracy) / 2
            
            # Calculate mean rank (lower is better, 0 means not found)
            # Only consider non-zero ranks for mean calculation
            razaosocial_ranks = results_df[razaosocial_rank_col]
            nome_fantasia_ranks = results_df[nome_fantasia_rank_col]
            
            # Calculate mean rank excluding 0s (not found cases)
            razaosocial_found_ranks = razaosocial_ranks[razaosocial_ranks > 0]
            nome_fantasia_found_ranks = nome_fantasia_ranks[nome_fantasia_ranks > 0]
            
            mean_razaosocial_rank = razaosocial_found_ranks.mean() if len(razaosocial_found_ranks) > 0 else float('inf')
            mean_nome_fantasia_rank = nome_fantasia_found_ranks.mean() if len(nome_fantasia_found_ranks) > 0 else float('inf')
            
            # Overall mean rank
            if mean_razaosocial_rank != float('inf') and mean_nome_fantasia_rank != float('inf'):
                overall_mean_rank = (mean_razaosocial_rank + mean_nome_fantasia_rank) / 2
            elif mean_razaosocial_rank != float('inf'):
                overall_mean_rank = mean_razaosocial_rank
            elif mean_nome_fantasia_rank != float('inf'):
                overall_mean_rank = mean_nome_fantasia_rank
            else:
                overall_mean_rank = float('inf')
            
            # Calculate retrieval rate (percentage of cases where target was found in top-k)
            razaosocial_retrieval_rate = (razaosocial_ranks > 0).mean()
            nome_fantasia_retrieval_rate = (nome_fantasia_ranks > 0).mean()
            overall_retrieval_rate = (razaosocial_retrieval_rate + nome_fantasia_retrieval_rate) / 2
            
            performance_results.append({
                'metric': metric,
                f'razaosocial_top_{top_k}_accuracy': razaosocial_top_k_accuracy,
                f'nome_fantasia_top_{top_k}_accuracy': nome_fantasia_top_k_accuracy,
                f'overall_top_{top_k}_accuracy': overall_top_k_accuracy,
                'razaosocial_top_1_accuracy': razaosocial_top_1_accuracy,
                'nome_fantasia_top_1_accuracy': nome_fantasia_top_1_accuracy,
                'overall_top_1_accuracy': overall_top_1_accuracy,
                'mean_razaosocial_rank': mean_razaosocial_rank,
                'mean_nome_fantasia_rank': mean_nome_fantasia_rank,
                'overall_mean_rank': overall_mean_rank,
                'razaosocial_retrieval_rate': razaosocial_retrieval_rate,
                'nome_fantasia_retrieval_rate': nome_fantasia_retrieval_rate,
                'overall_retrieval_rate': overall_retrieval_rate
            })
            
        except Exception as e:
            print(f"Error processing metric '{metric}': {e}")
            continue
    
    if not performance_results:
        print("No valid metrics found for analysis")
        return pd.DataFrame()
    
    performance_df = pd.DataFrame(performance_results)
    
    # Sort by overall top-1 accuracy (descending)
    performance_df = performance_df.sort_values('overall_top_1_accuracy', ascending=False)
    
    # Round numeric columns for better readability
    numeric_columns = performance_df.select_dtypes(include=[np.number]).columns
    performance_df[numeric_columns] = performance_df[numeric_columns].round(4)
    
    return performance_df

def print_performance_summary(performance_df: pd.DataFrame, top_k: int = 1):
    """
    Print a formatted summary of the performance analysis
    
    Args:
        performance_df: DataFrame from analyze_combined_metric_performance
        top_k: The top-k value used in analysis
    """
    
    if performance_df.empty:
        print("No performance data to display")
        return
    
    print(f"\n{'='*80}")
    print(f"PERFORMANCE SUMMARY - TOP-{top_k} RETRIEVAL ANALYSIS")
    print(f"{'='*80}")
    
    print(f"\n📊 RANKING BY OVERALL TOP-1 ACCURACY:")
    print("-" * 50)
    for idx, row in performance_df.iterrows():
        print(f"{idx+1:2d}. {row['metric']:<15} | Top-1: {row['overall_top_1_accuracy']:.3f} | "
              f"Top-{top_k}: {row[f'overall_top_{top_k}_accuracy']:.3f} | "
              f"Avg Rank: {row['overall_mean_rank']:.2f}")
    
    print(f"\n🎯 DETAILED BREAKDOWN:")
    print("-" * 80)
    
    for idx, row in performance_df.head(5).iterrows():  # Show top 5 metrics
        print(f"\n{row['metric'].upper()}:")
        print(f"  • Razão Social    - Top-1: {row['razaosocial_top_1_accuracy']:.3f}, "
              f"Top-{top_k}: {row[f'razaosocial_top_{top_k}_accuracy']:.3f}, "
              f"Retrieval Rate: {row['razaosocial_retrieval_rate']:.3f}")
        print(f"  • Nome Fantasia   - Top-1: {row['nome_fantasia_top_1_accuracy']:.3f}, "
              f"Top-{top_k}: {row[f'nome_fantasia_top_{top_k}_accuracy']:.3f}, ")

In [98]:
def process_matches(all_matches: Dict[str, Tuple[List[str], List[float]]],
                    uf: str,
                    df: pd.DataFrame) -> Dict[str, Tuple[List[str], List[float]]]:
    """
    Process matches to determine if the true values are in the top-k results.
    
        Dict: Dictionary with top-k and top-1 predictions for each metric and field
    """ 

    for match, score in zip(*all_matches['razaosocial']):
        if score == 0:
            all_matches['razaosocial'] = ([match], [score])
            all_matches['nome_fantasia'] = (df[(df['uf'] == uf) & (df['razaosocial'] == match)]['nome_fantasia'].tolist(), [0])
            return all_matches
    for match, score in zip(*all_matches['nome_fantasia']):
        if score == 0:
            all_matches['nome_fantasia'] = ([match], [score])
            all_matches['razaosocial'] = (df[(df['uf'] == uf) & (df['nome_fantasia'] == match)]['razaosocial'].tolist(), [0])
            return all_matches
    return all_matches
        

In [99]:
def evaluate_retrieval_accuracy_top_k(df: pd.DataFrame,
                                      unique_razaosocial_dict: Dict[str, List[str]],
                                      unique_nome_fantasia_dict: Dict[str, List[str]],
                                      top_k: int=1,
                                      sample_size: int=None,
                                      include_jaccard: bool=True,
                                      include_combined: bool=True,
                                      combined_weights: dict = None) -> pd.DataFrame:
    """Enhanced evaluation with all available metrics including Jaccard similarity and combined metrics"""
    
    # Sample data if specified
    if sample_size and sample_size < len(df):
        df_sample = df.sample(n=sample_size, random_state=42)
        print(f"Using sample of {sample_size} records for evaluation")
    else:
        df_sample = df
        print(f"Using all {len(df_sample)} records for evaluation")
    
    results = []
    
    # Define base metrics
    metrics_to_evaluate = ['cer', 'wer', 'levenshtein']
    
    # Add Jaccard metrics if requested
    if include_jaccard:
        metrics_to_evaluate.extend(['jaccard_chars', 'jaccard_words', 'jaccard_ngram'])
    
    # Add combined metric if requested
    if include_combined:
        metrics_to_evaluate.append('combined')
    
    # Set default weights for combined metric if not provided
    if combined_weights is None:
        if include_jaccard:
            combined_weights = {
                'cer': 0.2,
                'wer': 0.2,
                'levenshtein': 0.2,
                'jaccard_chars': 0.2,
                'jaccard_words': 0.2
            }
        else:
            combined_weights = {
                'cer': 0.33,
                'wer': 0.33,
                'levenshtein': 0.34
            }
    
    print(f"Evaluating retrieval accuracy with top-{top_k} results using metrics: {metrics_to_evaluate}")
    if include_combined:
        print(f"Combined metric weights: {combined_weights}")
    for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Processing"):
        user_input = row['user_input']
        uf = row['uf']
        true_razaosocial = row['razaosocial']
        true_nome_fantasia = row['nome_fantasia']
        unique_razaosocial = unique_razaosocial_dict.get(uf, [])
        unique_nome_fantasia = unique_nome_fantasia_dict.get(uf, [])
        unique_nome_fantasia = df_sample[df_sample['uf'] == uf]['nome_fantasia'].unique().tolist()
        result_row = {
            'user_input': user_input,
            'uf': uf,
            'true_razaosocial': true_razaosocial,
            'true_nome_fantasia': true_nome_fantasia,
        }
        
        # Dictionary to store all matches for each metric
        all_matches = {}
        
        # Find matches using individual metrics
        all_matches['cer'] = {
            'razaosocial': TextRetrieval.find_best_matches_cer(user_input, unique_razaosocial, top_k),
            'nome_fantasia': TextRetrieval.find_best_matches_cer(user_input, unique_nome_fantasia, top_k),
            'uf': uf
        }
        
        all_matches['wer'] = {
            'razaosocial': TextRetrieval.find_best_matches_wer(user_input, unique_razaosocial, top_k),
            'nome_fantasia': TextRetrieval.find_best_matches_wer(user_input, unique_nome_fantasia, top_k),
            'uf': uf
        }
        
        all_matches['levenshtein'] = {
            'razaosocial': TextRetrieval.find_best_matches_levenshtein(user_input, unique_razaosocial, top_k),
            'nome_fantasia': TextRetrieval.find_best_matches_levenshtein(user_input, unique_nome_fantasia, top_k),
            'uf': uf
        }
        
        if include_jaccard:
            all_matches['jaccard_chars'] = {
                'razaosocial': TextRetrieval.find_best_matches_jaccard_chars(user_input, unique_razaosocial, top_k),
                'nome_fantasia': TextRetrieval.find_best_matches_jaccard_chars(user_input, unique_nome_fantasia, top_k),
                'uf': uf
            }
            
            all_matches['jaccard_words'] = {
                'razaosocial': TextRetrieval.find_best_matches_jaccard_words(user_input, unique_razaosocial, top_k),
                'nome_fantasia': TextRetrieval.find_best_matches_jaccard_words(user_input, unique_nome_fantasia, top_k),
                'uf': uf
            }

            df_uf = df_sample[df_sample['uf'] == uf]
            best_razao_matches, best_nome_fantasia_matches, best_razao_scores, best_nome_fantasia_scores = TextRetrieval.find_best_matches_ngram_jaccard(user_input, df_uf, top_k)
            all_matches['jaccard_ngram'] = {
                'razaosocial': (best_razao_matches, best_razao_scores),
                'nome_fantasia': (best_nome_fantasia_matches, best_nome_fantasia_scores),
                'uf': uf
            }
        
        # Find matches using combined metric
        if include_combined:
            all_matches['combined'] = {
                'razaosocial': TextRetrieval.find_best_matches_combined(user_input, unique_razaosocial, top_k, combined_weights),
                'nome_fantasia': TextRetrieval.find_best_matches_combined(user_input, unique_nome_fantasia, top_k, combined_weights),
                'uf': uf
            }
        # Process results for each metric
        for metric in metrics_to_evaluate:
            for field in ['razaosocial', 'nome_fantasia']:
                matches, scores = all_matches[metric][field]
                true_value = true_razaosocial if field == 'razaosocial' else true_nome_fantasia
                
                # Top-k accuracy
                top_k_pred = true_value in matches
                metric_col = f'{field}_top_{top_k}_{metric}_pred'
                result_row[metric_col] = top_k_pred

                # Top-1 accuracy
                top_1_pred = matches[0] == true_value if matches else False
                metric_col = f'{field}_top_1_{metric}_pred'
                result_row[metric_col] = top_1_pred
                
                # Ranking
                rank = matches.index(true_value) + 1 if true_value in matches else 0
                result_row[f'{field}_rank_{metric}'] = rank
                
                # Store the actual scores for analysis
                if matches:
                    result_row[f'best_{metric}_score_{field}'] = scores[0] if scores else None
        
        results.append(result_row)
    
    return pd.DataFrame(results), matches

In [103]:
# Carregando todos as razoes sociais e nomes fantasia únicos
unique_razaosocial_dict = {uf: df[df['uf'] == uf]['razaosocial'].dropna().unique().tolist() for uf in df['uf'].unique()}
# Carregando todos os nomes fantasia únicos
unique_nome_fantasia_dict = {uf: df[df['uf'] == uf]['nome_fantasia'].dropna().unique().tolist() for uf in df['uf'].unique()}
sample_size = None  # Define a sample size for evaluation, or set to None to use all data
top_k = 5
combined_weights = {
                'cer': 0.1,
                'wer': 0.2,
                'levenshtein': 0.2,
                'jaccard_chars': 0.1,
                'jaccard_words': 0.4,
            }
retrieved, matches = evaluate_retrieval_accuracy_top_k(df=df,
                                              top_k=top_k,
                                              unique_razaosocial_dict=unique_razaosocial_dict,
                                              unique_nome_fantasia_dict=unique_nome_fantasia_dict,
                                              sample_size=sample_size,)
                                              #combined_weights=combined_weights)

Using all 255471 records for evaluation
Evaluating retrieval accuracy with top-5 results using metrics: ['cer', 'wer', 'levenshtein', 'jaccard_chars', 'jaccard_words', 'jaccard_ngram', 'combined']
Combined metric weights: {'cer': 0.2, 'wer': 0.2, 'levenshtein': 0.2, 'jaccard_chars': 0.2, 'jaccard_words': 0.2}


Processing:   0%|          | 9/255471 [00:15<118:51:30,  1.67s/it]


KeyboardInterrupt: 

In [101]:
performance_analysis = analyze_combined_metric_performance(retrieved, top_k=top_k)
print_performance_summary(performance_analysis, top_k=top_k)

Found metrics: ['cer', 'combined', 'jaccard_chars', 'jaccard_ngram', 'jaccard_words', 'levenshtein', 'wer']

PERFORMANCE SUMMARY - TOP-5 RETRIEVAL ANALYSIS

📊 RANKING BY OVERALL TOP-1 ACCURACY:
--------------------------------------------------
 4. jaccard_ngram   | Top-1: 0.891 | Top-5: 0.977 | Avg Rank: 1.15
 5. jaccard_words   | Top-1: 0.588 | Top-5: 0.700 | Avg Rank: 1.29
 7. combined        | Top-1: 0.538 | Top-5: 0.693 | Avg Rank: 1.45
 6. levenshtein     | Top-1: 0.484 | Top-5: 0.653 | Avg Rank: 1.52
 1. wer             | Top-1: 0.467 | Top-5: 0.578 | Avg Rank: 1.37
 2. cer             | Top-1: 0.444 | Top-5: 0.590 | Avg Rank: 1.51
 3. jaccard_chars   | Top-1: 0.381 | Top-5: 0.553 | Avg Rank: 1.65

🎯 DETAILED BREAKDOWN:
--------------------------------------------------------------------------------

JACCARD_NGRAM:
  • Razão Social    - Top-1: 0.935, Top-5: 0.989, Retrieval Rate: 0.989
  • Nome Fantasia   - Top-1: 0.848, Top-5: 0.964, 

JACCARD_WORDS:
  • Razão Social    - Top-1

In [64]:
df.to_csv("cleaned_data.csv", index=False)

In [None]:
#retrieved.to_csv("10000_rand_42_retrieved.csv", index=False)

In [None]:
def calculate_classification_metrics(evaluation_df):
    """
    Calculate accuracy, precision, recall, and F1 score from evaluation results
    
    Parameters:
    evaluation_df (pd.DataFrame): Output from evaluate_retrieval_accuracy function
    
    Returns:
    dict: Dictionary containing all metrics for different scenarios
    """
    
    metrics_results = {}
    
    # Define the scenarios to evaluate
    scenarios = [
        ('razaosocial_top_k_cer', 'razaosocial_top_k_cer_pred'),
        ('nome_fantasia_top_k_cer', 'nome_fantasia_top_k_cer_pred'),
        ('razaosocial_top_k_wer', 'razaosocial_top_k_wer_pred'),
        ('nome_fantasia_top_k_wer', 'nome_fantasia_top_k_wer_pred'),
        ('razaosocial_top_k_levenshtein', 'razaosocial_top_k_lev_dist_pred'),
        ('nome_fantasia_top_k_levenshtein', 'nome_fantasia_top_k_lev_dist_pred'),

        ('razaosocial_top_1_cer', 'razaosocial_top_1_cer_pred'),
        ('nome_fantasia_top_1_cer', 'nome_fantasia_top_1_cer_pred'),
        ('razaosocial_top_1_wer', 'razaosocial_top_1_wer_pred'),
        ('nome_fantasia_top_1_wer', 'nome_fantasia_top_1_wer_pred'),
        ('razaosocial_top_1_levenshtein', 'razaosocial_top_1_lev_dist_pred'),
        ('nome_fantasia_top_1_levenshtein', 'nome_fantasia_top_1_lev_dist_pred')
    ]
    
    print("Classification Metrics Summary")
    print("=" * 50)
    
    for scenario_name, correct_column in scenarios:
        print(f"\n{scenario_name.upper()} Results:")
        print("-" * 30)
        
        # Get true labels (1 for correct, 0 for incorrect)
        y_true = [1] * len(evaluation_df)
        # Get predicted labels (1 for correct prediction, 0 for incorrect)
        y_pred = evaluation_df[correct_column].astype(int).tolist()
        
        # Calculate basic metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
    
        
        metrics_results[scenario_name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'correct_predictions': sum(y_pred),
            'total_predictions': len(y_pred)
        }
        
        print(f"Accuracy:  {accuracy:.4f} ({sum(y_pred)}/{len(y_pred)})")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1 Score:  {f1:.4f}")
    
    return metrics_results

In [None]:

# Calculate all metrics
comprehensive_metrics = calculate_classification_metrics(retrieved)