# NLP Challenge - Data Scientist - Jun 2025

## Imports

In [3]:
# imports
import yaml
import pandas as pd
import unicodedata
import re
from functools import reduce
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance


confs = yaml.safe_load(open("data/confs.yaml"))
predictors = confs["predictors"] ### Importante! O cientista poderá usar apenas estas features para criar/aperfeiçoar o modelo
text_target = confs["text_target"]

## Loading Dataset

In [5]:
# loading dataset

train = pd.read_parquet('data/train.parquet')
uf, y_train = train['uf'], train[text_target]

## Pre-processing

### Functions

In [6]:
#--- 1. Pre-processing Functions ---

def fix_special_characters(text):
  """ Remove accents and uniform unicode characters """
  if not isinstance(text, str):
    return ''
  return unicodedata.normalize('NFKD', text).encode('ascii','ignore').decode('utf-8')

def lowering(text):
  """ Convert text to lower case """
  if not isinstance(text,str):
    return ''
  return text.lower()

def remove(text,regex):
  """ Remove unwanted patterns with regex """
  if not isinstance(text,str):
    return ''
  return regex.sub(' ', text)

def substitute_spaces(text,regex):
  """Substitute multiple spaces for unique space """
  if not isinstance(text,str):
    return ''
  return regex.sub(' ', text)

#--- 2. Functions Pipeline ---

def apply_pipeline(text,funcs):
  """ Apply sequence of functions to a text """
  return reduce(lambda t,f: f(t), funcs, text)


#--- 3. Main Pre-processing Function (multiple columns) ---

def preprocess_columns(df, columns, funcs):
  """ Apply functions pipeline to multiple columns of a DataFrame.

  Args:
    df (pd.DataFrame): dataset
    columns (list): list of columns to be processed
    funcs (list): list of functions to apply in sequence

  Return:
    pd.Dataframe: dataframe with modified columns """

  for col in columns:
    df[col] = df[col].apply(lambda x: apply_pipeline(x, funcs)).str.strip()
  return df


# --- 4. Others ---


# Regex
regex_remove = re.compile(r'[^a-z0-9\s&/-]')  # mantém &, / e -
regex_spaces = re.compile(r'\s+')
regex_enterprise = re.compile(r'\b(ltda|eireli|s/a|me|ei|comercial|comercio)\b')



# --- 5. Columns to be processed ---

y_columns = ['nome_fantasia', 'razaosocial']
uf_column = ['uf']


# --- 6. Pipeline sequence

funcs = [
    fix_special_characters,
    lowering,
    lambda x: remove(x, regex_remove),
    lambda x: remove(x, regex_enterprise),
    lambda x: substitute_spaces(x, regex_spaces)
]

### Apply pre-processing

In [7]:
y_train = preprocess_columns(y_train, y_columns, funcs)
uf_df = uf.to_frame()
uf_df = preprocess_columns(uf_df, uf_column, funcs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(lambda x: apply_pipeline(x, funcs)).str.strip()


In [8]:
y_train_sentences = y_train[y_columns].agg(' '.join, axis = 1)

## Embeddings

In [9]:
#--- 1. Load model ---

model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

In [10]:
y_sentences = y_train_sentences.tolist()
y_train_embeddings = model.encode(y_sentences, show_progress_bar = True, batch_size = 16)

Batches: 100%|██████████| 15967/15967 [18:37<00:00, 14.29it/s]


In [None]:
# saving embeddings
np.save('y_train_embeddings.npy', y_train_embeddings)
df_y = y_train.copy()
df_y['y_text'] = y_train_sentences
df_y['uf'] = uf_df['uf']
df_y.to_parquet('df_y.parquet', index = False)

In [None]:
def hybrid_predict(
    user_input,
    user_uf,
    df_y_path,
    y_embeddings_path,
    k=5,
    levenshtein_threshold=0.75
):
    df_y = pd.read_parquet(df_y_path)
    y_embeddings = np.load(y_embeddings_path)

    # preprocess user input
    user_input_df = pd.DataFrame({'user_input': [user_input]})
    user_input_df = preprocess_columns(user_input_df, ['user_input'], funcs)
    user_input_proc = user_input_df['user_input'].iloc[0]

    # preprocess user UF
    if user_uf:
        user_uf_df = pd.DataFrame({'uf': [user_uf]})
        user_uf_df = preprocess_columns(user_uf_df, ['uf'], funcs)
        user_uf_proc = user_uf_df['uf'].iloc[0]
    else:
        user_uf_proc = ''

    # uf filter
    if user_uf_proc:
        mask = df_y['uf'].str.lower() == user_uf_proc
        filtered_df_y = df_y[mask].reset_index(drop=True)
        filtered_embeddings = y_embeddings[mask.values]
    else:
        filtered_df_y = df_y.reset_index(drop=True)
        filtered_embeddings = y_embeddings

    # if no uf matches
    if len(filtered_df_y) == 0:
        return []

    # step 1 - levenshtein
    levenshtein_scores = []
    max_lengths = []
    for target_text in filtered_df_y['y_text']:
        dist = levenshtein_distance(user_input_proc, target_text)
        max_len = max(len(user_input_proc), len(target_text))
        levenshtein_scores.append(dist)
        max_lengths.append(max_len)

    levenshtein_similarities = [
    1 - (dist / max_len) if max_len > 0 else 0
    for dist, max_len in zip(levenshtein_scores, max_lengths)
    ]

    # check if there is good candidates
    sorted_lev_indices = np.argsort([-sim for sim in levenshtein_similarities])
    top_lev_dist = levenshtein_similarities[sorted_lev_indices[0]]

    if top_lev_dist <= levenshtein_threshold:
        top_k_indices = sorted_lev_indices[:k]
        results = []
        for idx in top_k_indices:
            text = filtered_df_y.iloc[idx]['y_text']
            sim_score = levenshtein_similarities[idx]
            results.append((text, sim_score))
        return results

    # step 2 - cosine fallback
    user_embedding = model.encode([f"{user_input_proc} {user_uf_proc}"])[0]
    sims = cosine_similarity([user_embedding], filtered_embeddings)[0]
    top_k_indices = np.argsort(sims)[-k:][::-1]
    results = []
    for idx in top_k_indices:
        text = filtered_df_y.iloc[idx]['y_text']
        score = sims[idx]
        results.append((text, score))

    return results

In [None]:
def evaluate_precision_at_k_batch_hybrid(
    user_inputs,
    user_ufs,
    true_targets,
    df_y_path,
    y_embeddings_path,
    k=5,
    batch_size=100
):
    total = len(user_inputs)
    correct = 0

    # create batches for evaluation
    for start_idx in range(0, total, batch_size):
        end_idx = min(start_idx + batch_size, total)
        batch_inputs = user_inputs[start_idx:end_idx]
        batch_ufs = user_ufs[start_idx:end_idx]
        batch_targets = true_targets[start_idx:end_idx]
        
        #run prediction
        for idx in range(len(batch_inputs)):
            preds = hybrid_predict(
                user_input=batch_inputs[idx],
                user_uf=batch_ufs[idx],
                df_y_path=df_y_path,
                y_embeddings_path=y_embeddings_path,
                k=k
            )

            pred_texts = [p[0] for p in preds]
            if batch_targets[idx] in pred_texts:
                correct += 1

        print(f"Processed batch {start_idx}-{end_idx} | Precision so far: {correct / (end_idx):.4f}")

    final_precision = correct / total
    return final_precision


In [None]:
precision = evaluate_precision_at_k_batch_hybrid(
    user_inputs=train['user_input'].tolist(),
    user_ufs=train['uf'].tolist(),
    true_targets=y_sentences,
    df_y_path='df_y.parquet',
    y_embeddings_path='y_train_embeddings.npy',
    k=5,
    batch_size=1000
)

print(f"Final Precision@{5}: {precision:.4f}")