# Parte 0: Setup de ambiente e instalação de dependeências

In [4]:
!python -m spacy download pt_core_news_sm &> /dev/null
!pip install transformers[torch] &> /dev/null
!pip install accelerate[torch] &> /dev/null
!pip install --upgrade pyarrow datasets evaluate seqeval gradio &> /dev/null

In [1]:
# Imports necessários para TFIDF
import os
import re
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer
# Imports necessários para DistilBert NER
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertConfig, DistilBertForTokenClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import evaluate
# Imports necessários para a interface Gradio
import gradio as gr

# Definir dispositivo (CPU ou GPU, se disponível)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Baixar recursos necessários do NLTK
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gilva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gilva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\gilva\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

# Parte 1: Preprocessamento de Texto e TF-IDF

In [3]:
# Carregar dados
file_path = "base_info_produtos.csv"
df = pd.read_csv(file_path, sep='\t')

# Configurar pré-processamento de texto
stop_words = set(stopwords.words('portuguese'))
stemmer = RSLPStemmer()

def preprocess_text(text):
    """Preprocessa o texto removendo stopwords e aplicando stemming."""
    words = word_tokenize(text.lower())
    words = [stemmer.stem(word) for word in words if word.isalnum() and word not in stop_words]
    return ' '.join(words)

# Concatenar colunas para enriquecer as informações
df.fillna('n/a', inplace=True)
df['concatenated'] = (df['nome'] + ' ' + df['tipo'] + ' ' + df['marca'] + ' ' + df['categoria'] + ' ' +
                      df['cor'] + ' ' + df['modelo'])

# Aplicar preprocessamento de texto
df['processed_text'] = df['concatenated'].apply(preprocess_text)

# Verificar se os arquivos do modelo TF-IDF já existem
tfidf_dir = "tfidf_model/"
vectorizer_path = os.path.join(tfidf_dir, "tfidf_vectorizer.pkl")
matrix_path = os.path.join(tfidf_dir, "tfidf_matrix.pkl")

if os.path.exists(vectorizer_path) and os.path.exists(matrix_path):
    with open(vectorizer_path, 'rb') as f:
        vectorizer = pickle.load(f)
    with open(matrix_path, 'rb') as f:
        tfidf_matrix = pickle.load(f)
    print("Modelo TF-IDF carregado com sucesso.")
else:
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
    os.makedirs(tfidf_dir, exist_ok=True)
    with open(vectorizer_path, 'wb') as f:
        pickle.dump(vectorizer, f)
    with open(matrix_path, 'wb') as f:
        pickle.dump(tfidf_matrix, f)
    print("Novo modelo TF-IDF treinado e salvo com sucesso.")

def calculate_similarity(product1, product2):
    """Calcula a similaridade entre dois produtos."""
    product1_processed = preprocess_text(product1)
    product2_processed = preprocess_text(product2)
    product1_tfidf = vectorizer.transform([product1_processed])
    product2_tfidf = vectorizer.transform([product2_processed])
    similarity = cosine_similarity(product1_tfidf, product2_tfidf)
    return min(similarity[0][0], 1.0)

def search_products(query, top_n=5):
    """Realiza busca de produtos com base na similaridade TF-IDF."""
    query = preprocess_text(query)
    query_tfidf = vectorizer.transform([query])
    similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    top_indices = similarities.argsort()[::-1][:top_n]
    results = df.iloc[top_indices].copy()
    results['probabilidade'] = [calculate_similarity(query, results.iloc[i]['concatenated']) for i in range(len(results))]
    return results[['nome', 'tipo', 'marca', 'categoria', 'cor', 'modelo', 'probabilidade']]

def extract_info_from_title(title):
    """Extrai informações de um título usando TF-IDF."""
    processed_title = preprocess_text(title)
    query_tfidf = vectorizer.transform([processed_title])
    similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    top_index = similarities.argsort()[::-1][0]
    return df.iloc[top_index][['tipo', 'marca', 'categoria', 'cor', 'modelo']]

Modelo TF-IDF carregado com sucesso.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


# Parte 2: Treinamento e Uso do Modelo DistilBERT para a tarefa de NER

## Leitura e processamento dos dados para treinar o modelo NER

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel, Features, Sequence, Value
from sklearn.model_selection import train_test_split

# Mapeamento de rótulos para índices numéricos
label_map = {'tipo': 0, 'marca': 1, 'categoria': 2, 'cor': 3, 'modelo': 4}

# Inverter o mapeamento para recuperar rótulos a partir dos índices
idx_to_label = {v: k for k, v in label_map.items()}

# Dataset fornecido
file_path = "base_info_produtos.csv"
df_full = pd.read_csv(file_path, sep='\t', usecols=['tipo', 'marca', 'categoria', 'cor', 'modelo']).dropna()

# Converter dados originais em novo formato de dataframe
data_list = []
for idx, row in df_full.iterrows():
    for column in ['tipo', 'marca', 'categoria', 'cor', 'modelo']:
        data_list.append({'texto': row[column], 'label': column})
df = pd.DataFrame(data_list)

# Initialize lists to hold the data
ids = []
tokens = []
ner_tags = []

# Label mapping
label_map = {'tipo': 0, 'marca': 1, 'categoria': 2, 'cor': 3, 'modelo': 4}

# Process the DataFrame line by line
for index, row in df.iterrows():
    ids.append(index)
    tokens.append([row['texto']])
    ner_tags.append([label_map[row['label']]])

# Create a new DataFrame
processed_df = pd.DataFrame({
    'id': ids,
    'tokens': tokens,
    'ner_tags': ner_tags
})

# Split the data into train (80%) and temp (20%) sets
train_df, temp_df = train_test_split(processed_df, test_size=0.2, random_state=42)

# Split the temp set into validation (50% of 20% = 10%) and test (50% of 20% = 10%) sets
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)



# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df).remove_columns(['__index_level_0__'])
validation_dataset = Dataset.from_pandas(val_df).remove_columns(['__index_level_0__'])
test_dataset = Dataset.from_pandas(test_df).remove_columns(['__index_level_0__'])

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

# Verify the structure
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 6388
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 798
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 799
    })
})


## Carregar modelo base para realizar a configuração com as tags específicas do problema.
### ['tipo', 'marca', 'categoria', 'cor', 'modelo']

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
example = dataset_dict["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]', 'dvd', 'player', '[SEP]']

In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
tokenized_dataset_dict = dataset_dict.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/6388 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Métricas para avaliação do modelo

In [11]:
import evaluate

seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [12]:
import numpy as np

label_list = ['tipo', 'marca', 'categoria', 'cor', 'modelo']

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Treino

In [13]:
id2label = {
    0: 'tipo',
    1: 'marca',
    2: 'categoria',
    3: 'cor',
    4: 'modelo'
}
label2id = {
    'tipo': 0,
    'marca': 1,
    'categoria': 2,
    'cor': 3,
    'modelo': 4
}

In [14]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=5, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
tokenized_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6388
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 798
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 799
    })
})

In [16]:
training_args = TrainingArguments(
    output_dir="ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="ner_model",
    logging_steps=25,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_dict["train"],
    eval_dataset=tokenized_dataset_dict["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2868,0.125389,0.977472,0.977472,0.977472,0.977472
2,0.0518,0.026039,0.993742,0.993742,0.993742,0.993742
3,0.0242,0.013263,0.994994,0.994994,0.994994,0.994994
4,0.0155,0.013252,0.994994,0.994994,0.994994,0.994994




TrainOutput(global_step=200, training_loss=0.19039693236351013, metrics={'train_runtime': 81.1384, 'train_samples_per_second': 314.919, 'train_steps_per_second': 2.465, 'total_flos': 73079875917504.0, 'train_loss': 0.19039693236351013, 'epoch': 4.0})

## Inferência do modelo

In [5]:
from collections import defaultdict
from transformers import pipeline

def get_most_cited_label_for_strings(string, model_path, tokenizer, device):
    strings = string.split(" ")
    classifier = pipeline("ner", model=model_path, tokenizer=tokenizer, device=device)
    results = {}

    # Initialize a list to keep track of entities and their positions
    entities = []

    for idx, string in enumerate(strings):
        classifier_output = classifier(string)
        label_scores = defaultdict(float)

        # Aggregate scores for each label
        for item in classifier_output:
            entity = item['entity']
            score = item['score']
            label_scores[entity] += score

        # Find the label with the highest cumulative score
        most_cited_label = max(label_scores, key=label_scores.get)

        # Store the entity and its position
        entities.append((idx, most_cited_label))

    # Sort entities by their original position in the input string
    entities.sort(key=lambda x: x[0])

    # Build the results dictionary aligned with the original input
    for position, label in entities:
        results[strings[position]] = label

    return results

# Example usage:
model_path = "ner_model"
tokenizer = "ner_model"

In [6]:
string = "Samsung UN40C6900 LED Plana 40 Polegadas"
word_labels = get_most_cited_label_for_strings(string, model_path, tokenizer, device)
print(word_labels)

{'Samsung': 'marca', 'UN40C6900': 'modelo', 'LED': 'tipo', 'Plana': 'tipo', '40': 'modelo', 'Polegadas': 'tipo'}


In [7]:
string = "notebook sony 4 GB"
word_labels = get_most_cited_label_for_strings(string, model_path, tokenizer, device)
print(word_labels)

{'notebook': 'tipo', 'sony': 'marca', '4': 'modelo', 'GB': 'marca'}


# Parte 3: Criação da Interface Gradio

In [8]:
# Habilitar modo de debug com a variável de ambiente GRADIO_DEBUG=1
os.environ["GRADIO_DEBUG"] = "1"

def search_interface(query):
    results = search_products(query)
    return results

def ner_interface(input_text):
    ner_predictions = get_most_cited_label_for_strings(input_text, model_path, tokenizer, device)
    return ner_predictions

search_demo = gr.Interface(fn=search_interface, inputs="text", outputs="dataframe", title="Busca de produtos")
ner_demo = gr.Interface(fn=ner_interface, inputs="text", outputs="json", title="NER Extraction")

demo = gr.TabbedInterface([search_demo, ner_demo], ["Busca de produtos", "Extração de features NER"])
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.




In [9]:
demo.close()

Closing server running on port: 7860
