In [1]:
import random, torch, pandas as pd, numpy as np, openai, os
from tqdm.notebook import tqdm
from functools import partial
from decouple import config
from concurrent.futures import ThreadPoolExecutor, as_completed

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel, RobertaTokenizer, RobertaModel

from sentence_transformers import SentenceTransformer

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import torch_directml
# Device DirectML
device = torch_directml.device()

import warnings
warnings.filterwarnings("ignore")

# Set a random seed
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<torch._C.Generator at 0x1c94fef7cf0>

In [2]:
path_database = "temp_gpt_4_1_nano_features.csv"
stop_words = "stop_words.txt"
max_workers = 5

GPT_KEY = config("GPT_KEY")

### Processamento

In [3]:
database = pd.read_csv(path_database)
display(database.head(3))
database.shape

Unnamed: 0,resumo_reclamacao,resumo_resposta,tipo_reclamacao,produto_servico,topicos,trechos_adequados,trechos_inadequados,classificacao_resposta,grau_atendimento,redirecionamento,company,source,ask,answer,service_note,service_class,resolved_class
0,"A linha telefônica está bloqueada, impedindo c...",A empresa forneceu uma carta com informações s...,bloqueio de linha,serviço de telefonia móvel,"['bloqueio', 'chamadas', 'recarga', 'atendimen...",['a carta resposta com todas as informações re...,"['esclarecimento sobre cancelamento de linha, ...",parcial,não atendeu,Sim,tim,consumidor_gov,"minha linha encontra-se bloqueada, não consigo...","Olá, Johanes ""Esclarecimento sobre cancelamen...",10,1,1
1,A cliente foi cobrada por um valor superior ao...,"A empresa informou que tentou contato, enviou ...",cobrança indevida,plano de telefonia móvel,"['cobrança', 'valor', 'serviço adicional', 'at...",['Estamos à disposição para quaisquer esclarec...,['Realizamos tentativa de contato no dia 20/04...,genérica,não atendeu,Sim,vivo,consumidor_gov,Estão me cobrando mais do que foi proposto na ...,"Prezado (a) Cliente, Realizamos tentativa de c...",10,1,1
2,A cliente reclama do aumento no valor do pacot...,A empresa encaminhou uma carta com informações...,reajuste de preço e qualidade de serviço,internet banda larga,"['preço', 'negociação', 'conexão', 'manutenção...",['encaminhou uma carta com informações detalha...,"['não abordou diretamente a reclamação', 'não ...",genérica,não atendeu,Sim,vivo,consumidor_gov,"Boa tarde, Quero negociar o pacote de internet...","Prezado (a), Segue em ""ANEXO"" ao lado da opçã...",10,1,1


(23935, 17)

##### Configuração textual

Ref: https://medium.com/@manyi.yim/in-depth-understanding-of-berttokenizerfast-1bf84b28f30d

| Token           | Description                        |
|-----------------|------------------------------------|
| [PAD]           | padding                            |
| [UNK]           | unknown                            |
| [CLS]           | classification / beginning of text |
| [SEP]           | separation                         |
| [MASK]          | mask                               |

In [4]:
database["X_TEXT"] = "[CLS] " + database["resumo_reclamacao"] + " [SEP] " + database["resumo_resposta"] + " [SEP]" + database["redirecionamento"]

print(f"COLUMNS: {database.columns.to_list()}\nSAMPLES: {database.shape[0]}")

COLUMNS: ['resumo_reclamacao', 'resumo_resposta', 'tipo_reclamacao', 'produto_servico', 'topicos', 'trechos_adequados', 'trechos_inadequados', 'classificacao_resposta', 'grau_atendimento', 'redirecionamento', 'company', 'source', 'ask', 'answer', 'service_note', 'service_class', 'resolved_class', 'X_TEXT']
SAMPLES: 23935


##### Vec setup

In [5]:
def stemmed_words(doc):
    return (WordNetLemmatizer().lemmatize(w) for w in CountVectorizer().build_analyzer()(doc))

In [6]:
stop_words = open(stop_words, encoding="utf-8").read().replace(" ", "").split("\n")

vec_params = dict(
    tokenizer=stemmed_words,
    strip_accents="unicode",
    analyzer="word",
    binary=True,
    lowercase=True,
    max_features=500, 
    ngram_range=(1, 1),
    stop_words=stop_words
)

print("BoW training...")
bow = CountVectorizer(**vec_params).fit(list(database["X_TEXT"]))

print("Tfidf training...")
tfidf = TfidfVectorizer(**vec_params).fit(list(database["X_TEXT"]))

BoW training...
Tfidf training...


In [7]:
def old_text_vectorizer(text, model="bow"):
    if model == "bow":
        return np.array(bow.transform([text]).toarray()[0])
    elif model == "tfidf":
        return np.array(tfidf.transform([text]).toarray()[0])

In [8]:
def text_vectorizer(
        text,
        is_test = False,
        tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased"), 
        model=AutoModel.from_pretrained("distilbert-base-uncased")):
    """
    https://www.geeksforgeeks.org/how-to-generate-word-embedding-using-bert/
    """

    # Tokenize and encode text using batch_encode_plus
    # The function returns a dictionary containing the token IDs and attention masks
    encoding = tokenizer.batch_encode_plus(
        [text], # List of input texts
        padding=True, # Pad to the maximum sequence length
        truncation=True, # Truncate to the maximum sequence length if necessary
        return_tensors='pt', # Return PyTorch tensors
        add_special_tokens=True # Add special tokens CLS and SEP
    )

    input_ids = encoding['input_ids']  # Token IDs
    attention_mask = encoding['attention_mask']  # Attention mask

    # Generate embeddings using BERT model
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state  # This contains the embeddings

    # Compute the average of word embeddings to get the sentence embedding
    sentence_embedding = word_embeddings.mean(dim=1)  # Average pooling along the sequence length dimension

    if is_test:
        # Decode the token IDs back to text
        decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

        print(f"Original Text: {text}")
        print(f"Decoded Text: {decoded_text}")
        print(f"\tInput ID: {input_ids}")
        print(f"\tAttention mask: {attention_mask}")
        print(f"\tShape of Word Embeddings: {word_embeddings.shape}")

    return np.array(sentence_embedding[0])

In [9]:
def gpt_vectorizer(text, 
        model="text-embedding-3-small",
        api_base="https://api.openai.com/v1", 
        api_key=GPT_KEY):
    
    openai.api_base = api_base
    openai.api_key = api_key

    response = openai.Embedding.create(
        input=text,
        model=model
    )
    return response['data'][0]['embedding']

In [10]:
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
def nomic_vectorizer(text, task_type="classification"):   
    # Adiciona o prefixo de tarefa conforme a documentação do modelo
    prefixed_texts = [f"{task_type}: {text}"]
    embeddings = model.encode(prefixed_texts, convert_to_numpy=True)
    return embeddings[0]

<All keys matched successfully>


In [11]:
vectorizers = {
    "BoW": partial(old_text_vectorizer, model="bow"),
    "Tfidf": partial(old_text_vectorizer, model="tfidf"),
    "GPT3small": partial(gpt_vectorizer, model="text-embedding-3-small"),
    "DistilBert": partial(
        text_vectorizer,
        tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased"),
        model=AutoModel.from_pretrained("distilbert-base-uncased")
    ),
    "Bert": partial(
        text_vectorizer,
        tokenizer=BertTokenizer.from_pretrained("bert-base-uncased"),
        model=BertModel.from_pretrained("bert-base-uncased")
    ),
    "Roberta": partial(
        text_vectorizer,
        tokenizer=RobertaTokenizer.from_pretrained("roberta-base"),
        model=RobertaModel.from_pretrained("roberta-base")
    ),
    "Nomic": partial(nomic_vectorizer),
    "OLMo": partial(
        text_vectorizer,
        tokenizer=AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf", trust_remote_code=True),
        model=AutoModel.from_pretrained("allenai/OLMo-1B-hf", trust_remote_code=True)
    )
}

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Função worker
def processar_vetor_reclamacao(idx, r):
    try:
        text = r["X_TEXT"]
        for v in vectorizers:
            emb = vectorizers[v](text)
            emb = {f"embeddings_{v}": list(emb)}
            r.update(emb)
        return idx, r
    except Exception as e:
        print(f"Erro ao processar o índice {idx}: {e}")

In [13]:
path_vec_database = path_database.replace("_features", "_vecs")
results = database.to_dict(orient="records")
print(f"Resultados sendo salvos em: {path_vec_database}")

try:
    results_emb = pd.read_csv(path_vec_database).to_dict(orient="records")
    print(f"Carregados {len(results_emb)} resultados já existentes")
except:
    results_emb = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    vectors = [
        executor.submit(processar_vetor_reclamacao, idx, results[idx])
        for idx in range(len(results_emb), len(results))
    ]

    for vector in tqdm(as_completed(vectors), total=len(vectors)):
        idx, emb = vector.result(timeout=10)
        results_emb.append(emb)

        # Salvar incrementalmente
        df = pd.DataFrame([emb])
        df.to_csv(path_vec_database, index=False, mode="a", header=not os.path.isfile(path_vec_database) or os.path.getsize(path_vec_database) == 0)

        print(idx, [k for k in emb.keys() if "embeddings" in k])


Resultados sendo salvos em: temp_gpt_4_1_nano_vecs.csv
Carregados 18593 resultados já existentes


  0%|          | 0/5342 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


18593 ['embeddings_BoW', 'embeddings_Tfidf', 'embeddings_GPT3small', 'embeddings_DistilBert', 'embeddings_Bert', 'embeddings_Roberta', 'embeddings_Nomic', 'embeddings_OLMo']
18594 ['embeddings_BoW', 'embeddings_Tfidf', 'embeddings_GPT3small', 'embeddings_DistilBert', 'embeddings_Bert', 'embeddings_Roberta', 'embeddings_Nomic', 'embeddings_OLMo']
18596 ['embeddings_BoW', 'embeddings_Tfidf', 'embeddings_GPT3small', 'embeddings_DistilBert', 'embeddings_Bert', 'embeddings_Roberta', 'embeddings_Nomic', 'embeddings_OLMo']
18595 ['embeddings_BoW', 'embeddings_Tfidf', 'embeddings_GPT3small', 'embeddings_DistilBert', 'embeddings_Bert', 'embeddings_Roberta', 'embeddings_Nomic', 'embeddings_OLMo']
18597 ['embeddings_BoW', 'embeddings_Tfidf', 'embeddings_GPT3small', 'embeddings_DistilBert', 'embeddings_Bert', 'embeddings_Roberta', 'embeddings_Nomic', 'embeddings_OLMo']
18599 ['embeddings_BoW', 'embeddings_Tfidf', 'embeddings_GPT3small', 'embeddings_DistilBert', 'embeddings_Bert', 'embeddings_Rober