# Importação de Bibliotecas

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, BertModel
import torch
from datasets import Dataset
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import re

  from .autonotebook import tqdm as notebook_tqdm


# Preparação dos Dados

In [2]:
# Definicao da raiz do projeto

PROJECT_ROOT = 'G:/Csouza/nlp/topic_modeling'

os.chdir(PROJECT_ROOT)

sys.path.insert(0, PROJECT_ROOT)

In [3]:
def extract(extract_path, file_name='all_process.xlsx', sheet_name='Sheet1'):
    return pd.read_excel(f'{extract_path}/{file_name}', sheet_name=sheet_name)

In [4]:
data_path = os.path.join(PROJECT_ROOT, 'data', 'internal', 'fapesp_projects')

full_data = extract(data_path)

variables = {
'N. Processo_B.V': 'n_processo',
'Data de Início': 'data',
'Título (Português)': 'titulo',
'Grande Área do Conhecimento': 'grande_area',
'Área do Conhecimento': 'area',
'Subárea do Conhecimento': 'subarea',
'Palavras-Chave do Processo': 'palavras_chave',
'Assuntos': 'assuntos',
'Resumo (Português)': 'resumo'}

full_data = full_data.rename(columns=variables)

# Selecionar colunas específicas
full_data = full_data[list(variables.values())]

# Filtrar linhas com base em condições
full_data = full_data[
    full_data['n_processo'].notnull() &
    full_data['resumo'].notnull() &
    (full_data['resumo'] != '')
]

# Adicionar novas colunas com base em transformações de colunas existentes
full_data['data'] = pd.to_datetime(full_data['data'], format='%m-%d-%y', errors='coerce')
full_data['ano'] = full_data['data'].dt.year
full_data['mes'] = full_data['data'].dt.month

# Excluir a coluna 'data'
full_data = full_data.drop(columns=['data'])

In [5]:
full_data.head(3)

Unnamed: 0,n_processo,titulo,grande_area,area,subarea,palavras_chave,assuntos,resumo,ano,mes
0,95/04916-0,Estudo sistemático de campos hiperfinos eletro...,Ciências Exatas e da Terra,Física,Física da Matéria Condensada,"CORRELACAO ANGULAR, ESTUDO SISTEMATICO, INTERA...",,Este projeto está vinculado ao processo FAPESP...,1995,12
6,95/05064-7,"Cultura, ideologia e representação",Ciências Humanas,Sociologia,Outras Sociologias Específicas,"BRASIL, IDENTIDADE, PENSAMENTO SOCIAL, REPRESE...",Brasil:Identidade social,"Participar do Seminário """"Sociologia e Filosof...",1995,12
22,95/09836-4,Bernard Schmitt | Université de Bourgogne - Fr...,Ciências Exatas e da Terra,Probabilidade e Estatística,Probabilidade,"COMPRESSOR, ENTROPIA, ESTADO DE GIBBS, SISTEMA...",Entropia (matemática aplicada):Compressores,O principal objetivo da visita do Professor Be...,1995,12


In [6]:
data_train_test = full_data[full_data['assuntos'].notnull() & (full_data['area'] == 'Medicina')]
data_train_test.shape

(17342, 10)

In [7]:
def clean_text(text):
    if not isinstance(text, str):
        raise ValueError("O argumento 'text' deve ser uma string.")
    
    text = re.sub(r'[^a-zA-ZÀ-ÿ0-9\s]', '', text)

    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Carregar os dados
data = data_train_test

data['titulo'] = data['titulo'].astype(str)
data['palavras_chave'] = data['palavras_chave'].astype(str)

# Aplicar a limpeza de texto sem remover stop words
data['cleaned_text'] = data['titulo'].apply(clean_text) + '. ' + data['resumo'].apply(clean_text) + '. Palavras-chave: ' + data['palavras_chave'].apply(clean_text)

data['assuntos'] = data['assuntos'].apply(lambda x: [s.strip() for s in str(x).split(':')])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['titulo'] = data['titulo'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['palavras_chave'] = data['palavras_chave'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cleaned_text'] = data['titulo'].apply(clean_text) + '. ' + data['resumo'].apply(clean_text) + '

# Treinamento do Modelo BERT

In [8]:
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
bert_model = BertForMaskedLM.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Função de tokenização para MLM
def tokenize_function(examples):
    return tokenizer(examples['cleaned_text'], padding="max_length", truncation=True, max_length=512)

dataset = Dataset.from_pandas(data)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Dividir em conjunto de treino e teste
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Data collator para MLM (vai automaticamente mascarar tokens)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 17342/17342 [00:57<00:00, 299.75 examples/s]


In [18]:
def train_model(model, tokenizer, train_dataset, test_dataset, data_collator, model_path, tokenizer_path, output_dir, overwrite_output_dir=True, save_steps=10_000, save_total_limit=2, prediction_loss_only=True, num_train_epochs=3, per_device_train_batch_size=8):    
    
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(tokenizer_path, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
        prediction_loss_only=prediction_loss_only,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    # Treinar o modelo
    trainer.train()

    # Salvar o modelo e o tokenizer
    if model_path:
        trainer.save_model(model_path)
    
    if tokenizer_path:
        tokenizer.save_pretrained(tokenizer_path)
    
    return trainer

def evaluate_model(trainer, test_dataset):
    eval_results = trainer.evaluate(eval_dataset=test_dataset)
    loss = eval_results['eval_loss']
    perplexity = np.exp(loss)
    
    metrics = {
        'loss': loss,
        'perplexity': perplexity
    }
    
    return metrics

def get_embeddings(texts, model, tokenizer, max_length=512, batch_size=8):
    if not hasattr(model, "is_on_device"):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        model.is_on_device = True

    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length).to(model.device)

        with torch.no_grad():
            outputs = model(**inputs)

        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        all_embeddings.append(batch_embeddings)

    all_embeddings = np.vstack(all_embeddings)
    return all_embeddings

def generate_embeddings(dataset, text_col='cleaned_text', topic_col='assuntos', batch_size=8):
    texts = dataset[text_col]
    dataset['text_embedding'] = get_embeddings(texts, bert_model, tokenizer, batch_size=batch_size)
    
    all_assuntos = dataset[topic_col]
    all_assuntos_embeddings = []
    for subjects in all_assuntos:
        subjects_embeddings = get_embeddings(subjects, bert_model, tokenizer, batch_size=batch_size)
        all_assuntos_embeddings.append(subjects_embeddings)
    
    dataset['topics_embeddings'] = all_assuntos_embeddings
    return dataset

In [19]:
model_path = os.path.join(PROJECT_ROOT, 'models')
tokenizer_path = os.path.join(PROJECT_ROOT, 'tokenizers')
results_path = os.path.join(model_path, 'results')

model_exists = os.path.isfile(os.path.join(model_path, 'model.safetensors')) and os.path.isfile(os.path.join(model_path, 'config.json'))
tokenizer_exists = os.path.isfile(os.path.join(tokenizer_path, 'vocab.txt'))

# Treinar o modelo se ele não existir
if not model_exists or not tokenizer_exists:
    print("Modelo treinado não encontrado. Iniciando o treinamento...")
    trainer = train_model(
        model=bert_model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        test_dataset=test_dataset,
        data_collator=data_collator,
        model_path=model_path,
        tokenizer_path=tokenizer_path,
        output_dir=results_path
    )

    metrics = evaluate_model(trainer, test_dataset)
    print("Metrics:", metrics)

In [20]:
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
bert_model = BertModel.from_pretrained(model_path)

Some weights of BertModel were not initialized from the model checkpoint at G:/Csouza/nlp/topic_modeling\models and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Geração de Embeddings com BERT

In [22]:
embedding_path = os.path.join(PROJECT_ROOT, 'data', 'processed', 'fapesp_projects', 'test_dataset_with_embeddings.parquet')

def save_dataset(dataset, path):
    df = dataset.to_pandas()
    df.to_parquet(path, index=False)

def load_dataset(path):
    df = pd.read_parquet(path)
    return Dataset.from_pandas(df)

if os.path.exists(embedding_path):
    print("Carregando test_dataset do arquivo salvo...")
    test_dataset = load_dataset(embedding_path)
else:
    print("Gerando embeddings e salvando test_dataset...")
    
    BATCH_SIZE = 8
    test_dataset = test_dataset.map(generate_embeddings, batched=True, batch_size=BATCH_SIZE)
    
    save_dataset(test_dataset, embedding_path)
    print("test_dataset salvo com sucesso.")


Map: 100%|██████████| 3469/3469 [3:19:20<00:00,  3.45s/ examples]  


# Ranqueamento dos Assuntos por Relevância

In [25]:
# Função para calcular a relevância dos tópicos
def rank_topics_by_relevance(text_embedding, topics_embeddings, topics):
    # Converter os embeddings para arrays NumPy, caso sejam listas
    text_embedding = np.array(text_embedding)
    topics_embeddings = [np.array(topic_emb) for topic_emb in topics_embeddings]
    
    # Calcular similaridades de cosseno entre o embedding do texto e cada embedding dos tópicos
    similarities = [cosine_similarity(text_embedding.reshape(1, -1), topic_emb.reshape(1, -1))[0, 0] for topic_emb in topics_embeddings]
    
    # Classificar os tópicos de acordo com a similaridade, do maior para o menor
    ranked_topics = sorted(zip(topics, similarities), key=lambda x: x[1], reverse=True)
    
    # Retornar apenas os tópicos ranqueados
    return [topic for topic, _ in ranked_topics]

# Aplicar o ranqueamento dos tópicos ao dataset de teste
def rank_topics(dataset, text_embedding_col='text_embedding', topics_embeddings_col='topics_embeddings', topics_col='assuntos'):
    dataset['ranked_topics'] = rank_topics_by_relevance(dataset[text_embedding_col], dataset[topics_embeddings_col], dataset[topics_col])
    return dataset

test_dataset = test_dataset.map(rank_topics, batched=False)

Map: 100%|██████████| 3469/3469 [00:13<00:00, 265.06 examples/s]


# Avaliação

In [33]:
# Função para calcular precisão no top-k
def precision_at_k(true_labels, predicted_labels, k):
    correct_predictions = 0
    
    for true, predicted in zip(true_labels, predicted_labels):
        predicted_top_k = predicted[:k]
        if any(subject in predicted_top_k for subject in true):
            correct_predictions += 1
    
    return correct_predictions / len(true_labels)

# Função para calcular recall no top-k
def recall_at_k(true_labels, predicted_labels, k):
    correct_predictions = 0
    
    for true, predicted in zip(true_labels, predicted_labels):
        predicted_top_k = predicted[:k]
        correct_in_top_k = len(set(true) & set(predicted_top_k))
        total_relevant = len(true)
        
        if total_relevant > 0:
            correct_predictions += correct_in_top_k / total_relevant
    
    return correct_predictions / len(true_labels)

# Função para calcular DCG
def dcg_at_k(relevances, k):
    relevances = np.array(relevances)[:k]
    return np.sum((2**relevances - 1) / np.log2(np.arange(2, len(relevances) + 2)))

# Função para calcular NDCG no top-k
def ndcg_at_k(true_labels, predicted_labels, k):
    total_ndcg = 0.0
    
    for true, predicted in zip(true_labels, predicted_labels):
        # Atribuir relevância: 1 para tópicos verdadeiros, 0 para os outros
        relevances = [1 if topic in true else 0 for topic in predicted[:k]]
        dcg = dcg_at_k(relevances, k)
        ideal_relevances = sorted(relevances, reverse=True)
        idcg = dcg_at_k(ideal_relevances, k)
        
        if idcg > 0:
            total_ndcg += dcg / idcg
    
    return total_ndcg / len(true_labels)

# Extrair os dados do Dataset Hugging Face para listas
true_labels = test_dataset['assuntos']
predicted_labels = test_dataset['ranked_topics']

# Calcular as métricas no Top-3
k = 3
precision = precision_at_k(true_labels, predicted_labels, k)
recall = recall_at_k(true_labels, predicted_labels, k)
ndcg = ndcg_at_k(true_labels, predicted_labels, k)

# Imprimir as métricas
print(f"Precisão no Top-{k}: {precision:.2f}")
print(f"Recall no Top-{k}: {recall:.2f}")
print(f"NDCG no Top-{k}: {ndcg:.2f}")

Precisão no Top-3: 1.00
Recall no Top-3: 0.71
NDCG no Top-3: 1.00


# Visualização dos Resultados

In [None]:
num_examples = 5

for i in range(num_examples):
    example = test_dataset[i]
    
    print(f"Resumo: {example['cleaned_text']}")
    print(f"Tópicos Reais: {example['assuntos']}")
    print(f"Tópicos Ranqueados: {example['ranked_topics']}")
    print('-' * 50)

# Calculo da Similaridade Semântica

In [None]:
# Função para calcular a similaridade de cosseno média para cada modelo
def calculate_mean_cosine_similarity(text_embeddings, topics_embeddings):
    all_similarities = []
    for text_emb, topic_embs in zip(text_embeddings, topics_embeddings):
        similarities = [cosine_similarity(text_emb.reshape(1, -1), topic_emb.reshape(1, -1))[0, 0]
                        for topic_emb in topic_embs]
        all_similarities.append(np.mean(similarities))  # Calcular média de similaridades para cada resumo
    return np.array(all_similarities)

# Função para comparar dois modelos (BERTimbau vs RoBERTa) por similaridade semântica
def compare_models_by_semantic_similarity(bert_similarities, roberta_similarities):
    # Comparação de métricas
    mean_similarity_bert = np.mean(bert_similarities)
    mean_similarity_roberta = np.mean(roberta_similarities)

    std_similarity_bert = np.std(bert_similarities)
    std_similarity_roberta = np.std(roberta_similarities)

    print(f"Média Similaridade (BERTimbau): {mean_similarity_bert:.2f}")
    print(f"Média Similaridade (RoBERTa): {mean_similarity_roberta:.2f}")
    print(f"Desvio Padrão Similaridade (BERTimbau): {std_similarity_bert:.2f}")
    print(f"Desvio Padrão Similaridade (RoBERTa): {std_similarity_roberta:.2f}")

    # Plotando a Distribuição das Similaridades
    plt.figure(figsize=(10, 6))
    sns.histplot(bert_similarities, color='blue', label='BERTimbau', kde=True, bins=20)
    sns.histplot(roberta_similarities, color='green', label='RoBERTa', kde=True, bins=20)
    plt.title('Distribuição das Similaridades de Cosseno: BERTimbau vs RoBERTa')
    plt.xlabel('Similaridade de Cosseno')
    plt.ylabel('Frequência')
    plt.legend()
    plt.show()

    # Boxplot para Comparação de Similaridades
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=[bert_similarities, roberta_similarities], palette='Set2')
    plt.xticks([0, 1], ['BERTimbau', 'RoBERTa'])
    plt.title('Comparação da Similaridade de Cosseno: BERTimbau vs RoBERTa')
    plt.ylabel('Similaridade de Cosseno')
    plt.show()

bert_similarities = calculate_mean_cosine_similarity(test_dataset['text_embedding'], test_dataset['topics_embeddings'])

# roberta_similarities = calculate_mean_cosine_similarity(test_dataset['text_embedding'], test_dataset['topics_embeddings_roberta'])

# Comparar BERTimbau com RoBERTa quando RoBERTa estiver disponível
# compare_models_by_semantic_similarity(bert_similarities, roberta_similarities)