In [1]:
# Importação de bibliotecas

import os
import sys
import pandas as pd
import numpy as np
import polars as pl
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, BertModel
import torch
from datasets import Dataset
from umap import UMAP
import plotly.express as px
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import json
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Project root definition.
# The PROJECT_ROOT environment variable, when set, overrides the location so the notebook
# is not tied to one machine; the original path is kept as the default.
PROJECT_ROOT = os.environ.get('PROJECT_ROOT', 'G:/Csouza/nlp/topic_modeling')

# From here on, relative paths resolve against the project root.
os.chdir(PROJECT_ROOT)

# Make the project importable; skip the insert when the cell is re-run so sys.path
# does not accumulate duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [3]:
def extract(extract_path, file_name='all_process.xlsx', sheet_name='Sheet1'):
    """Read one sheet of an Excel workbook with polars.

    Args:
        extract_path: Directory that contains the workbook.
        file_name: Workbook file name inside ``extract_path``.
        sheet_name: Name of the sheet to read.

    Returns:
        Whatever ``pl.read_excel`` returns for that sheet.
    """
    workbook = f'{extract_path}/{file_name}'
    return pl.read_excel(workbook, sheet_name=sheet_name)

In [4]:
# Folder with the FAPESP project export
data_path = os.path.join(PROJECT_ROOT, 'data', 'internal', 'fapesp_projects')

full_data = extract(data_path)

# Spreadsheet header -> short column name used in the rest of the notebook
variables = {
    'N. Processo_B.V': 'n_processo',
    'Data de Início': 'data',
    'Título (Português)': 'titulo',
    'Grande Área do Conhecimento': 'grande_area',
    'Área do Conhecimento': 'area',
    'Subárea do Conhecimento': 'subarea',
    'Palavras-Chave do Processo': 'palavras_chave',
    'Assuntos': 'assuntos',
    'Resumo (Português)': 'resumo',
}

# Start date parsed from its '%m-%d-%y' text form; year and month are both derived from it
start_date = pl.col('data').str.to_datetime('%m-%d-%y')

full_data = (
    full_data
    .lazy()
    .rename(variables)
    .select(variables.values())
    # keep only rows that have a process number and a non-empty abstract
    .filter(
        pl.col('n_processo').is_not_null(),
        pl.col('resumo').is_not_null(),
        pl.col('resumo') != '',
    )
    .with_columns(
        start_date.dt.year().alias('ano'),
        start_date.dt.month().alias('mes'),
    )
    .select(pl.exclude('data'))
    .collect()
)

full_data.head(3)

n_processo,titulo,grande_area,area,subarea,palavras_chave,assuntos,resumo,ano,mes
str,str,str,str,str,str,str,str,i32,i8
"""95/04916-0""","""Estudo sistemático de campos h…","""Ciências Exatas e da Terra""","""Física""","""Física da Matéria Condensada""","""CORRELACAO ANGULAR, ESTUDO SIS…",,"""Este projeto está vinculado ao…",1995,12
"""95/05064-7""","""Cultura, ideologia e represent…","""Ciências Humanas""","""Sociologia""","""Outras Sociologias Específicas""","""BRASIL, IDENTIDADE, PENSAMENTO…","""Brasil:Identidade social""","""Participar do Seminário """"Soci…",1995,12
"""95/09836-4""","""Bernard Schmitt | Université d…","""Ciências Exatas e da Terra""","""Probabilidade e Estatística""","""Probabilidade""","""COMPRESSOR, ENTROPIA, ESTADO D…","""Entropia (matemática aplicada)…","""O principal objetivo da visita…",1995,12


In [5]:
# Working subset: Medicine projects starting in 2020 or later that have subject terms
subset_filter = (
    pl.col('assuntos').is_not_null()
    & (pl.col('area') == 'Medicina')
    & (pl.col('ano') >= 2020)
)
data_train_test = full_data.filter(subset_filter)

data_train_test.shape

(2534, 10)

In [11]:
def get_spacy_model(model='pt_core_news_sm'):
    """Load the spaCy pipeline ``model``, downloading it first when loading raises ``OSError``.

    Args:
        model: Name of the spaCy pipeline package.

    Returns:
        The object returned by ``spacy.load(model)``.
    """
    try:
        return spacy.load(model)
    except OSError:
        # Loading failed: fetch the package, then retry below
        from spacy.cli import download
        download(model)
    return spacy.load(model)

# Load spaCy's Portuguese language model
nlp = get_spacy_model('pt_core_news_sm')

def clean_text(text):
    """Lower-case ``text`` and return its lemmas joined by single spaces.

    Punctuation and whitespace tokens are dropped; stop words are kept.

    Args:
        text: String to clean.

    Returns:
        The space-joined lemmas of the remaining tokens.

    Raises:
        ValueError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("O argumento 'text' deve ser uma string.")

    # One spaCy pass over the whole lower-cased text
    parsed = nlp(text.lower())

    return ' '.join(
        tok.lemma_
        for tok in parsed
        if not (tok.is_punct or tok.is_space)
    )

# Load the data
data = data_train_test.to_pandas()

# Missing titles/keywords become empty strings. Without the fillna, astype(str) would turn a
# missing value into literal text (e.g. 'None' or 'nan') that then leaks into the corpus.
data['titulo'] = data['titulo'].fillna('').astype(str)
data['palavras_chave'] = data['palavras_chave'].fillna('').astype(str)

# Clean abstract, title and keywords (stop words are kept), then join the non-empty pieces
# with single spaces, in that order.
cleaned_parts = [data[column].apply(clean_text) for column in ('resumo', 'titulo', 'palavras_chave')]
data['cleaned_text'] = [' '.join(part for part in parts if part) for parts in zip(*cleaned_parts)]

In [12]:
# Seed of the train/test split. Later cells cache embeddings of `train_dataset` on disk and
# pair them with `train_dataset['cleaned_text']`; an unseeded split would give a different
# `train_dataset` after a kernel restart and silently misalign the cached vectors.
SPLIT_SEED = 42

tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
bert_model = BertForMaskedLM.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Tokenization function for MLM
def tokenize_function(examples):
    """Tokenize the 'cleaned_text' field of a batch, padding/truncating to 512 tokens."""
    return tokenizer(examples['cleaned_text'], padding="max_length", truncation=True, max_length=512)

dataset = Dataset.from_pandas(data)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split into training and test sets (reproducible thanks to the fixed seed)
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Data collator for MLM (masks tokens automatically)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 2534/2534 [00:07<00:00, 338.27 examples/s]


In [13]:
# Number of whitespace-separated words in each cleaned document
data['word_count'] = data['cleaned_text'].str.split().str.len()

fig = px.histogram(
    data,
    x='word_count',
    nbins=30,
    title='Distribuição da Contagem de Palavras por Documento',
)
fig.show()

In [None]:
# Portuguese stop words: spaCy's defaults plus corpus-specific additions
additional_stop_words = ['ser', 'de', 'se', 'o', 'para', 'em', 'além', 'este', 'esse', 'ao', 'do', 'pelo', 'por', 'au']

# Build a new set: `|=` on nlp.Defaults.stop_words would modify spaCy's own default set in
# place, a side effect that outlives this cell.
stop_words = {word.lower() for word in set(nlp.Defaults.stop_words) | set(additional_stop_words)}

In [14]:
# Corpus-wide word list without stop words
all_words = ' '.join(data['cleaned_text']).split()
all_words = [word for word in all_words if word not in stop_words]

word_freq = Counter(all_words)
common_words = word_freq.most_common(50)

words, counts = zip(*common_words)
word_freq_df = pd.DataFrame({'Palavra': words, 'Frequência': counts})

# The title is derived from the number of bars actually plotted (it used to say "Top 10"
# while 50 words were shown).
fig = px.bar(word_freq_df, x='Frequência', y='Palavra', orientation='h', title=f'Top {len(common_words)} Palavras Mais Frequentes')
fig.show()

0                 ser
1            paciente
2              estudo
3              doença
4          tratamento
5             clínico
6             avaliar
7              célula
8            objetivo
9     desenvolvimento
10             efeito
11           realizar
12            projeto
13           utilizar
14              saúde
15          avaliação
16           associar
17        diagnóstico
18                dar
19             câncer
20              risco
21                 de
22              fator
23            celular
24         apresentar
25            análise
26          resultado
27          expressão
28             modelo
29                 se
30           resposta
31               gene
32          alteração
33           pesquisa
34           processo
35                uso
36         relacionar
37              renal
38                ano
39               alto
40               caso
41              tumor
42          qualidade
43          molecular
44                  2
45        

In [None]:
# Word-cloud input: every word of the cleaned corpus except stop words.
# The words are taken from `data` directly, so the cell no longer depends on `all_words`
# having been created by another cell.
text = ' '.join(
    word
    for document in data['cleaned_text']
    for word in document.split()
    if word not in stop_words
)

wordcloud = WordCloud(width=800, height=400, background_color ='white').generate(text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Build the document-term matrix with a cap on the number of terms
vectorizer = CountVectorizer(max_features=1000, min_df=5, max_df=0.8)
X = vectorizer.fit_transform(data['cleaned_text'])

# Term names, in the column order of X
terms = vectorizer.get_feature_names_out()

# Term-by-term co-occurrence matrix; `@` is an explicit matrix product regardless of whether
# X is a sparse matrix or a sparse array
cooc_matrix = (X.T @ X).tocoo()

# Convert to a DataFrame for easier inspection
cooc_df = pd.DataFrame.sparse.from_spmatrix(cooc_matrix, index=terms, columns=terms)

# Keep the 20 most frequent terms. `terms[:20]` would only take the first 20 columns of the
# vocabulary, whose order is not related to frequency.
term_totals = np.asarray(X.sum(axis=0)).ravel()
top_terms = terms[np.argsort(term_totals)[::-1][:20]]
filtered_cooc_df = cooc_df.loc[top_terms, top_terms]

# Plot the co-occurrence matrix
plt.figure(figsize=(10, 8))
sns.heatmap(filtered_cooc_df, cmap="YlGnBu")
plt.show()

In [12]:
def train_model(model, tokenizer, train_dataset, test_dataset, data_collator, model_path, tokenizer_path, output_dir, overwrite_output_dir=True, save_steps=10_000, save_total_limit=2, prediction_loss_only=True, num_train_epochs=3, per_device_train_batch_size=8):
    """Fine-tune ``model`` with a Hugging Face ``Trainer`` and optionally save the results.

    Args:
        model: Model handed to ``Trainer``.
        tokenizer: Tokenizer saved to ``tokenizer_path`` after training.
        train_dataset: Dataset used for training.
        test_dataset: Dataset registered as the trainer's evaluation set.
        data_collator: Collator handed to ``Trainer``.
        model_path: Directory where the trained model is saved; falsy to skip saving.
        tokenizer_path: Directory where the tokenizer is saved; falsy to skip saving.
        output_dir: ``TrainingArguments.output_dir``.
        overwrite_output_dir, save_steps, save_total_limit, prediction_loss_only,
        num_train_epochs, per_device_train_batch_size: Forwarded unchanged to
            ``TrainingArguments``.

    Returns:
        The ``Trainer`` instance after ``train()`` has completed.
    """
    # Create the target directories before training so a bad path fails early. The same
    # truthiness guards as the save step are used, so a falsy path means "do not save"
    # instead of crashing in os.makedirs.
    if model_path:
        os.makedirs(model_path, exist_ok=True)
    if tokenizer_path:
        os.makedirs(tokenizer_path, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
        prediction_loss_only=prediction_loss_only,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model and the tokenizer
    if model_path:
        trainer.save_model(model_path)

    if tokenizer_path:
        tokenizer.save_pretrained(tokenizer_path)

    return trainer

def evaluate_model(trainer, test_dataset):
    """Evaluate ``trainer`` on ``test_dataset`` and report loss and perplexity.

    Args:
        trainer: Object whose ``evaluate(eval_dataset=...)`` returns a mapping with an
            ``'eval_loss'`` entry.
        test_dataset: Dataset forwarded to ``trainer.evaluate``.

    Returns:
        ``{'loss': eval_loss, 'perplexity': exp(eval_loss)}``.
    """
    eval_loss = trainer.evaluate(eval_dataset=test_dataset)['eval_loss']
    return {'loss': eval_loss, 'perplexity': np.exp(eval_loss)}

def extract_embeddings(texts, model, tokenizer, max_length=512, batch_size=8):
    """Embed ``texts`` by mean-pooling the model's last hidden state over real tokens.

    Padding positions are excluded from the average via the attention mask, so the
    embedding of a text does not depend on how much padding its batch required.

    Args:
        texts: Iterable of strings (materialized into a list).
        model: Model whose output exposes ``last_hidden_state``; it is moved to the GPU
            when one is available and switched to eval mode.
        tokenizer: Callable tokenizer returning tensors that include ``attention_mask``.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of texts per forward pass.

    Returns:
        CPU tensor with one row per text.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    # Slicing below needs a real sequence
    texts = list(texts)
    if not texts:
        raise ValueError("extract_embeddings requires at least one text.")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    all_embeddings = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        # Pad only to the longest text of the batch; the mask-weighted mean below makes the
        # result independent of the padding length.
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding, broadcast over the hidden dimension
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for a row with no real tokens
        token_counts = mask.sum(dim=1).clamp(min=1)
        batch_embeddings = (hidden * mask).sum(dim=1) / token_counts
        all_embeddings.append(batch_embeddings.cpu())

    return torch.cat(all_embeddings, dim=0)

def save_embeddings(embeddings, embeddings_path):
    """Write ``embeddings`` to ``embeddings_path`` with ``np.save``.

    Args:
        embeddings: Array-like or ``torch.Tensor``. Tensors are detached and copied to the
            CPU first, so tensors that require grad or live on the GPU can be saved too.
        embeddings_path: Destination file path.
    """
    if isinstance(embeddings, torch.Tensor):
        embeddings = embeddings.detach().cpu().numpy()
    np.save(embeddings_path, embeddings)
    
def load_embeddings(embeddings_path):
    """Load the array stored at ``embeddings_path`` with ``np.load``.

    Raises:
        FileNotFoundError: If nothing exists at ``embeddings_path``.
    """
    if not os.path.exists(embeddings_path):
        raise FileNotFoundError(f'Embeddings file not found at {embeddings_path}')
    return np.load(embeddings_path)

def train_bertopic(docs, clean_text=None, use_embeddings=False, precomputed_embeddings=None, bert_model=None, tokenizer=None, umap_n_neighbors=15, umap_n_components=5, umap_min_dist=0.0, umap_metric='cosine', vectorizer_ngram_range=(1, 2), stop_words=stop_words, umap_random_state=None):
    """Fit a BERTopic model on ``docs``, optionally with externally computed embeddings.

    Args:
        docs: Iterable of document strings (materialized into a list).
        clean_text: Optional callable applied to every document before fitting.
        use_embeddings: When True, fit on ``precomputed_embeddings`` or, if those are
            ``None``, on embeddings from ``extract_embeddings(docs, bert_model, tokenizer)``.
        precomputed_embeddings: One vector per document, in the same order as ``docs``.
        bert_model, tokenizer: Required only when embeddings must be computed here.
        umap_n_neighbors, umap_n_components, umap_min_dist, umap_metric: UMAP settings.
        vectorizer_ngram_range: ``ngram_range`` of the ``CountVectorizer``.
        stop_words: Stop words for the ``CountVectorizer``; a list, another iterable of
            words, a string preset name, or ``None``. The default is bound to the
            notebook-level ``stop_words`` at definition time.
        umap_random_state: Optional UMAP seed for reproducible topics (``None`` keeps the
            previous unseeded behaviour).

    Returns:
        The fitted ``BERTopic`` instance.

    Raises:
        ValueError: If the number of embeddings differs from the number of documents.
    """
    docs = list(docs)
    if clean_text:
        docs = [clean_text(doc) for doc in docs]

    # CountVectorizer takes a list, a preset name or None; only other iterables (e.g. a set)
    # need converting. list() on a string would split it into characters.
    if stop_words is not None and not isinstance(stop_words, (list, str)):
        stop_words = list(stop_words)

    umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric, random_state=umap_random_state)
    vectorizer_model = CountVectorizer(ngram_range=vectorizer_ngram_range, stop_words=stop_words)

    embeddings = None
    if use_embeddings:
        embeddings = precomputed_embeddings
        if embeddings is None:
            assert bert_model is not None, "O modelo deve ser fornecido quando use_embeddings=True"
            assert tokenizer is not None, "O tokenizador deve ser fornecido quando use_embeddings=True"
            embeddings = extract_embeddings(docs, bert_model, tokenizer)
        # Hand BERTopic a NumPy array whether the vectors came from disk (ndarray) or from
        # extract_embeddings (torch tensor).
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.detach().cpu().numpy()
        else:
            embeddings = np.asarray(embeddings)
        # A stale or mismatched cache would otherwise pair documents with the wrong vectors
        if len(embeddings) != len(docs):
            raise ValueError(f"Got {len(embeddings)} embeddings for {len(docs)} documents.")

    # embeddings=None lets BERTopic compute its own embeddings
    topic_model = BERTopic(umap_model=umap_model, vectorizer_model=vectorizer_model)
    topic_model.fit(docs, embeddings)

    return topic_model

In [13]:
# Output locations: fine-tuned model, tokenizer and Trainer checkpoints
model_path = os.path.join(PROJECT_ROOT, 'models')
tokenizer_path = os.path.join(PROJECT_ROOT, 'tokenizers')
results_path = os.path.join(model_path, 'results')

# Fine-tuning is skipped only when both saved artefacts are already on disk.
# The model check looks at the saved model itself (config.json in `model_path`) rather than at
# the embeddings cache, which is produced by a later cell and can be missing or present
# independently of the model.
# NOTE(review): assumes trainer.save_model writes config.json into `model_path` — confirm.
model_is_saved = os.path.isfile(os.path.join(model_path, 'config.json'))
tokenizer_is_saved = os.path.isfile(os.path.join(tokenizer_path, 'vocab.txt'))

if not (model_is_saved and tokenizer_is_saved):
    trainer = train_model(
        model=bert_model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        test_dataset=test_dataset,
        data_collator=data_collator,
        model_path=model_path,
        tokenizer_path=tokenizer_path,
        output_dir=results_path
    )

    metrics = evaluate_model(trainer, test_dataset)
    print("Metrics:", metrics)

In [None]:
# Reload the tokenizer and the model from the directories the fine-tuning step saves to.
# NOTE(review): the model is loaded as BertModel rather than BertForMaskedLM, presumably
# because later cells read `outputs.last_hidden_state` — confirm.
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
bert_model = BertModel.from_pretrained(model_path)

In [None]:
# BERTopic fitted with its default embedding model, cached on disk
bertopic_name = f'{model_path}/bertopic_model.pt'

if not os.path.exists(bertopic_name):
    # Fit on the training texts and persist right away; the model is read back from disk below
    text_train = train_dataset['cleaned_text']
    train_bertopic(docs=text_train).save(bertopic_name)

bertopic_model = BERTopic.load(bertopic_name)
bertopic_model.visualize_topics()

In [16]:
# Compute the embeddings of the training texts once and cache them on disk.
embeddings_name = f'{model_path}/bertimbal_embeddings.npy'
if not os.path.exists(embeddings_name):
    # NOTE(review): extract_embeddings reads `outputs.last_hidden_state`, so `bert_model` is
    # presumably expected to be the BertModel reloaded from `model_path` by the time this cell
    # runs — confirm the cell order.
    text_embeddings = train_dataset['cleaned_text']
    embeddings = extract_embeddings(text_embeddings, bert_model, tokenizer)
    save_embeddings(embeddings, embeddings_name)

In [None]:
# Cached document embeddings
embeddings = load_embeddings(embeddings_name)

# BERTopic fitted on those precomputed embeddings, cached on disk
bertopic_bertimbal_embeddings_name = f'{model_path}/bertopic_bertimbal_embeddings_model.pt'
if not os.path.exists(bertopic_bertimbal_embeddings_name):
    # Fit and persist right away; the model is read back from disk below
    text_train = train_dataset['cleaned_text']
    train_bertopic(
        docs=text_train,
        use_embeddings=True,
        precomputed_embeddings=embeddings,
    ).save(bertopic_bertimbal_embeddings_name)

bertopic_bertimbal_embeddings_model = BERTopic.load(bertopic_bertimbal_embeddings_name)
bertopic_bertimbal_embeddings_model.visualize_topics()

In [15]:
vocab_path = os.path.join(PROJECT_ROOT, 'data', 'processed', 'usp_controlled_vocabulary')
vocab_name = 'vocabulario_usp_hierarchy.json'

# Load the USP controlled vocabulary (JSON file, read as UTF-8)
with open(f'{vocab_path}/{vocab_name}', mode='r', encoding='utf-8') as file:
    vocab_data = json.loads(file.read())

# Function that extracts the terms of the hierarchy
def extract_terms(vocab_data, terms_list=None):
    """Collect the 'string' of every item in ``vocab_data``, parents before their subterms.

    Args:
        vocab_data: Iterable of mappings with a 'string' entry and an optional 'subterms'
            entry holding nested items of the same form.
        terms_list: Optional list to append to; a new list is created when ``None``.

    Returns:
        The list the terms were appended to (``terms_list`` itself when one was given).
    """
    def _walk(nodes):
        # Depth-first: a node's own term first, then everything below it
        for node in nodes:
            yield node['string']
            if 'subterms' in node and node['subterms']:
                yield from _walk(node['subterms'])

    collected = [] if terms_list is None else terms_list
    collected.extend(_walk(vocab_data))
    return collected

# Extract every term of the controlled vocabulary
all_terms = extract_terms(vocab_data)

In [20]:
def get_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last hidden state over the token axis.

    Args:
        text: Text to embed.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array with the pooled embedding, squeezed of size-1 dimensions.
    """
    # Run on whatever device the model currently lives on: extract_embeddings() moves the
    # model to the GPU when one is available, after which CPU inputs would fail.
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    # Inference only: no autograd graph is needed
    with torch.no_grad():
        outputs = model(**inputs)
    # .cpu() is required before .numpy() when the model runs on the GPU
    embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embedding.squeeze()

def compute_similarity(embedding1, embedding2):
    """Return the cosine similarity of two vectors as a single value.

    Each input is reshaped to one row before being handed to ``cosine_similarity``.
    """
    row_a = embedding1.reshape(1, -1)
    row_b = embedding2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0, 0]

def map_topics_to_vocab(terms_embeddings, term_embeddings):
    """Map every topic term to its most similar vocabulary term (cosine similarity).

    Args:
        terms_embeddings: ``{topic_id: {topic_term: vector}}``.
        term_embeddings: ``{vocab_term: vector}``.

    Returns:
        ``{topic_id: {topic_term: best_vocab_term}}``; a topic without terms maps to ``{}``.

    Raises:
        ValueError: If a topic has terms but ``term_embeddings`` is empty.
    """
    # Stack the vocabulary once: it is the same for every topic, and one batched
    # cosine_similarity call per topic replaces one call per (term, vocab term) pair.
    vocab_terms = list(term_embeddings)
    vocab_matrix = None
    if vocab_terms:
        vocab_matrix = np.vstack([np.asarray(term_embeddings[term]).reshape(1, -1) for term in vocab_terms])

    mappings = {}
    for topic_id, terms in terms_embeddings.items():
        best_matches = {}
        if terms:
            if vocab_matrix is None:
                raise ValueError("term_embeddings is empty; there is nothing to match against.")
            topic_terms = list(terms)
            topic_matrix = np.vstack([np.asarray(terms[term]).reshape(1, -1) for term in topic_terms])
            # scores[i, j] = similarity between topic term i and vocabulary term j
            scores = cosine_similarity(topic_matrix, vocab_matrix)
            # argmax returns the first maximum, matching max() over the dict's insertion order
            for term, best_index in zip(topic_terms, scores.argmax(axis=1)):
                best_matches[term] = vocab_terms[best_index]
        mappings[topic_id] = best_matches
    return mappings

def categorize_project(resumo, bertopic_model, term_embeddings):
    """Return ``{resumo: vocab_term}`` with the vocabulary term scoring highest for ``resumo``.

    Args:
        resumo: Text passed to ``bertopic_model.transform`` as a one-element list; it is also
            the key of the returned dict.
        bertopic_model: Object exposing ``transform``; the second value it returns is iterated.
        term_embeddings: Mapping of vocabulary term -> vector, compared through
            ``compute_similarity``.

    Returns:
        Dict with the single key ``resumo`` holding the best-matching vocabulary term; empty
        when the second value returned by ``transform`` has no elements.

    NOTE(review): BERTopic documents ``transform`` as returning ``(topics, probabilities)``,
    not document embeddings — confirm that the second value really is a vector comparable with
    the entries of ``term_embeddings``; if it is not, the similarity computed below is not
    meaningful (or fails on a shape mismatch).
    """
    # The first returned value (`topics`) is not used below
    topics, embeddings = bertopic_model.transform([resumo])
    best_matches = {}
    for embedding in embeddings:
        # Score this vector against every vocabulary term and keep the highest-scoring one
        similarities = {vocab_term: compute_similarity(embedding, term_embeddings[vocab_term]) for vocab_term in term_embeddings}
        best_match_term = max(similarities, key=similarities.get)
        # Keyed by `resumo`, so every iteration overwrites the previous entry
        best_matches[resumo] = best_match_term
    return best_matches

In [None]:
# Generate one embedding per distinct vocabulary term.
# dict.fromkeys drops repeated strings while keeping first-occurrence order, so a term that
# appears several times in `all_terms` is embedded only once; the resulting dict is unchanged.
term_embeddings = {term: get_embedding(term, tokenizer, bert_model) for term in dict.fromkeys(all_terms)}

In [48]:
# Generate embeddings for the top terms of each topic
def _embed_topic_terms(terms_per_topic):
    """Embed the first element of every entry of every topic: ``{topic_id: {term: vector}}``."""
    embedded = {}
    for topic_id, entries in terms_per_topic.items():
        embedded[topic_id] = {
            entry[0]: get_embedding(entry[0], tokenizer, bert_model)
            for entry in entries
        }
    return embedded

terms_per_topic_standard = bertopic_model.get_topics()
terms_per_topic_finetuned = bertopic_bertimbal_embeddings_model.get_topics()

terms_embeddings_standard = _embed_topic_terms(terms_per_topic_standard)
terms_embeddings_finetuned = _embed_topic_terms(terms_per_topic_finetuned)

In [49]:
# Map topic terms to vocabulary terms for both BERTopic models (standard first, then fine-tuned)
topic_to_vocab_mappings_standard, topic_to_vocab_mappings_finetuned = (
    map_topics_to_vocab(topic_term_vectors, term_embeddings)
    for topic_term_vectors in (terms_embeddings_standard, terms_embeddings_finetuned)
)

In [None]:
topic_to_vocab_mappings_standard

In [None]:
topic_to_vocab_mappings_finetuned

In [None]:
# Apply the categorisation to the dataset, one output column per BERTopic model
for column_name, topic_model in (
    ('conceitos_gerados_standard', bertopic_model),
    ('conceitos_gerados_finetuned', bertopic_bertimbal_embeddings_model),
):
    # `topic_model` is bound as a default argument so each lambda uses its own model
    data[column_name] = data['resumo'].apply(
        lambda x, _model=topic_model: categorize_project(x, _model, term_embeddings)
    )