In [1]:
# Importação de bibliotecas

import os
import sys
import pandas as pd
import numpy as np
import polars as pl
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, Dataset
from umap import UMAP

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Project root definition.
# The location can be overridden through the TOPIC_MODELING_ROOT environment
# variable; when it is not set, the original hard-coded path is used.
PROJECT_ROOT = os.environ.get('TOPIC_MODELING_ROOT', 'G:/Csouza/nlp/topic_modeling')

# Every relative path used afterwards is resolved against the project root.
os.chdir(PROJECT_ROOT)

# Re-running this cell must not pile up duplicate entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [3]:
def extract(extract_path, file_name='all_process.xlsx', sheet_name='Sheet1'):
    """Read one sheet of an Excel workbook with polars.

    Args:
        extract_path: Directory that contains the workbook.
        file_name: Workbook file name inside ``extract_path``.
        sheet_name: Name of the sheet to read.

    Returns:
        Whatever ``pl.read_excel`` returns for the requested sheet.
    """
    workbook_path = f'{extract_path}/{file_name}'
    return pl.read_excel(workbook_path, sheet_name=sheet_name)

In [4]:
data_path = os.path.join(PROJECT_ROOT, 'data', 'internal', 'fapesp_projects')

full_data = extract(data_path)

# Source column name -> short name used throughout the notebook.
variables = {
    'N. Processo_B.V': 'n_processo',
    'Data de Início': 'data',
    'Título (Português)': 'titulo',
    'Grande Área do Conhecimento': 'grande_area',
    'Área do Conhecimento': 'area',
    'Subárea do Conhecimento': 'subarea',
    'Palavras-Chave do Processo': 'palavras_chave',
    'Assuntos': 'assuntos',
    'Resumo (Português)': 'resumo',
}

# The start date is parsed with the month-day-year pattern and reused for
# both derived columns below.
start_date_expr = pl.col('data').str.to_datetime('%m-%d-%y')

full_data = (
    full_data
    .lazy()
    .rename(variables)
    .select(variables.values())
    # Keep only rows that have a process number and a non-empty abstract.
    .filter(
        pl.col('n_processo').is_not_null(),
        pl.col('resumo').is_not_null(),
        pl.col('resumo') != '',
    )
    .with_columns(
        start_date_expr.dt.year().alias('ano'),
        start_date_expr.dt.month().alias('mes'),
    )
    # The raw date string is dropped once year and month are extracted.
    .select(pl.exclude('data'))
    .collect()
)

full_data.head(3)

n_processo,titulo,grande_area,area,subarea,palavras_chave,assuntos,resumo,ano,mes
str,str,str,str,str,str,str,str,i32,i8
"""95/04916-0""","""Estudo sistemático de campos h…","""Ciências Exatas e da Terra""","""Física""","""Física da Matéria Condensada""","""CORRELACAO ANGULAR, ESTUDO SIS…",,"""Este projeto está vinculado ao…",1995,12
"""95/05064-7""","""Cultura, ideologia e represent…","""Ciências Humanas""","""Sociologia""","""Outras Sociologias Específicas""","""BRASIL, IDENTIDADE, PENSAMENTO…","""Brasil:Identidade social""","""Participar do Seminário """"Soci…",1995,12
"""95/09836-4""","""Bernard Schmitt | Université d…","""Ciências Exatas e da Terra""","""Probabilidade e Estatística""","""Probabilidade""","""COMPRESSOR, ENTROPIA, ESTADO D…","""Entropia (matemática aplicada)…","""O principal objetivo da visita…",1995,12


In [5]:
# Rows used for modelling: projects that have subjects ('assuntos') filled in
# and whose knowledge area is 'Medicina'.
has_subjects = pl.col('assuntos').is_not_null()
is_medicine = pl.col('area') == 'Medicina'

data_train_test = full_data.filter(has_subjects, is_medicine)

data_train_test.shape

(17342, 10)

In [6]:
def get_spacy_model(model='en_core_web_sm'):
    """Load a spaCy pipeline, downloading it first when loading fails.

    Args:
        model: Name of the spaCy pipeline package.

    Returns:
        The object returned by ``spacy.load(model)``.
    """
    try:
        return spacy.load(model)
    except OSError:
        # Loading raised OSError: fetch the package, then retry below.
        from spacy.cli import download as fetch_model
        fetch_model(model)
    return spacy.load(model)

# Load spaCy's Portuguese language pipeline (downloaded on demand by get_spacy_model)
nlp = get_spacy_model('pt_core_news_sm')

# Portuguese stop words, taken from the spaCy language defaults
stop_words = nlp.Defaults.stop_words

# Compiled pattern that matches every character that is NOT an ASCII letter, a
# character in the À-ÿ range (accented letters) or whitespace
special_char_remover = re.compile(r'[^A-Za-zÀ-ÿ\s]')

def clean_text(text):
    """Normalise a text: strip special characters, drop stop words, lemmatise.

    Steps:
      1. Remove every character matched by ``special_char_remover``.
      2. Tokenise and drop tokens whose lower-cased form is in ``stop_words``.
      3. Run the full spaCy pipeline on the remaining tokens and join the lemmas.

    Args:
        text: Raw text to clean.

    Returns:
        The lemmas of the remaining tokens joined by single spaces.

    Raises:
        ValueError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("O argumento 'text' deve ser uma string.")

    # Remove special characters
    text = special_char_remover.sub('', text)

    # Only tokenisation is needed to filter stop words, so the tokenizer is
    # called directly instead of running the whole pipeline. The comparison is
    # done on the lower-cased token so that capitalised stop words (e.g. at the
    # start of a sentence) are removed as well.
    kept_tokens = [
        token.text
        for token in nlp.make_doc(text)
        if token.lower_ not in stop_words
    ]

    # Lemmatise the remaining text with the full pipeline
    doc = nlp(' '.join(kept_tokens))
    return ' '.join(token.lemma_ for token in doc)

In [7]:
# Load the data into pandas
data = data_train_test.to_pandas()

# Missing titles / keywords become empty strings. A bare astype(str) would turn
# them into the literal text 'None', which would then leak into the model input.
for text_column in ('titulo', 'palavras_chave'):
    data[text_column] = data[text_column].fillna('').astype(str)

# Model input: cleaned abstract followed by the cleaned title and keywords,
# each introduced by a fixed marker.
data['cleaned_text'] = (
    data['resumo'].apply(clean_text)
    + ' Título: ' + data['titulo'].apply(clean_text)
    + ' Palavras-chave: ' + data['palavras_chave'].apply(clean_text)
)

In [8]:
# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Work on independent copies: new columns are added below, and assigning to the
# frames returned by the split can trigger pandas' SettingWithCopyWarning.
train_data = train_data.copy()
test_data = test_data.copy()

# BERT tokenisation
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# padding=True pads every text to the longest one in the call; truncation=True
# cuts texts that exceed the tokenizer's maximum length.
train_tokenized_texts = tokenizer(train_data['cleaned_text'].tolist(), padding=True, truncation=True, return_tensors="pt")
test_tokenized_texts = tokenizer(test_data['cleaned_text'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Turn the subject string into a list of labels (split on ':')
train_data['assuntos_list'] = train_data['assuntos'].apply(lambda x: x.split(':'))
test_data['assuntos_list'] = test_data['assuntos'].apply(lambda x: x.split(':'))

# Binarise the labels; the label vocabulary is learned from the training set only
mlb = MultiLabelBinarizer()

train_binary_labels = mlb.fit_transform(train_data['assuntos_list'])
# NOTE(review): labels that appear only in the test set are not part of
# mlb.classes_ — confirm that ignoring them is acceptable for the evaluation.
test_binary_labels = mlb.transform(test_data['assuntos_list'])

# Convert to float PyTorch tensors
train_binary_labels = torch.tensor(train_binary_labels, dtype=torch.float)
test_binary_labels = torch.tensor(test_binary_labels, dtype=torch.float)

# Attention masks produced by the tokenizer
train_attention_mask = train_tokenized_texts['attention_mask']
test_attention_mask = test_tokenized_texts['attention_mask']

In [9]:
# Pre-trained BERT model with one output per label.
# problem_type is set explicitly so the loss computed inside the model is the
# multi-label one (binary cross-entropy with logits) regardless of label dtype.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(mlb.classes_),
    problem_type='multi_label_classification',
)

# Multi-label classification head.
# The head must output raw logits: the multi-label loss applies the sigmoid
# itself, so a trailing Sigmoid layer here would squash the scores twice.
# Consumers that need probabilities should apply torch.sigmoid to the logits
# (a 0.5 probability cut-off corresponds to a logit of 0.0).
# The parameter names (classifier.0.*, classifier.2.*) are the same with or
# without a trailing Sigmoid, since that layer has no parameters.
model.classifier = torch.nn.Sequential(
    torch.nn.Linear(model.config.hidden_size, model.config.hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(model.config.hidden_size, len(mlb.classes_)),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Fine-tuning plan, applied in order:
#   1. freeze the whole BERT encoder,
#   2. unfreeze its last transformer layer,
#   3. make sure the classification head is trainable.
for module, trainable in (
    (model.bert, False),
    (model.bert.encoder.layer[-1], True),
    (model.classifier, True),
):
    module.requires_grad_(trainable)

In [11]:
# Classe Customizada para Dataset
class CustomDataset(Dataset):
    """Dataset pairing tokenizer encodings with their label rows.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an indexable
            collection holding one entry per example.
        labels: Indexable collection with one label entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One example: every encoding field at position idx, plus its labels.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

In [12]:
def train(model, dataloader, criterion, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    The loss is computed with the ``criterion`` supplied by the caller, applied
    to ``outputs.logits`` and the batch labels, so the argument is honoured
    instead of being silently replaced by the model's built-in loss.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        dataloader: Sized iterable of batches; each batch is a mapping with the
            keys ``input_ids``, ``attention_mask`` and ``labels``.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar loss.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler; stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        # The schedule advances per optimisation step, not per epoch.
        scheduler.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Evaluate the model on a dataloader without updating any parameter.

    The loss is computed with the ``criterion`` supplied by the caller, applied
    to ``outputs.logits`` and the batch labels.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        dataloader: Sized iterable of batches; each batch is a mapping with the
            keys ``input_ids``, ``attention_mask`` and ``labels``.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar loss.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(mean_loss, labels, preds)`` where ``mean_loss`` is the sum of
        the batch losses divided by ``len(dataloader)`` and ``labels`` /
        ``preds`` are NumPy arrays with one row per example. ``preds`` holds the
        values of ``outputs.logits`` exactly as the model returned them.
    """
    model.eval()
    total_loss = 0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            total_loss += loss.item()
            # extend() adds one row per example, so the final arrays are 2-D.
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(outputs.logits.cpu().numpy())

    return total_loss / len(dataloader), np.array(all_labels), np.array(all_preds)

# Metric computation
def compute_metrics(labels, preds, threshold=0.5):
    """Compute multi-label classification metrics from per-label scores.

    Args:
        labels: Binary indicator matrix of true labels, one row per example.
            Lists and arrays are both accepted.
        preds: Per-label scores with the same layout as ``labels``. Lists and
            arrays are both accepted.
        threshold: A label is predicted when its score is strictly greater
            than this value.

    Returns:
        A dict with the keys ``accuracy``, ``f1``, ``recall`` and ``precision``.
        F1, recall and precision are sample-averaged; when a per-sample score
        is undefined it counts as 0 (``zero_division=0``).
    """
    labels = np.asarray(labels)
    binary_preds = (np.asarray(preds) > threshold).astype(int)

    return {
        'accuracy': accuracy_score(labels, binary_preds),
        'f1': f1_score(labels, binary_preds, average='samples', zero_division=0),
        'recall': recall_score(labels, binary_preds, average='samples', zero_division=0),
        'precision': precision_score(labels, binary_preds, average='samples', zero_division=0),
    }

In [13]:
# Loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5, eps=1e-8)  # Only parameters that require gradients are updated

num_epochs = 3

# Datasets and DataLoaders (built first: the scheduler needs the batch count)
train_dataset = CustomDataset(train_tokenized_texts, train_binary_labels)
test_dataset = CustomDataset(test_tokenized_texts, test_binary_labels)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Learning-rate scheduler.
# train() steps the scheduler once per batch, so the schedule length is the
# number of batches times the number of epochs — not the number of examples,
# which would make the learning rate decay far too slowly.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [14]:
# Hyperparameter configuration.
# This cell rebinds `criterion`, `optimizer`, `total_steps` and `scheduler`, so
# the objects created here are the ones used by any cell executed afterwards.
learning_rate = 1e-5
# NOTE(review): the DataLoaders visible in this notebook are built with a
# literal batch size, not with this variable — confirm both stay in sync.
batch_size = 16
num_epochs = 3
eps = 1e-8
weight_decay = 0.01
warmup_steps = 0

# Loss function and optimizer (all model parameters are handed to the optimizer)
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps, weight_decay=weight_decay)

# Learning-rate scheduler: linear schedule over (batches per epoch) x (epochs) steps
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [15]:
model_path = os.path.join(PROJECT_ROOT, 'models')
bert_name = f'{model_path}/bert_model.pt'

# Resolved outside the guard so `device` exists even when a cached checkpoint
# is reused and the training loop is skipped.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training is skipped when a checkpoint already exists on disk.
if not os.path.exists(bert_name):
    # Create the output directory up front: otherwise a missing folder is only
    # discovered by torch.save after a whole epoch has been trained.
    os.makedirs(model_path, exist_ok=True)

    # Training and evaluation loop
    model.to(device)

    num_epochs = 3
    best_f1 = 0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')

        train_loss = train(model, train_dataloader, criterion, optimizer, scheduler, device)
        print(f'Training loss: {train_loss:.4f}')

        val_loss, val_labels, val_preds = evaluate(model, test_dataloader, criterion, device)
        print(f'Validation loss: {val_loss:.4f}')

        metrics = compute_metrics(np.array(val_labels), np.array(val_preds))
        print(f'Validation metrics: {metrics}')

        # Keep only the weights of the epoch with the best sample-averaged F1.
        if metrics['f1'] > best_f1:
            print(f'Saving best model with F1 score: {metrics["f1"]:.4f}')
            torch.save(model.state_dict(), bert_name)
            best_f1 = metrics['f1']

In [16]:
# Load the fine-tuned weights.
# map_location='cpu' lets a checkpoint saved from a GPU session be read on a
# machine without CUDA; load_state_dict then copies the values into the model's
# own parameters, whichever device they live on.
model.load_state_dict(torch.load(bert_name, map_location='cpu'))
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [21]:
def extract_embeddings_in_batches(texts, model, tokenizer, batch_size=8):
    """Compute one embedding per text by mean-pooling the last hidden layer.

    Texts are tokenised and fed to the model in batches. The tokenizer output
    is moved to the device of the model's parameters before the forward pass,
    so the function works whether the model lives on CPU or GPU. Pooling
    averages only over real tokens (``attention_mask == 1``), so a text's
    embedding does not depend on how much padding its batch received.

    Args:
        texts: Sliceable, sized collection of strings.
        model: Model called as ``model(**inputs, output_hidden_states=True,
            return_dict=True)`` whose output exposes ``.hidden_states``.
        tokenizer: Callable returning a mapping of tensors; when it contains
            ``attention_mask``, that mask drives the pooling.
        batch_size: Number of texts per forward pass.

    Returns:
        A NumPy array with one row per text.
    """
    # Use the device of the model's parameters; fall back to CPU for models
    # that expose no parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True, return_dict=True)
        hidden_states = outputs.hidden_states[-1]  # Last hidden layer

        mask = inputs.get('attention_mask')
        if mask is None:
            # No mask available: plain mean over the sequence dimension.
            embeddings = hidden_states.mean(dim=1)
        else:
            # Masked mean: padded positions get weight 0.
            weights = mask.unsqueeze(-1).to(hidden_states.dtype)
            token_counts = weights.sum(dim=1).clamp(min=1)
            embeddings = (hidden_states * weights).sum(dim=1) / token_counts
        all_embeddings.append(embeddings.cpu().numpy())
    return np.concatenate(all_embeddings, axis=0)

def save_embeddings(embeddings, embeddings_path):
    """
    Save the embeddings in NumPy's .npy format at exactly ``embeddings_path``.

    The array is written through an open file handle: when ``np.save`` is given
    a path that does not end in ``.npy`` it appends that suffix, and the file
    would then not exist under the name the caller checks and loads afterwards.

    Args:
    embeddings (numpy.ndarray): Embeddings to be saved.
    embeddings_path (str): Destination file path (any extension).
    """
    parent_dir = os.path.dirname(embeddings_path)
    if parent_dir:
        # Make sure the destination folder exists before writing.
        os.makedirs(parent_dir, exist_ok=True)

    with open(embeddings_path, 'wb') as handle:
        np.save(handle, embeddings)
    print(f'Embeddings salvos em {embeddings_path}')

def load_embeddings(embeddings_path):
    """
    Read back embeddings stored in a NumPy file.

    Args:
    embeddings_path (str): Location of the file holding the embeddings.

    Returns:
    numpy.ndarray: The array read from the file.

    Raises:
    FileNotFoundError: If nothing exists at ``embeddings_path``.
    """
    # Guard clause: fail early when the file is missing.
    if not os.path.exists(embeddings_path):
        raise FileNotFoundError(f'Embeddings file not found at {embeddings_path}')

    print(f'Loading embeddings from {embeddings_path}')
    return np.load(embeddings_path)

def train_bertopic(docs, clean_text=None, use_embeddings=False, precomputed_embeddings=None, bert_model=None, tokenizer=None, umap_n_neighbors=15, umap_n_components=5, umap_min_dist=0.0, umap_metric='cosine', vectorizer_ngram_range=(1, 2), stop_words=stop_words):
    """Fit a BERTopic model, optionally on externally supplied embeddings.

    Args:
        docs: Documents to model.
        clean_text: Optional callable applied to every document before fitting.
        use_embeddings: When True, the model is fitted on explicit embeddings
            (``precomputed_embeddings`` if given, otherwise computed with
            ``bert_model`` and ``tokenizer``). When False, BERTopic builds its
            own embeddings.
        precomputed_embeddings: Embeddings aligned with ``docs`` (one per doc).
        bert_model: Model used to compute embeddings when none are supplied.
        tokenizer: Tokenizer paired with ``bert_model``.
        umap_n_neighbors, umap_n_components, umap_min_dist, umap_metric:
            Forwarded to ``UMAP``.
        vectorizer_ngram_range: Forwarded to ``CountVectorizer``.
        stop_words: Stop words for ``CountVectorizer``; any non-list iterable
            is converted to a list, ``None`` is passed through.

    Returns:
        The fitted BERTopic model.

    Raises:
        AssertionError: If embeddings must be computed but ``bert_model`` or
            ``tokenizer`` is missing.
        ValueError: If ``precomputed_embeddings`` does not contain exactly one
            entry per document.
    """
    # Pre-process the documents
    if clean_text:
        docs = [clean_text(doc) for doc in docs]

    if stop_words is not None and not isinstance(stop_words, list):
        stop_words = list(stop_words)

    # BERTopic components
    umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric)
    vectorizer_model = CountVectorizer(ngram_range=vectorizer_ngram_range, stop_words=stop_words)

    # Resolve the embeddings to fit on (None -> let BERTopic embed the docs)
    embeddings = None
    if use_embeddings:
        embeddings = precomputed_embeddings
        if embeddings is None:
            assert bert_model is not None, "O modelo deve ser fornecido quando use_embeddings=True"
            assert tokenizer is not None, "O tokenizador deve ser fornecido quando use_embeddings=True"

            # Extract embeddings for the documents in batches
            embeddings = extract_embeddings_in_batches(docs, bert_model, tokenizer)
        elif len(embeddings) != len(docs):
            raise ValueError(
                f'precomputed_embeddings has {len(embeddings)} rows but there are {len(docs)} documents'
            )

    # The model is created on every path, so precomputed embeddings no longer
    # reach the return statement with `topic_model` undefined.
    topic_model = BERTopic(umap_model=umap_model, vectorizer_model=vectorizer_model)
    if embeddings is None:
        topic_model.fit(docs)
    else:
        topic_model.fit(docs, embeddings)

    return topic_model

In [18]:
# Baseline topic model: BERTopic fitted on the cleaned training texts, with
# train_bertopic called with its defaults (no external embeddings).
bertopic_name = f'{model_path}/bertopic_model.pt'

# Train and persist the model only when no saved copy exists yet.
if not os.path.exists(bertopic_name):
    text_train = train_data['cleaned_text'].tolist()
    bertopic_model = train_bertopic(docs=text_train)
    bertopic_model.save(bertopic_name)

# Always work with the copy loaded from disk, whether it was just trained or cached.
# NOTE(review): BERTopic.load presumably unpickles the file — only load files
# produced by this project; confirm the serialization format.
bertopic_model = BERTopic.load(bertopic_name)
bertopic_model.visualize_topics()

In [22]:
# Embeddings of the cleaned training texts, computed with the fine-tuned BERT
# model and cached on disk.
embeddings_name = f'{model_path}/embeddings_bert_model.pt'
if not os.path.exists(embeddings_name):
    text_embeddings = train_data['cleaned_text'].tolist()
    embeddings = extract_embeddings_in_batches(text_embeddings, model, tokenizer)
    save_embeddings(embeddings, embeddings_name)

# Always read the embeddings back from the cache file.
embeddings = load_embeddings(embeddings_name)

# Second topic model: BERTopic fitted on the same texts, using the embeddings
# loaded above instead of BERTopic's own.
bertopic_custom_embeddings_name = f'{model_path}/bertopic_custom_embeddings_model.pt'
if not os.path.exists(bertopic_custom_embeddings_name):
    text_train = train_data['cleaned_text'].tolist()
    bertopic_custom_embeddings_model = train_bertopic(docs=text_train, use_embeddings=True, precomputed_embeddings=embeddings)
    bertopic_custom_embeddings_model.save(bertopic_custom_embeddings_name)

# Always work with the copy loaded from disk, whether it was just trained or cached.
bertopic_custom_embeddings_model = BERTopic.load(bertopic_custom_embeddings_name)
bertopic_custom_embeddings_model.visualize_topics()

