In [None]:
# Importação de bibliotecas

import os
import sys
import pandas as pd
import numpy as np
import polars as pl
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, Dataset
from umap import UMAP

In [None]:
# Project root definition
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this drive/folder exists.
PROJECT_ROOT = 'G:/Csouza/nlp/topic_modeling'

# Make the project root the working directory so relative paths resolve from it.
os.chdir(PROJECT_ROOT)

# Put the project root first on the module search path.
sys.path.insert(0, PROJECT_ROOT)

In [None]:
def extract(extract_path, file_name='all_process.xlsx', sheet_name='Sheet1'):
    """Read one sheet of an Excel workbook with polars.

    Args:
        extract_path: Folder that contains the workbook.
        file_name: Workbook file name inside ``extract_path``.
        sheet_name: Name of the sheet to read.

    Returns:
        Whatever ``pl.read_excel`` returns for that sheet.
    """
    workbook = f'{extract_path}/{file_name}'
    return pl.read_excel(workbook, sheet_name=sheet_name)

In [None]:
# Folder under the project root that holds the source spreadsheet.
data_path = os.path.join(PROJECT_ROOT, 'data', 'internal', 'fapesp_projects')

# Original spreadsheet headers -> short snake_case column names.
variables = {
    'N. Processo_B.V': 'n_processo',
    'Data de Início': 'data',
    'Título (Português)': 'titulo',
    'Grande Área do Conhecimento': 'grande_area',
    'Área do Conhecimento': 'area',
    'Subárea do Conhecimento': 'subarea',
    'Palavras-Chave do Processo': 'palavras_chave',
    'Assuntos': 'assuntos',
    'Resumo (Português)': 'resumo',
}

# The 'data' column is parsed with the '%m-%d-%y' format; only the year and
# month derived from it are kept.
start_date_expr = pl.col('data').str.to_datetime('%m-%d-%y')

full_data = (
    extract(data_path)
    .lazy()
    .rename(variables)
    .select(variables.values())
    # Keep rows that have a process number and a non-empty abstract.
    .filter(
        pl.col('n_processo').is_not_null(),
        pl.col('resumo').is_not_null(),
        pl.col('resumo') != '',
    )
    .with_columns(
        start_date_expr.dt.year().alias('ano'),
        start_date_expr.dt.month().alias('mes'),
    )
    .select(pl.exclude('data'))
    .collect()
)

full_data.head(3)

In [None]:
# Modelling subset: rows that have subject labels ('assuntos'), whose 'area'
# equals 'Medicina', and whose start year is 2022 or later.
data_train_test = full_data.filter(pl.col('assuntos').is_not_null(), pl.col('area') == 'Medicina', pl.col('ano') >= 2022)

# Display (rows, columns) of the subset.
data_train_test.shape

In [None]:
def get_spacy_model(model='en_core_web_sm'):
    """Load a spaCy pipeline, downloading it first when it is not installed.

    Args:
        model: Name of the spaCy pipeline package.

    Returns:
        The loaded spaCy pipeline.
    """
    try:
        return spacy.load(model)
    except OSError:
        # spacy.load raised OSError: fetch the package, then load again below.
        from spacy.cli import download
        download(model)
    return spacy.load(model)

In [None]:
# Load spaCy's Portuguese pipeline (get_spacy_model downloads it if loading fails)
nlp = get_spacy_model('pt_core_news_sm')

# Portuguese stop-word set taken from the pipeline's language defaults
stop_words = nlp.Defaults.stop_words

# Matches every character that is not an ASCII letter, a character in the
# À-ÿ range, or whitespace.
# NOTE(review): the À-ÿ range also contains the symbols × and ÷, so those are
# kept as well; confirm this is intended.
special_char_remover = re.compile(r'[^A-Za-zÀ-ÿ\s]')

def clean_text(text):
    """Normalize a Portuguese text: strip symbols, drop stop words, lemmatize.

    Steps:
      1. Delete every character matched by ``special_char_remover``.
      2. Tokenize and discard stop words. The comparison is case-insensitive,
         so capitalized occurrences (e.g. at the start of a sentence) are
         removed too.
      3. Run the full spaCy pipeline on the remaining words and join the lemmas.

    Args:
        text: Raw text to clean.

    Returns:
        str: Space-separated lemmas of the tokens that are not stop words.

    Raises:
        ValueError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("O argumento 'text' deve ser uma string.")

    # Remove special characters
    text = special_char_remover.sub('', text)

    # Only token boundaries are needed for stop-word removal, so run the
    # tokenizer alone (nlp.make_doc) instead of the whole pipeline.
    # The stop-word set holds lowercase forms, hence the .lower() before lookup.
    tokens = [
        token.text
        for token in nlp.make_doc(text)
        if token.text.lower() not in stop_words
    ]

    # Lemmatize the remaining words with the full pipeline
    doc = nlp(' '.join(tokens))
    return ' '.join(token.lemma_ for token in doc)

In [None]:
def mlsmote(X, y, num_samples=100, minority_threshold=5, random_state=None, verbose=True):
    """Oversample rows that carry rare labels by interpolating between neighbours.

    A label is "minority" when its column sum in ``y`` is below
    ``minority_threshold``. Every row holding at least one minority label is a
    candidate. Each synthetic row is a random point on the segment between a
    randomly chosen candidate and one of its nearest candidate neighbours; its
    label vector is the union (clipped sum) of both label vectors.

    Args:
        X: 2-D array-like of features, one row per example.
        y: 2-D binary indicator array-like, one column per label.
        num_samples: Number of synthetic rows to generate. Values <= 0 return
            copies of the inputs unchanged.
        minority_threshold: Labels with fewer positive examples than this are
            treated as minority labels.
        random_state: Optional seed. ``None`` keeps using NumPy's global random
            state, exactly as before.
        verbose: When True (default) print the diagnostic output.

    Returns:
        tuple: ``(X_resampled, y_resampled)`` with the synthetic rows appended
        after the original ones.

    Raises:
        ValueError: If no row carries a minority label, or fewer than two do.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # np.random (module) and RandomState expose the same choice/randint/rand API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Identify minority labels and the rows that carry at least one of them
    label_counts = np.sum(y, axis=0)
    minority_classes = np.where(label_counts < minority_threshold)[0]
    minority_indices = np.where(y[:, minority_classes].any(axis=1))[0]

    if verbose:
        print("Contadores de rótulos:", label_counts)
        print("Classes minoritárias:", minority_classes)
        print("Índices minoritários:", minority_indices.tolist())

    if len(minority_indices) == 0:
        raise ValueError("Nenhum exemplo minoritário foi encontrado. Verifique os dados de entrada.")

    minority_X = X[minority_indices]
    minority_y = y[minority_indices]

    if len(minority_X) < 2:
        raise ValueError("Exemplos minoritários insuficientes para aplicar MLSMOTE. Verifique os dados de entrada.")

    # Nothing to generate: stacking an empty array onto X would fail.
    if num_samples <= 0:
        return X.copy(), y.copy()

    # kneighbors cannot return more neighbours than there are fitted rows, so
    # cap k; the check above guarantees k >= 2 (the sample itself plus one).
    n_neighbors = min(5, len(minority_X))
    nn = NearestNeighbors(n_neighbors=n_neighbors).fit(minority_X)

    new_X = []
    new_y = []
    for _ in range(num_samples):
        idx = rng.choice(len(minority_X))
        sample_X = minority_X[idx]
        sample_y = minority_y[idx]

        # Position 0 is skipped on the assumption that it is the query row itself.
        neighbors = nn.kneighbors([sample_X], return_distance=False)[0]
        neighbor_idx = neighbors[rng.randint(1, len(neighbors))]
        neighbor_X = minority_X[neighbor_idx]
        neighbor_y = minority_y[neighbor_idx]

        new_sample_X = sample_X + rng.rand() * (neighbor_X - sample_X)
        new_sample_y = np.clip(sample_y + neighbor_y, 0, 1)

        if verbose:
            print(f"Exemplo gerado - new_sample_X: {new_sample_X}, new_sample_y: {new_sample_y}")

        new_X.append(new_sample_X)
        new_y.append(new_sample_y)

    X_resampled = np.vstack([X, np.array(new_X)])
    y_resampled = np.vstack([y, np.array(new_y)])

    return X_resampled, y_resampled

In [None]:
# Convert the polars subset to pandas for the scikit-learn / transformers steps
data = data_train_test.to_pandas()

# Missing titles/keywords become empty strings first: a bare astype(str) would
# turn a missing value into the literal text 'None'/'nan', which would then be
# fed to the model as if it were content.
data['titulo'] = data['titulo'].fillna('').astype(str)
data['palavras_chave'] = data['palavras_chave'].fillna('').astype(str)

# Model input = cleaned abstract + cleaned title + cleaned keywords
data['cleaned_text'] = data['resumo'].apply(clean_text)
data['cleaned_text'] += ' Título: ' + data['titulo'].apply(clean_text) + ' Palavras-chave: ' + data['palavras_chave'].apply(clean_text)

In [None]:
# Split into train and test (80/20, fixed seed). The slices are copied so the
# column assignments below write to independent frames instead of triggering
# pandas' SettingWithCopyWarning.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data = train_data.copy()
test_data = test_data.copy()

# BERT tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

train_tokenized_texts = tokenizer(train_data['cleaned_text'].tolist(), padding=True, truncation=True, return_tensors="pt")
test_tokenized_texts = tokenizer(test_data['cleaned_text'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Turn the ':'-separated subject string into a list of labels
# NOTE(review): the pieces are not stripped; confirm the source has no spaces around ':'.
train_data['assuntos_list'] = train_data['assuntos'].apply(lambda x: x.split(':'))
test_data['assuntos_list'] = test_data['assuntos'].apply(lambda x: x.split(':'))

# Binarize the labels (the label vocabulary is learned from the training split only)
mlb = MultiLabelBinarizer()

train_binary_labels = mlb.fit_transform(train_data['assuntos_list'])
test_binary_labels = mlb.transform(test_data['assuntos_list'])

# Despite the variable name, this is the MEDIAN number of examples per label.
mean_exemples = np.median(np.sum(train_binary_labels, axis=0)).round()

# Apply MLSMOTE to the training data
# NOTE(review): the "features" being interpolated are token ids. A point
# between two vocabulary ids is just another, unrelated vocabulary id, so the
# synthetic rows are not meaningful text. Consider oversampling in embedding
# space or duplicating minority rows instead; confirm this is intended.
X_resampled, y_resampled = mlsmote(train_tokenized_texts['input_ids'].numpy(), train_binary_labels, num_samples=100, minority_threshold=mean_exemples)

# Only 'input_ids' is replaced here; the other fields of train_tokenized_texts
# keep their original row count. Use train_tokenized_texts_resampled for training.
train_tokenized_texts['input_ids'] = torch.tensor(X_resampled, dtype=torch.long)
train_binary_labels = torch.tensor(y_resampled, dtype=torch.float)

# Attention mask for the resampled data. Interpolated ids are fractional and
# are truncated to integers when converted to long, so the mask is computed on
# the truncated ids: a value such as 0.4 becomes id 0 (padding) and must be
# masked out, whereas `0.4 != 0` on the raw floats would mark it as a real token.
attention_mask = (X_resampled.astype(np.int64) != 0).astype(int)

train_tokenized_texts_resampled = {
    'input_ids': torch.tensor(X_resampled, dtype=torch.long),
    'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
}

train_binary_labels_resampled = torch.tensor(y_resampled, dtype=torch.float)

In [None]:
# Pre-trained BERT with one output per label. problem_type is set explicitly
# so the model's built-in loss (used when `labels` are passed to forward) is
# the multi-label one, BCE-with-logits, regardless of the label dtype.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(mlb.classes_),
    problem_type='multi_label_classification',
)

# Replace the classification head with a small MLP for multi-label output.
# The head must emit RAW LOGITS (no final Sigmoid): the built-in loss applies
# the sigmoid itself, and batch_predict_labels applies torch.sigmoid to
# `outputs.logits`. A Sigmoid layer here would squash the scores twice, which
# confines the "probabilities" to (0.5, 0.73) and flattens the gradients.
# Any code that thresholds `outputs.logits` directly should either apply a
# sigmoid first or compare against 0.
model.classifier = torch.nn.Sequential(
    torch.nn.Linear(model.config.hidden_size, model.config.hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(model.config.hidden_size, len(mlb.classes_)),
)

In [None]:
# Freeze every BERT layer
for param in model.bert.parameters():
    param.requires_grad = False

# Unfreeze the last encoder layer
for param in model.bert.encoder.layer[-1].parameters():
    param.requires_grad = True

# Make sure the classification head is trainable
for param in model.classifier.parameters():
    param.requires_grad = True

# Loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5, eps=1e-8)  # Only parameters that require gradients are updated

num_epochs = 3

# Learning-rate scheduler.
# The scheduler is stepped once per batch, so the schedule length must be the
# number of optimizer steps (batches x epochs), not the number of samples;
# otherwise the learning rate barely decays over the run.
train_batch_size = 16  # must match the batch_size given to the training DataLoader
num_train_samples = len(train_tokenized_texts['input_ids'])
steps_per_epoch = (num_train_samples + train_batch_size - 1) // train_batch_size  # ceil division
total_steps = steps_per_epoch * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with label vectors.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to an
            indexable holding one entry per example.
        labels: Indexable of label vectors, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with every encoding field plus float ``'labels'`` for row ``idx``."""
        # torch.as_tensor reuses values that are already tensors and converts
        # arrays/lists; torch.tensor(existing_tensor) would copy-construct and
        # emit a UserWarning for every item.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

# Build the datasets: training uses the resampled encodings and labels,
# evaluation uses the test split's encodings and binarized labels.
train_dataset_resampled = CustomDataset(train_tokenized_texts_resampled, train_binary_labels_resampled)
test_dataset = CustomDataset(test_tokenized_texts, torch.tensor(test_binary_labels, dtype=torch.float))

# Batches of 16; only the training loader shuffles.
train_dataloader = DataLoader(train_dataset_resampled, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
def train(model, dataloader, criterion, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    The loss is the one computed by the model itself (``outputs.loss``);
    ``criterion`` is accepted but not used.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            and returning an object with a ``loss`` attribute.
        dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Unused.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        optimizer.zero_grad()

        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        batch_loss = outputs.loss
        batch_loss.backward()

        optimizer.step()
        scheduler.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Evaluate the model without gradient tracking.

    The loss is the one computed by the model itself (``outputs.loss``);
    ``criterion`` is accepted but not used. The predictions returned are the
    model's ``logits`` output, unchanged.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            and returning an object with ``loss`` and ``logits``.
        dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Unused.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(mean_loss, labels, preds)`` where ``labels`` and ``preds``
        are NumPy arrays with one row per example.
    """
    model.eval()
    loss_sum = 0
    label_rows, pred_rows = [], []

    with torch.no_grad():
        for batch in dataloader:
            targets = batch['labels'].to(device)
            outputs = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=targets,
            )

            loss_sum += outputs.loss.item()
            label_rows.extend(targets.cpu().numpy())
            pred_rows.extend(outputs.logits.cpu().numpy())

    mean_loss = loss_sum / len(dataloader)
    return mean_loss, np.array(label_rows), np.array(pred_rows)

In [None]:
# Metric computation
def compute_metrics(labels, preds, threshold=0.5):
    """Binarize scores at ``threshold`` and compute multi-label metrics.

    Args:
        labels: Binary indicator array of true labels.
        preds: Array of scores, compared element-wise with ``threshold``.
            NOTE(review): assumes the scores are on the same scale as the
            threshold (e.g. probabilities for 0.5); verify against the caller.
        threshold: Scores strictly above this value count as positive.

    Returns:
        dict: ``accuracy`` plus sample-averaged ``f1``, ``recall`` and
        ``precision``.
    """
    binary_preds = (preds > threshold).astype(int)

    scores = {'accuracy': accuracy_score(labels, binary_preds)}
    sample_averaged = (
        ('f1', f1_score),
        ('recall', recall_score),
        ('precision', precision_score),
    )
    for metric_name, metric_fn in sample_averaged:
        scores[metric_name] = metric_fn(labels, binary_preds, average='samples')

    return scores

In [None]:
model_path = os.path.join(PROJECT_ROOT, 'models')
model_name = f'{model_path}/best_model.pt'

# Train only when no checkpoint exists at model_name yet.
if not os.path.exists(model_name):
    # torch.save does not create missing folders.
    os.makedirs(model_path, exist_ok=True)

    # Training and evaluation loop
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    num_epochs = 3
    # Start below any reachable score so the first epoch always writes a
    # checkpoint; with 0 as the start value, a run whose F1 never rises above
    # 0 would leave no file behind.
    best_f1 = float('-inf')

    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')

        train_loss = train(model, train_dataloader, criterion, optimizer, scheduler, device)
        print(f'Training loss: {train_loss:.4f}')

        # NOTE(review): "validation" here is computed on test_dataloader, so
        # the checkpoint is selected on the test split.
        val_loss, val_labels, val_preds = evaluate(model, test_dataloader, criterion, device)
        print(f'Validation loss: {val_loss:.4f}')

        metrics = compute_metrics(np.array(val_labels), np.array(val_preds))
        print(f'Validation metrics: {metrics}')

        if metrics['f1'] > best_f1:
            print(f'Saving best model with F1 score: {metrics["f1"]:.4f}')
            torch.save(model.state_dict(), model_name)
            best_f1 = metrics['f1']


In [None]:
# Load the fine-tuned weights. map_location='cpu' lets a checkpoint that was
# saved from a GPU run be read on a CPU-only machine; load_state_dict then
# copies the values into the model's parameters on whatever device they live.
model.load_state_dict(torch.load(model_name, map_location='cpu'))
model.eval()

# Embedding generation
def get_embeddings(texts, model, tokenizer, batch_size=32):
    """Return the [CLS] embedding of each text from the model's BERT encoder.

    Texts are processed in chunks of ``batch_size`` so the whole corpus is
    never pushed through the encoder in a single forward pass, and the
    tokenized inputs are moved to the device the model lives on.

    Args:
        texts: List of strings.
        model: Module exposing the encoder as ``model.bert``.
        tokenizer: Tokenizer matching the encoder.
        batch_size: Number of texts per forward pass.

    Returns:
        torch.Tensor: CPU tensor with one row per text (position 0 of the last
        hidden state). An empty tensor with zero rows for empty ``texts``.
    """
    device = next(model.parameters()).device

    chunks = []
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            outputs = model.bert(**inputs)
        # Use the [CLS] token embeddings; bring them back to the CPU
        chunks.append(outputs.last_hidden_state[:, 0, :].cpu())

    if not chunks:
        return torch.empty((0, 0))
    return torch.cat(chunks, dim=0)

# Generate embeddings for the test split with the fine-tuned model
embeddings = get_embeddings(test_data['cleaned_text'].tolist(), model, tokenizer)

# Check that embeddings were actually produced
if embeddings.size(0) == 0:
    raise ValueError("Os embeddings gerados estão vazios.")

# Debug print to inspect the embeddings
print(f"Shape of embeddings: {embeddings.shape}")

# Initialize BERTopic with a default sentence-embedding model
# NOTE(review): the fit below passes precomputed embeddings from the
# fine-tuned BERT, while `embedding_model` is all-MiniLM-L6-v2. A later
# transform() call without explicit embeddings would presumably embed with
# `embedding_model`; confirm both embedding spaces/dimensions are compatible.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=3, n_components=2, metric='cosine', random_state=42)  # Adjust n_neighbors as needed
topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model)

# Fit the topic model on the test texts, using the precomputed embeddings
try:
    topics, probabilities = topic_model.fit_transform(test_data['cleaned_text'].tolist(), embeddings.numpy())
except ValueError as e:
    raise ValueError(f"Erro ao ajustar o modelo BERTopic: {e}")

# Check that topics were produced
if len(topics) == 0:
    raise ValueError("Nenhum tópico foi gerado. Verifique os dados de entrada e os embeddings.")

# Debug print to inspect the topics
print(f"Number of topics: {len(set(topics))}")

# Check that the topic embeddings (or, when absent, the c-TF-IDF matrix) are
# not empty before visualizing
if topic_model.topic_embeddings_ is not None:
    if topic_model.topic_embeddings_.size == 0:
        raise ValueError("Os embeddings dos tópicos estão vazios.")
else:
    if topic_model.c_tf_idf_.size == 0:
        raise ValueError("A matriz c_tf_idf_ dos tópicos está vazia.")

# Same guard expressed on the shape: neither dimension may be zero
if topic_model.topic_embeddings_ is not None:
    if topic_model.topic_embeddings_.shape[0] == 0 or topic_model.topic_embeddings_.shape[1] == 0:
        raise ValueError("Os embeddings dos tópicos têm tamanho zero.")
else:
    if topic_model.c_tf_idf_.shape[0] == 0 or topic_model.c_tf_idf_.shape[1] == 0:
        raise ValueError("A matriz c_tf_idf_ dos tópicos tem tamanho zero.")

# Debug print to inspect the topic embeddings
print(f"Shape of topic embeddings: {topic_model.topic_embeddings_.shape if topic_model.topic_embeddings_ is not None else 'N/A'}")
print(f"Shape of c_tf_idf_: {topic_model.c_tf_idf_.shape}")

In [None]:
# Render BERTopic's bar-chart visualization for the fitted topic model.
topic_model.visualize_barchart()

In [None]:
def batch_get_embeddings(texts, bert_model, tokenizer, device='cpu'):
    """Mean-pool the second-to-last hidden layer into one vector per text.

    Padding positions are excluded from the mean (weighted by the attention
    mask), so a text's embedding does not depend on how long the other texts
    in the same batch are. For a single text, or texts of equal token length,
    the result equals a plain mean over the sequence.

    NOTE(review): the topic model in this notebook is fitted on [CLS] vectors
    of the last hidden layer (see get_embeddings), whereas this function pools
    the penultimate layer; confirm the two spaces are meant to be mixed.

    Args:
        texts: List of strings.
        bert_model: Model that returns ``hidden_states`` when called with
            ``output_hidden_states=True``.
        tokenizer: Tokenizer matching the model.
        device: Device the tokenized inputs are moved to.

    Returns:
        numpy.ndarray: One row per text.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states[-2]

    # Zero out padding positions and divide by the number of real tokens
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against an all-padding row

    embeddings = (token_sum / token_count).cpu().numpy()
    return embeddings

def batch_predict_labels(texts, model, tokenizer, threshold=0.5, top_k=None, device='cpu'):
    """Score a batch of texts and binarize the scores per label.

    A sigmoid is applied to the model's ``logits`` output to obtain the
    scores. NOTE(review): this presumes ``logits`` are raw, unsquashed values;
    verify against the classifier head.

    Args:
        texts: List of strings.
        model: Classifier returning an object with ``logits``.
        tokenizer: Tokenizer matching the model.
        threshold: Used when ``top_k`` is falsy: scores strictly above it
            become 1.
        top_k: When truthy, mark the ``top_k`` highest-scoring labels of each
            text instead of thresholding.
        device: Device the tokenized inputs are moved to.

    Returns:
        tuple: ``(preds, probs)``: the binarized predictions and the sigmoid
        scores, both NumPy arrays with one row per text.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    probs = torch.sigmoid(logits).cpu().numpy()

    def _binarize(row):
        # Threshold mode yields ints; top-k mode yields a float 0/1 vector.
        if not top_k:
            return (row > threshold).astype(int)
        selected = np.zeros(row.shape)
        selected[np.argsort(row)[-top_k:]] = 1  # indices of the top_k most probable labels
        return selected

    preds = np.array([_binarize(row) for row in probs])
    return preds, probs

def apply_bertopic(texts, topic_model, bert_model=None, tokenizer=None, use_bert_embeddings=False, device='cpu', top_n_topics=None, top_n_tokens=None):
    """Assign topics to texts and return the non-outlier ones with their tokens.

    Args:
        texts: List of strings.
        topic_model: Fitted topic model exposing ``transform`` and ``get_topic``.
        bert_model: Model used to embed the texts when ``use_bert_embeddings``.
        tokenizer: Tokenizer used together with ``bert_model``.
        use_bert_embeddings: When True (and ``bert_model``/``tokenizer`` are
            truthy), embed with ``batch_get_embeddings`` and pass the result to
            ``transform``; otherwise call ``transform`` on the texts alone.
        device: Device used for the embedding step.
        top_n_topics: When truthy, keep only the first N entries of the result.
        top_n_tokens: When truthy, keep only the first N tokens of each topic.

    Returns:
        list: ``(topic_id, topic_tokens)`` tuples, skipping topic ``-1``.
    """
    if use_bert_embeddings and bert_model and tokenizer:
        doc_embeddings = batch_get_embeddings(texts, bert_model, tokenizer, device=device)
        assigned_topics, _ = topic_model.transform(texts, embeddings=doc_embeddings)
    else:
        assigned_topics, _ = topic_model.transform(texts)

    details = []
    for topic_id in assigned_topics:
        if topic_id == -1:
            continue
        topic_tokens = topic_model.get_topic(topic_id)
        if top_n_tokens:
            topic_tokens = topic_tokens[:top_n_tokens]
        details.append((topic_id, topic_tokens))

    if top_n_topics:
        return details[:top_n_topics]
    return details

def consolidate_results(texts, model, tokenizer, topic_model, mlb, threshold=0.5, top_k=None, use_bert_embeddings=False, device='cpu', top_n_topics=None, top_n_tokens=None):
    """Combine classifier labels with topic words for low-confidence texts.

    Every text gets the labels decoded from the classifier's predictions. When
    the highest score of a text is below ``threshold``, the topic model is
    applied to that text as well and the words of the returned topics are
    appended to its labels.

    Args:
        texts: List of strings.
        model: Classifier passed to ``batch_predict_labels``.
        tokenizer: Tokenizer matching the classifier.
        topic_model: Fitted topic model passed to ``apply_bertopic``.
        mlb: Fitted label binarizer used to decode predictions.
        threshold: Score threshold, used for prediction and for the
            low-confidence test.
        top_k: Forwarded to ``batch_predict_labels``.
        use_bert_embeddings: Forwarded to ``apply_bertopic``.
        device: Device used for inference.
        top_n_topics: Forwarded to ``apply_bertopic``.
        top_n_tokens: Forwarded to ``apply_bertopic``.

    Returns:
        tuple: ``(all_labels, topic_details)``. ``all_labels[i]`` is a list of
        label strings in a stable order (classifier labels first, then new
        topic words). ``topic_details[i]`` is the ``apply_bertopic`` result for
        text ``i``, or ``[]`` when the fallback was not used.
    """
    preds, probs = batch_predict_labels(texts, model, tokenizer, threshold, top_k, device)
    known_labels = mlb.inverse_transform(preds)

    all_labels = []
    topic_details = []

    for i, prob in enumerate(probs):
        # Always a list, whichever branch is taken below.
        labels = list(known_labels[i])

        if np.max(prob) < threshold:
            topics = apply_bertopic([texts[i]], topic_model, bert_model=model, tokenizer=tokenizer, use_bert_embeddings=use_bert_embeddings, device=device, top_n_topics=top_n_topics, top_n_tokens=top_n_tokens)
            for _, tokens in topics:
                for token in tokens or []:
                    # Topic tokens come as (word, score) pairs; only the word
                    # is a label. The scores remain available in topic_details.
                    word = token[0] if isinstance(token, (tuple, list)) else token
                    if word not in labels:  # order-preserving de-duplication
                        labels.append(word)
            topic_details.append(topics)
        else:
            topic_details.append([])

        all_labels.append(labels)

    return all_labels, topic_details

def batch_predict(texts, model, tokenizer, topic_model, mlb, threshold=0.5, top_k=None, use_bert_embeddings=False, device='cpu', top_n_topics=None, top_n_tokens=None):
    """Predict labels (with topic fallback) for a batch of texts.

    Moves the model to ``device``, switches it to evaluation mode and
    delegates to ``consolidate_results``.

    Args:
        texts: List of strings.
        model: Classifier; also used as the embedding model when
            ``use_bert_embeddings`` is True.
        tokenizer: Tokenizer matching the classifier.
        topic_model: Fitted topic model.
        mlb: Fitted label binarizer used to decode predictions.
        threshold: Score threshold.
        top_k: Optional number of top labels to keep per text.
        use_bert_embeddings: Whether the topic step embeds texts with ``model``.
        device: Device used for inference.
        top_n_topics: Optional cap on the number of topics per text.
        top_n_tokens: Optional cap on the number of tokens per topic.

    Returns:
        list: One ``(labels, topic_details)`` tuple per text.
    """
    # Move the model to the device and set evaluation mode. The same object
    # serves as the embedding model, so this single call covers both roles.
    model.to(device)
    model.eval()

    all_labels, topic_details = consolidate_results(texts, model, tokenizer, topic_model, mlb, threshold, top_k, use_bert_embeddings, device, top_n_topics, top_n_tokens)
    return list(zip(all_labels, topic_details))

In [None]:
# Demo on the first four cleaned test texts
test_texts = test_data['cleaned_text'].tolist()[0:4]

# Inference settings passed to batch_predict below
threshold = 0.7
top_k = 5
top_n_topics = 3
top_n_tokens = 5

# Run the batch prediction
# NOTE(review): with use_bert_embeddings=False the topic step calls
# topic_model.transform on the raw texts; confirm that matches the embeddings
# the topic model was fitted with.
predicted_labels_and_topics = batch_predict(
    texts=test_texts,
    model=model,
    tokenizer=tokenizer,
    topic_model=topic_model,
    mlb=mlb,
    threshold=threshold,
    top_k=top_k,
    use_bert_embeddings=False,  # Whether to use the BERT embeddings for the topic step
    device='cpu',  # Or 'cuda' when running on a GPU
    top_n_topics=top_n_topics,
    top_n_tokens=top_n_tokens
)

# Show the results
for text, (labels, topics) in zip(test_texts, predicted_labels_and_topics):
    print(f"Texto: {text}")
    print(f"Rótulos preditos: {labels}")
    if topics:
        print("Tópicos e tokens associados:")
        for topic, tokens in topics:
            print(f"  Tópico {topic}: {tokens}")
    print()
