<a href="https://colab.research.google.com/github/daycardoso/bert-vs-modernbert-valueeval24/blob/main/valores_modern_bert_final1_cru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Inicialização

In [14]:
from google.colab import drive
drive.mount('/content/drive')

from google.colab import userdata
import os
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
os.environ['WANDB_API_KEY'] = userdata.get('WANDB_API_KEY')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import pandas as pd
import json
import os # Adicionado para construir os caminhos

# Carregar os Dados em Ingles
home = "/content/drive/MyDrive/Mestrado/DetectionOfHumanValuesInTexts/Colab_Experimentos/"

folder_treino = home + "training-english/"
folder_validacao = home + "validation-english/"
folder_teste = home + "test-english/"

# Json value-categories - Carregue o JSON uma única vez
caminho_json_valores = os.path.join(home, "value-categories.json")
with open(caminho_json_valores, 'r') as f:
    categorias_valores = json.load(f)

# Json value-categories
categorias_valores = json.load(open(home + "value-categories.json"))


# Aplicando o pre-processamento de adição de contexto do Hierocles of Alexandria at Touché

# Treinamento dos modelos para cada direction

In [16]:
!pip install "numpy<2.0"



In [17]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [18]:
import os
import gc
import random
import numpy as np
import pandas as pd
import torch
import datasets
import wandb
from scipy.special import expit as sigmoid
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoConfig,
    ModernBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)

# Configuração para reduzir fragmentação de memória
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# --- 1. Inicialização e Configuração Global ---

try:
    wandb.init(
        project='touche_multi_head',
        name='modern-bert_seq_class_19_values_no_context_cru'
    )
except Exception as e:
    print(f"W&B initialization failed: {e}. Running without logging.")

VALORES = [
    'Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism',
    'Achievement', 'Power: dominance', 'Power: resources', 'Face',
    'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules',
    'Conformity: interpersonal', 'Humility', 'Benevolence: caring',
    'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature',
    'Universalism: tolerance'
]
NUM_LABELS = len(VALORES)
ID2LABEL = {i: l for i, l in enumerate(VALORES)}
LABEL2ID = {l: i for i, l in enumerate(VALORES)}

PRETRAINED_MODEL = 'answerdotai/ModernBERT-base'
MAX_LENGTH = 512

# --- 2. Preparação do Tokenizador ---

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

# --- 3. Funções de Processamento de Dados ---

def preprocess_function(examples):
    """Tokeniza os textos do dataset."""
    return tokenizer(examples['Text'], padding='max_length', truncation=True, max_length=MAX_LENGTH)

def load_and_process_dataset(directory, tokenizer_instance):
    """Carrega, mescla, processa e tokeniza o dataset."""
    sentences_file_path = os.path.join(directory, 'sentences.tsv')
    labels_file_path = os.path.join(directory, 'final_labels_cru.tsv')

    if not os.path.exists(sentences_file_path) or not os.path.exists(labels_file_path):
        raise FileNotFoundError(f"Arquivos de dataset não encontrados em {directory}")

    key_column_types = {'Text-ID': str, 'Sentence-ID': str}
    try:
        data_df = pd.read_csv(
            sentences_file_path,
            sep='\t',
            dtype=key_column_types,
            usecols=['Text-ID', 'Sentence-ID', 'Text']
        )
    except ValueError as e:
        raise ValueError(f"Verifique se 'sentences.tsv' contém as colunas 'Text-ID', 'Sentence-ID' e 'Text'. Erro: {e}")

    labels_df = pd.read_csv(labels_file_path, sep='\t', dtype=key_column_types)
    merged_df = pd.merge(data_df, labels_df, on=['Text-ID', 'Sentence-ID'])

    labels_matrix = merged_df[VALORES].values.astype(np.float32)
    merged_df['labels'] = [row.astype(np.float32) for row in labels_matrix]

    # Exemplo aleatório para inspeção
    random_idx = random.randint(0, len(merged_df) - 1)
    sample_info = {
        'Text-ID': merged_df['Text-ID'][random_idx],
        'Sentence-ID': merged_df['Sentence-ID'][random_idx],
        'Text': merged_df['Text'][random_idx],
        'labels': [ID2LABEL[i] for i, label in enumerate(merged_df['labels'][random_idx]) if label == 1] or 'Nenhum'
    }

    print("\n" + "="*35)
    print(f"=== Exemplo de Texto Pré-processado ({os.path.basename(directory)}) ===")
    print(f"Text-ID: {sample_info['Text-ID']}")
    print(f"Sentence-ID: {sample_info['Sentence-ID']}")
    print(f"Texto: {sample_info['Text']}")
    print(f"Rótulos: {sample_info['labels']}")
    print("="*35 + "\n")

    dataset = datasets.Dataset.from_pandas(merged_df)
    dataset = dataset.map(preprocess_function, batched=True, load_from_cache_file=False)
    valid_cols = ['input_ids', 'attention_mask', 'labels']
    dataset = dataset.remove_columns([c for c in dataset.column_names if c not in valid_cols])
    dataset.set_format("torch")
    return dataset

# --- 4. Função de Métricas ---

def compute_metrics(eval_pred):
    """Calcula métricas para avaliação multi-label."""
    logits, true_labels = eval_pred
    probs = sigmoid(logits)
    preds = (probs > 0.5).astype(int)

    f1_macro = f1_score(true_labels, preds, average='macro', zero_division=0)
    f1_micro = f1_score(true_labels, preds, average='micro', zero_division=0)
    precision_macro = precision_score(true_labels, preds, average='macro', zero_division=0)
    recall_macro = recall_score(true_labels, preds, average='macro', zero_division=0)
    subset_accuracy = (true_labels == preds).all(axis=1).mean()

    auc_scores = []
    for i in range(NUM_LABELS):
        if len(np.unique(true_labels[:, i])) > 1:
            auc_scores.append(roc_auc_score(true_labels[:, i], probs[:, i]))
    roc_auc = np.mean(auc_scores) if auc_scores else float('nan')

    return {
        'subset_accuracy': subset_accuracy,
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'roc_auc': roc_auc
    }



In [19]:
# --- 5. Lógica Principal de Treinamento ---

def main():
    """Função principal que executa o pipeline."""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Usando dispositivo: {device}')

    print('Carregando e processando datasets...')
    train_ds = load_and_process_dataset(folder_treino, tokenizer)
    val_ds = load_and_process_dataset(folder_validacao, tokenizer)
    test_ds = load_and_process_dataset(folder_teste, tokenizer)
    print('Datasets prontos.')

    config = AutoConfig.from_pretrained(
        PRETRAINED_MODEL,
        num_labels=NUM_LABELS,
        id2label=ID2LABEL,
        label2id=LABEL2ID,
        problem_type='multi_label_classification'
    )
    model = ModernBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, config=config)

    training_args = TrainingArguments(
        output_dir='modern-bert-seq-class-values-no-context_cru',
        report_to='wandb',
        eval_strategy='steps',
        eval_steps=767,
        save_strategy='steps',
        save_steps=767,
        save_total_limit=2,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        num_train_epochs=20,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        fp16=True if device == 'cuda' else False,
        lr_scheduler_type='linear',
        warmup_ratio=0.1,
        seed=2025,
        overwrite_output_dir=True,
        push_to_hub=True,
        hub_model_id='DayCardoso/modern-bert-seq-class-values-no-context_cru',
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )

    print('Iniciando treinamento...')
    torch.cuda.empty_cache()  # Liberar memória antes do treinamento
    train_res = trainer.train()

    metrics = train_res.metrics
    metrics['train_samples'] = len(train_ds)
    trainer.log_metrics('train', metrics)
    trainer.save_model()
    trainer.save_state()
    trainer.save_metrics('train', metrics)

    print('\nAvaliação final no dataset de validação...')
    torch.cuda.empty_cache()  # Liberar memória antes da avaliação
    eval_res = trainer.evaluate()
    eval_res['eval_samples'] = len(val_ds)
    trainer.log_metrics('eval', eval_res)
    trainer.save_metrics('eval', eval_res)

    print('\nAvaliação no dataset de teste...')
    torch.cuda.empty_cache()  # Liberar memória antes do teste
    test_res = trainer.evaluate(eval_dataset=test_ds, metric_key_prefix='test')
    test_res['test_samples'] = len(test_ds)
    trainer.log_metrics('test', test_res)
    trainer.save_metrics('test', test_res)
    print('Teste concluído! Métricas:', test_res)

    wandb.finish()

if __name__ == '__main__':

    main()

Usando dispositivo: cuda
Carregando e processando datasets...

=== Exemplo de Texto Pré-processado () ===
Text-ID: NL_M_013
Sentence-ID: 29
Texto: We make the system of benefits and taxes much simpler so that no one gets caught up in the system.
Rótulos: ['Universalism: concern']



Map:   0%|          | 0/21392 [00:00<?, ? examples/s]


=== Exemplo de Texto Pré-processado () ===
Text-ID: BG_066
Sentence-ID: 6
Texto: In addition to the funds under the Multiannual Financial Framework 2021-2027, which will total €16.9 billion, the new EU anti-crisis plan - Next Generation EU, with a budget of more than €12.3 billion, is expected to be operational.
Rótulos: ['Stimulation', 'Power: resources']



Map:   0%|          | 0/7038 [00:00<?, ? examples/s]


=== Exemplo de Texto Pré-processado () ===
Text-ID: NL_010
Sentence-ID: 9
Texto: Yesterday, the Russian president called on citizens to leave Cherson in southern Ukraine because of the advancing counteroffensive against the occupied city.
Rótulos: ['Security: societal']



Map:   0%|          | 0/6878 [00:00<?, ? examples/s]

Datasets prontos.


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iniciando treinamento...


Step,Training Loss,Validation Loss,Subset Accuracy,F1 Macro,F1 Micro,Precision Macro,Recall Macro,Roc Auc
767,1.1484,0.175219,0.104149,0.08495,0.184079,0.2599,0.05985,0.785338
1534,0.6467,0.162809,0.20915,0.207785,0.328568,0.544757,0.168202,0.845028
2301,0.604,0.158313,0.233731,0.262478,0.351388,0.524329,0.204564,0.855941
3068,0.4872,0.165855,0.302217,0.318059,0.424958,0.474384,0.26488,0.852347
3835,0.4325,0.164233,0.296959,0.307905,0.41648,0.480849,0.249548,0.854894
4602,0.2084,0.212886,0.31202,0.338881,0.42531,0.419774,0.30424,0.840243
5369,0.2093,0.231258,0.313583,0.332404,0.423302,0.42638,0.284437,0.838986
6136,0.0995,0.261412,0.296675,0.327666,0.418186,0.395703,0.294139,0.827602
6903,0.1017,0.281949,0.28204,0.31887,0.399713,0.412451,0.273431,0.828201
7670,0.0569,0.299686,0.292981,0.328452,0.421661,0.405497,0.289804,0.823633


***** train metrics *****
  epoch                    =     6.3104
  total_flos               = 42845499GF
  train_loss               =     0.3616
  train_runtime            = 1:00:56.50
  train_samples            =      21392
  train_samples_per_second =    117.008
  train_steps_per_second   =      7.313

Avaliação final no dataset de validação...


***** eval metrics *****
  epoch                   =     6.3104
  eval_f1_macro           =     0.3389
  eval_f1_micro           =     0.4253
  eval_loss               =     0.2129
  eval_precision_macro    =     0.4198
  eval_recall_macro       =     0.3042
  eval_roc_auc            =     0.8402
  eval_runtime            = 0:01:04.36
  eval_samples            =       7038
  eval_samples_per_second =    109.353
  eval_steps_per_second   =     27.346
  eval_subset_accuracy    =      0.312

Avaliação no dataset de teste...


early stopping required metric_for_best_model, but did not find eval_f1_macro so early stopping is disabled


***** test metrics *****
  epoch                   =     6.3104
  test_f1_macro           =     0.3307
  test_f1_micro           =     0.4111
  test_loss               =     0.2197
  test_precision_macro    =     0.4192
  test_recall_macro       =      0.297
  test_roc_auc            =     0.8361
  test_runtime            = 0:01:05.39
  test_samples            =       6878
  test_samples_per_second =    105.184
  test_steps_per_second   =     26.304
  test_subset_accuracy    =     0.2953
Teste concluído! Métricas: {'test_loss': 0.21971718966960907, 'test_subset_accuracy': 0.29528932829310844, 'test_f1_macro': 0.33074604537971747, 'test_f1_micro': 0.4111382932796086, 'test_precision_macro': 0.41923298084516286, 'test_recall_macro': 0.2970088622354287, 'test_roc_auc': 0.836073630031073, 'test_runtime': 65.3905, 'test_samples_per_second': 105.184, 'test_steps_per_second': 26.304, 'epoch': 6.310396409872849, 'test_samples': 6878}


0,1
eval/f1_macro,▁▄▆▇▇███▇███
eval/f1_micro,▁▅▆█████▇███
eval/loss,▂▁▁▁▁▃▄▆▇▇█▃
eval/precision_macro,▁█▇▆▆▅▅▄▅▅▅▅
eval/recall_macro,▁▄▅▇▆█▇█▇█▇█
eval/roc_auc,▁▇███▆▆▅▅▅▅▆
eval/runtime,█▁▂▁▄▃▃▂▃▄▂▁
eval/samples_per_second,▁█▇█▅▆▆▇▅▅▇█
eval/steps_per_second,▁█▇█▅▆▆▇▅▅▇█
eval/subset_accuracy,▁▅▅█▇██▇▇▇▇█

0,1
eval/f1_macro,0.33888
eval/f1_micro,0.42531
eval/loss,0.21289
eval/precision_macro,0.41977
eval/recall_macro,0.30424
eval/roc_auc,0.84024
eval/runtime,64.3604
eval/samples_per_second,109.353
eval/steps_per_second,27.346
eval/subset_accuracy,0.31202


In [20]:
# --- Célula para Inspeção dos Dados de Teste ---
print('\n' + '='*35)
print('=== Inspeção de Dados para Avaliação de Teste ===')
test_ds = load_and_process_dataset(folder_teste, tokenizer)
random_idx = random.randint(0, len(test_ds) - 1)
example = test_ds[random_idx]
text_decoded = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
print(f"Exemplo Aleatório do Dataset de Teste:")
print(f"Text-ID: {example['input_ids'].numpy()[:10]}...")  # Mostra primeiros 10 tokens para brevidade
print(f"Texto Decodificado: {text_decoded}")
print(f"Máscara de Atenção: {example['attention_mask'].numpy()[:10]}...")  # Mostra primeiros 10 valores
print(f"Rótulos (Multi-hot): {example['labels'].numpy()}")
print(f"Rótulos Ativos: {[ID2LABEL[i] for i, label in enumerate(example['labels']) if label == 1] or 'Nenhum'}")
print('='*35 + '\n')


=== Inspeção de Dados para Avaliação de Teste ===

=== Exemplo de Texto Pré-processado () ===
Text-ID: IT_030
Sentence-ID: 25
Texto: We are not there yet, but the end is at hand."
Rótulos: ['Stimulation']



Map:   0%|          | 0/6878 [00:00<?, ? examples/s]

Exemplo Aleatório do Dataset de Teste:
Text-ID: [50281  3726   588 16270  6936  2905   281   253 48502   273]...
Texto Decodificado: They will acquire skills related to the professions of the present and the future and apply their experience in Bulgaria.
Máscara de Atenção: [1 1 1 1 1 1 1 1 1 1]...
Rótulos (Multi-hot): [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
Rótulos Ativos: ['Benevolence: caring']



In [23]:
# prompt: gere codigo para gerar uma table do f1 do modelo hub_model_id='DayCardoso/modern-bert-seq-class-values-no-context_cru', para cada valor, exemplo:
# All	Self-direction: thought	Self-direction: action	Stimulation	Hedonism	Achievement	Power: dominance	Power: resources	Face	Security: personal	Security: societal	Tradition	Conformity: rules	Conformity: interpersonal	Humility	Benevolence: caring	Benevolence: dependability	Universalism: concern	Universalism: nature	Universalism: tolerance
# 0.39	0.15	0.27	0.30	0.37	0.45	0.42	0.49	0.31	0.42	0.49	0.46	0.51	0.24	0.00	0.34	0.33	0.47	0.63	0.27

# Carregar as métricas do teste
test_metrics_path = os.path.join('modern-bert-seq-class-values-no-context_cru', 'test_results.json')

if os.path.exists(test_metrics_path):
    with open(test_metrics_path, 'r') as f:
        test_metrics = json.load(f)

    # Coletar as métricas F1 por classe (se existirem)
    # O Trainer padrão não salva F1 por classe diretamente no test_results.json.
    # Precisamos recalcular as métricas F1 por classe a partir das previsões e rótulos do conjunto de teste.

    # Primeiro, precisamos carregar o modelo e o dataset de teste novamente para obter as previsões.
    # Se o Trainer já foi executado e salvou o modelo e os resultados, podemos carregar o modelo salvo.
    # Caso contrário, precisaríamos reexecutar a parte da avaliação.
    # Assumindo que o modelo foi salvo no output_dir

    try:
        # Carregar o modelo salvo
        model_path = os.path.join('modern-bert-seq-class-values-no-context_cru', 'checkpoint-XXXX') # Substitua XXXX pelo nome da pasta do checkpoint, ou use o modelo salvo na raiz do output_dir
        # Uma forma mais robusta é carregar diretamente do output_dir, pois load_best_model_at_end salva o melhor modelo lá
        model_path = 'modern-bert-seq-class-values-no-context_cru' # Carrega o modelo final salvo pelo Trainer

        print(f"\nCarregando modelo para recalcular F1 por classe de: {model_path}")
        config = AutoConfig.from_pretrained(model_path)
        model = ModernBertForSequenceClassification.from_pretrained(model_path, config=config)
        model.eval()
        if torch.cuda.is_available():
            model.to('cuda')

        # Recarregar o dataset de teste se necessário (se a variável test_ds não estiver mais acessível)
        # test_ds = load_and_process_dataset(folder_teste, tokenizer) # Descomente se precisar recarregar

        print("Realizando inferência no dataset de teste para calcular F1 por classe...")
        all_logits = []
        all_labels = []

        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
        test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=4, collate_fn=data_collator)

        with torch.no_grad():
            for batch in tqdm(test_dataloader, desc="Inferência no Teste"):
                inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
                labels = batch['labels']
                outputs = model(**inputs)
                logits = outputs.logits
                all_logits.append(logits.cpu().numpy())
                all_labels.append(labels.cpu().numpy())

        all_logits = np.concatenate(all_logits, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)

        probs = sigmoid(all_logits)
        preds = (probs > 0.5).astype(int)

        # Calcular F1 para cada classe
        f1_per_class = f1_score(all_labels, preds, average=None, zero_division=0)

        # Adicionar a métrica F1 macro/micro do arquivo de resultados
        f1_macro_total = test_metrics.get('test_f1_macro', float('nan'))
        f1_micro_total = test_metrics.get('test_f1_micro', float('nan'))

        # Criar DataFrame para a tabela
        data = {'All': [f1_macro_total], 'All_Micro': [f1_micro_total]}
        for i, valor in enumerate(VALORES):
            data[valor] = [f1_per_class[i]]

        df_f1 = pd.DataFrame(data)

        # Reordenar colunas para colocar 'All' primeiro
        cols = ['All', 'All_Micro'] + VALORES
        df_f1 = df_f1[cols]

        print("\nTabela de F1-Score por Valor:")
        print(df_f1.to_string(index=False, float_format='%.2f'))

    except Exception as e:
        print(f"\nErro ao calcular F1 por classe: {e}")
        print("Certifique-se de que o treinamento foi concluído e o modelo foi salvo corretamente.")
        print(f"Verifique se o diretório de saída '{model_path}' contém o modelo salvo.")
        print("Tentando exibir métricas globais se o arquivo de resultados existir...")

        if os.path.exists(test_metrics_path):
             with open(test_metrics_path, 'r') as f:
                test_metrics = json.load(f)
             print("\nMétricas globais encontradas:")
             for key, value in test_metrics.items():
                 print(f"{key}: {value:.4f}")
        else:
            print(f"Arquivo de resultados '{test_metrics_path}' não encontrado.")


else:
    print(f"Arquivo de resultados do teste '{test_metrics_path}' não encontrado.")
    print("Execute o treinamento e a avaliação primeiro.")



Carregando modelo para recalcular F1 por classe de: modern-bert-seq-class-values-no-context_cru
Realizando inferência no dataset de teste para calcular F1 por classe...


Inferência no Teste:   0%|          | 0/1720 [00:00<?, ?it/s]




Tabela de F1-Score por Valor:
 All  All_Micro  Self-direction: thought  Self-direction: action  Stimulation  Hedonism  Achievement  Power: dominance  Power: resources  Face  Security: personal  Security: societal  Tradition  Conformity: rules  Conformity: interpersonal  Humility  Benevolence: caring  Benevolence: dependability  Universalism: concern  Universalism: nature  Universalism: tolerance
0.33       0.41                     0.15                    0.25         0.17      0.39         0.44              0.24              0.45  0.17                0.41                0.50       0.50               0.56                       0.07      0.05                 0.21                        0.31                   0.44                  0.68                     0.27
