In [12]:
import os

import evaluate
import numpy as np
import pandas as pd
from datasets import ClassLabel, Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [13]:
# --- 1. LOAD ALL DATASETS ---

# Directory holding the processed CSV files. It defaults to the original
# location; set the LEIA_DATA_DIR environment variable to run this notebook on
# another machine without editing the cell.
DATA_DIR = os.environ.get(
    'LEIA_DATA_DIR',
    '/Users/giossaurus/Developer/leia_tcc/data/processed',
)

path_seed_set = os.path.join(DATA_DIR, 'seed_set_anotado.csv')
path_cycle1_annotations = os.path.join(DATA_DIR, 'active_learning_cycle1.csv')
path_final_test_set = os.path.join(DATA_DIR, 'final_test_set.csv')

# Load the dataframes: annotated seed set, cycle-1 annotations and the
# held-out final test set.
df_seed_anotado = pd.read_csv(path_seed_set)
df_cycle1_anotado = pd.read_csv(path_cycle1_annotations)
df_final_test = pd.read_csv(path_final_test_set)

print(f"Carregados {len(df_seed_anotado)} exemplos do Seed Set original.")
print(f"Carregados {len(df_cycle1_anotado)} novos exemplos do Ciclo 1.")

Carregados 200 exemplos do Seed Set original.
Carregados 50 novos exemplos do Ciclo 1.


In [14]:
# --- 2. ENRICH THE TRAINING SET (WITH FIX) ---

# Essential columns kept from every annotated dataframe
colunas_essenciais = ['question_id', 'question', 'intent_choice', 'disciplina']

# Standardise both dataframes so they carry only the essential columns
df_seed_padronizado = df_seed_anotado[colunas_essenciais]
df_cycle1_padronizado = df_cycle1_anotado[colunas_essenciais]

# Drop every question that belongs to the final test set. The filter is applied
# to the cycle-1 annotations as well as to the seed set, so a held-out question
# cannot leak into training through the newly annotated batch.
test_question_ids = df_final_test['question_id']
df_seed_train_base = df_seed_padronizado[~df_seed_padronizado['question_id'].isin(test_question_ids)]
df_cycle1_train_base = df_cycle1_padronizado[~df_cycle1_padronizado['question_id'].isin(test_question_ids)]

# Combine the training base with the new cycle-1 annotations
df_new_train = pd.concat([df_seed_train_base, df_cycle1_train_base], ignore_index=True)

# Rename columns to the standard names and discard rows that have no label.
# A new frame is produced (no inplace mutation) so re-running the cell is safe.
df_new_train = (
    df_new_train
    .rename(columns={'question': 'text', 'intent_choice': 'label'})
    .dropna(subset=['label'])
    .reset_index(drop=True)
)

# Map the text labels to numeric IDs.
# The vocabulary keeps the seed set's order of first appearance (so existing
# IDs are unchanged) and appends any label that only occurs in cycle 1, which
# would otherwise map to NaN and be dropped silently. Missing values are
# excluded so NaN can never become a class name.
labels = (
    pd.concat([df_seed_anotado['intent_choice'], df_cycle1_anotado['intent_choice']])
    .dropna()
    .unique()
    .tolist()
)
id2label = dict(enumerate(labels))
label2id = {label: i for i, label in enumerate(labels)}

# Every remaining label is in the vocabulary, so the int cast cannot hit NaN
df_new_train['label'] = df_new_train['label'].map(label2id).astype(int)

# Convert to a Hugging Face Dataset
new_train_dataset_full = Dataset.from_pandas(df_new_train)

# Cast the label column to ClassLabel so the split below can be stratified
features = new_train_dataset_full.features.copy()
features['label'] = ClassLabel(names=labels)
new_train_dataset_full = new_train_dataset_full.cast(features)

# Split the new training set into train and validation for this cycle
train_val_split = new_train_dataset_full.train_test_split(test_size=0.2, stratify_by_column="label", seed=42)
train_dataset = train_val_split["train"]
eval_dataset = train_val_split["test"]

print(f"\n--- NOVO DATASET DE TREINO CRIADO (CICLO 2) ---")
print(f"Tamanho total para treino/validação: {len(new_train_dataset_full)} exemplos")
print(f"-> Para este Ciclo 2: Treino = {len(train_dataset)}, Validação = {len(eval_dataset)}")

Casting the dataset:   0%|          | 0/210 [00:00<?, ? examples/s]


--- NOVO DATASET DE TREINO CRIADO (CICLO 2) ---
Tamanho total para treino/validação: 210 exemplos
-> Para este Ciclo 2: Treino = 168, Validação = 42


In [15]:
# --- 3. PREPARE THE MODEL AND TOKENIZER ---
# Start from the same checkpoint as before for a fair comparison
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch, truncating to at most 512 tokens.

    No padding is applied here: padding every example to 512 tokens makes each
    batch far larger than these short questions need. Batches are instead
    padded to their longest member by the data collator that Trainer builds
    from the tokenizer it receives.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Fresh classification head sized to the label vocabulary; the id/label maps
# are passed so they are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Map:   0%|          | 0/168 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# --- 4. CONFIGURE AND RUN TRAINING (CYCLE 2) ---
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    `eval_pred` unpacks into (model outputs, reference label ids). The
    predicted class of each row is the argmax of the outputs along axis 1.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    accuracy_result = accuracy_metric.compute(
        predictions=predicted_ids, references=references
    )
    f1_result = f1_metric.compute(
        predictions=predicted_ids, references=references, average="weighted"
    )
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

# Root directory for checkpoints and the exported model. It defaults to the
# original location; set the LEIA_MODELS_DIR environment variable to relocate
# the outputs without editing this cell.
MODELS_DIR = os.environ.get(
    'LEIA_MODELS_DIR',
    '/Users/giossaurus/Developer/leia_tcc/models',
)
checkpoint_dir = os.path.join(MODELS_DIR, 'leia_classifier_cycle2')
final_model_dir = os.path.join(MODELS_DIR, 'leia_classifier_v2_final')

# Evaluate and checkpoint once per epoch so the checkpoint with the best
# validation F1 can be restored when training ends.
# NOTE(review): one checkpoint is written per epoch (15 in total); consider
# `save_total_limit` if disk usage becomes a concern.
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer,
# which emitted a FutureWarning in the recorded output of this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

print("\n--- INICIANDO TREINAMENTO DO MODELO v2 ---")
trainer.train()
print("\n--- TREINAMENTO CONCLUÍDO ---")

# Save model v2; the path is defined once so the log line cannot drift from
# the location actually written.
trainer.save_model(final_model_dir)
print(f"Modelo v2 salvo em '{final_model_dir}'")

  trainer = Trainer(



--- INICIANDO TREINAMENTO DO MODELO v2 ---




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.037859,0.595238,0.515344
2,No log,0.910825,0.642857,0.568783
3,No log,0.867817,0.690476,0.638207
4,No log,0.902062,0.642857,0.592511
5,No log,0.842641,0.642857,0.590461
6,No log,0.956977,0.595238,0.561429
7,No log,0.842382,0.642857,0.596187
8,No log,0.904781,0.619048,0.561258
9,No log,0.900882,0.642857,0.58961
10,No log,0.969778,0.642857,0.622107





--- TREINAMENTO CONCLUÍDO ---
Modelo v2 salvo em '/Users/giossaurus/Developer/leia_tcc/models/leia_classifier_v2_final'
