In [1]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
import evaluate

In [2]:
# --- 1. CARREGAR TODOS OS DATASETS ANOTADOS ATÉ AGORA ---

path_seed_set = '/Users/giossaurus/Developer/leia_tcc/data/processed/seed_set_anotado.csv'
path_cycle1_annotations = '/Users/giossaurus/Developer/leia_tcc/data/processed/active_learning_cycle1.csv'
path_cycle2_annotations = '/Users/giossaurus/Developer/leia_tcc/data/processed/active_learning_cycle2.csv' # <-- NOVO ARQUIVO
path_final_test_set = '/Users/giossaurus/Developer/leia_tcc/data/processed/final_test_set.csv'

df_seed_anotado = pd.read_csv(path_seed_set)
df_cycle1_anotado = pd.read_csv(path_cycle1_annotations)
df_cycle2_anotado = pd.read_csv(path_cycle2_annotations) # <-- NOVO ARQUIVO
df_final_test = pd.read_csv(path_final_test_set)

print(f"Carregados {len(df_seed_anotado)} exemplos do Seed Set.")
print(f"Carregados {len(df_cycle1_anotado)} exemplos do Ciclo 1.")
print(f"Carregados {len(df_cycle2_anotado)} novos exemplos do Ciclo 2.") # <-- NOVO

Carregados 200 exemplos do Seed Set.
Carregados 50 exemplos do Ciclo 1.
Carregados 50 novos exemplos do Ciclo 2.


In [3]:
# --- 2. ENRIQUECER O CONJUNTO DE TREINO ---

# Definir as colunas essenciais para padronização
colunas_essenciais = ['question_id', 'question', 'intent_choice', 'disciplina']

# Padronizar todos os dataframes
df_seed_padronizado = df_seed_anotado[colunas_essenciais]
df_cycle1_padronizado = df_cycle1_anotado[colunas_essenciais]
df_cycle2_padronizado = df_cycle2_anotado[colunas_essenciais] # <-- NOVO

# Isolar a base de treino original (sem o test set)
df_seed_train_base = df_seed_padronizado[~df_seed_padronizado['question_id'].isin(df_final_test['question_id'])]

# Combinar a base de treino com TODAS as novas anotações
df_new_train = pd.concat([
    df_seed_train_base, 
    df_cycle1_padronizado, 
    df_cycle2_padronizado # <-- NOVO
], ignore_index=True)

# Renomear colunas
df_new_train = df_new_train.rename(columns={'question': 'text', 'intent_choice': 'label'})

# Mapear rótulos para IDs (usando a lista completa de rótulos do seed set)
labels = df_seed_anotado['intent_choice'].unique().tolist()
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

df_new_train['label'] = df_new_train['label'].map(label2id)
df_new_train.dropna(subset=['label'], inplace=True)
df_new_train['label'] = df_new_train['label'].astype(int)

# Converter para Dataset do Hugging Face
from datasets import ClassLabel
new_train_dataset_full = Dataset.from_pandas(df_new_train)

# Converter a coluna label para ClassLabel para permitir estratificação
features = new_train_dataset_full.features.copy()
features['label'] = ClassLabel(names=labels)
new_train_dataset_full = new_train_dataset_full.cast(features)

# Dividir em treino e validação
train_val_split = new_train_dataset_full.train_test_split(test_size=0.2, stratify_by_column="label", seed=42)
train_dataset = train_val_split["train"]
eval_dataset = train_val_split["test"]

print(f"\n--- NOVO DATASET DE TREINO CRIADO (CICLO 3) ---")
print(f"Tamanho total para treino/validação: {len(new_train_dataset_full)} exemplos")
print(f"-> Para este Ciclo 3: Treino = {len(train_dataset)}, Validação = {len(eval_dataset)}")

Casting the dataset:   0%|          | 0/260 [00:00<?, ? examples/s]


--- NOVO DATASET DE TREINO CRIADO (CICLO 3) ---
Tamanho total para treino/validação: 260 exemplos
-> Para este Ciclo 3: Treino = 208, Validação = 52


In [4]:
# --- 3. PREPARAR O MODELO E TOKENIZADOR ---
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, 
    num_labels=len(labels), 
    id2label=id2label, 
    label2id=label2id
)

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# --- 4. CONFIGURAR E EXECUTAR O TREINAMENTO (CICLO 3) ---
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

training_args = TrainingArguments(
    output_dir="/Users/giossaurus/Developer/leia_tcc/models/leia_classifier_cycle3", # Novo diretório de output
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15, # Manter 15 épocas
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

print("\n--- INICIANDO TREINAMENTO DO MODELO v3 ---")
trainer.train()
print("\n--- TREINAMENTO CONCLUÍDO ---")

trainer.save_model("/Users/giossaurus/Developer/leia_tcc/models/leia_classifier_v3_final")
print("Modelo v3 salvo em '/Users/giossaurus/Developer/leia_tcc/models/leia_classifier_v3_final'")

  trainer = Trainer(



--- INICIANDO TREINAMENTO DO MODELO v3 ---




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.287632,0.384615,0.213675
2,No log,1.183468,0.538462,0.467313
3,No log,1.051529,0.576923,0.504886
4,No log,0.966702,0.596154,0.546154
5,No log,0.902479,0.673077,0.622894
6,No log,0.794456,0.653846,0.585953
7,No log,0.845255,0.653846,0.604438
8,No log,0.827857,0.653846,0.630446
9,No log,0.816359,0.653846,0.628377
10,No log,0.869613,0.634615,0.599934





--- TREINAMENTO CONCLUÍDO ---
Modelo v3 salvo em '/Users/giossaurus/Developer/leia_tcc/models/leia_classifier_v3_final'
