In [8]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
import evaluate

In [9]:
# --- 1. CARREGAR DATASET DE TREINO (FRACO) E DE AVALIAÇÃO (GOLDEN SET) ---

# Carregar o grande dataset de treino que você acabou de gerar
path_train_ws = '/Users/giossaurus/Developer/leia_tcc/data/processed/weak_supervision_train_set.csv'
df_train = pd.read_csv(path_train_ws)

# Carregar nosso Golden Set para avaliação
path_eval_golden = '/Users/giossaurus/Developer/leia_tcc/data/processed/seed_set_anotado.csv'
df_eval = pd.read_csv(path_eval_golden)

print(f"Dataset de Treino (Supervisão Fraca): {len(df_train)} exemplos")
print(f"Dataset de Avaliação (Golden Set): {len(df_eval)} exemplos")

Dataset de Treino (Supervisão Fraca): 1484 exemplos
Dataset de Avaliação (Golden Set): 200 exemplos


In [10]:
# --- 2. PREPARAR OS DADOS PARA O MODELO ---

# Renomear colunas
df_train = df_train.rename(columns={'question': 'text', 'weak_label': 'label'})
df_eval = df_eval.rename(columns={'question': 'text', 'intent_choice': 'label'})

# Mapear rótulos para IDs (usando os rótulos do Golden Set como referência de consistência)
labels = df_eval['label'].unique().tolist()
id2label = {i: label for i, label in enumerate(labels)}
# Criar o reverso do id2label para mapear os IDs numéricos dos rótulos fracos de volta para texto
id2label_ws = {0: 'Conceitual', 1: 'Procedimental', 2: 'Análise de Exemplo', 3: 'Comparativo'}
label2id = {label: i for i, label in enumerate(labels)}

# Aplicar mapeamento
df_train['label'] = df_train['label'].map(id2label_ws) # Mapear de ID numérico para texto primeiro
df_train['label'] = df_train['label'].map(label2id) # Agora de texto para o ID correto e consistente
df_eval['label'] = df_eval['label'].map(label2id)

# Remover quaisquer linhas que possam ter ficado sem rótulo
df_train.dropna(subset=['label'], inplace=True)
df_eval.dropna(subset=['label'], inplace=True)
df_train['label'] = df_train['label'].astype(int)
df_eval['label'] = df_eval['label'].astype(int)

# Converter para Datasets do Hugging Face
from datasets import ClassLabel
train_dataset = Dataset.from_pandas(df_train)
eval_dataset = Dataset.from_pandas(df_eval)

# Configurar ClassLabel para ambos os datasets
features_train = train_dataset.features.copy()
features_train['label'] = ClassLabel(names=labels)
train_dataset = train_dataset.cast(features_train)

features_eval = eval_dataset.features.copy()
features_eval['label'] = ClassLabel(names=labels)
eval_dataset = eval_dataset.cast(features_eval)

Casting the dataset:   0%|          | 0/1484 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

In [11]:
# --- 3. PREPARAR O MODELO E TOKENIZADOR ---
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Map:   0%|          | 0/1484 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# --- 4. CONFIGURAR E EXECUTAR O TREINAMENTO ---
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    # Usar average="weighted" é crucial aqui por causa do desbalanceamento
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

training_args = TrainingArguments(
    output_dir="/Users/giossaurus/Developer/leia_tcc/models/leia_classifier_weak_supervision",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,  # Com mais de 5k dados, 3 épocas é um ótimo ponto de partida
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

print("\n--- INICIANDO TREINAMENTO COM SUPERVISÃO FRACA (DATASET GRANDE) ---")
trainer.train()
print("\n--- TREINAMENTO CONCLUÍDO ---")

trainer.save_model("/Users/giossaurus/Developer/leia_tcc/models/leia_classifier_weak_supervision_final")
print("Modelo de Supervisão Fraca salvo em '/Users/giossaurus/Developer/leia_tcc/models/leia_classifier_weak_supervision_final'")

  trainer = Trainer(



--- INICIANDO TREINAMENTO COM SUPERVISÃO FRACA (DATASET GRANDE) ---




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.47873,0.365,0.202322
2,No log,1.732278,0.255,0.217301
3,0.796200,2.151726,0.33,0.255963
4,0.796200,2.235281,0.305,0.235569
5,0.796200,2.325731,0.28,0.223832





--- TREINAMENTO CONCLUÍDO ---
Modelo de Supervisão Fraca salvo em '/Users/giossaurus/Developer/leia_tcc/models/leia_classifier_weak_supervision_final'
