# 🎯 Otimização: Threshold Tuning + Gamma Search

**Objetivo:** Melhorar o F1 Macro do BERTimbau-Medical (baseline: 0.787) com duas técnicas:
- **Parte A:** Threshold tuning por classe (sem retreinar — usa o modelo já treinado)
- **Parte B:** Retreinar com diferentes valores de gamma no Focal Loss (1.0, 2.0, 3.0)

**Modelo base:** BERTimbau-Medical (MLM) + Focal Loss, salvo no Google Drive


## Etapa 1 — Setup

In [1]:
# ===== ETAPA 1: SETUP =====
!pip install -q transformers datasets accelerate scikit-learn

import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'✅ GPU: {gpu_name} ({gpu_mem:.1f} GB)')
print(f'Device: {device}')

✅ GPU: NVIDIA L4 (23.7 GB)
Device: cuda


## Etapa 2 — Carregar dados e modelo

In [4]:
# ===== ETAPA 2: DRIVE + DADOS =====
from google.colab import drive
drive.mount('/content/drive')

# Caminhos
MODEL_PATH = '/content/drive/MyDrive/spr_2026/models/bertimbau-medical'
CHECKPOINT_DIR = '/content/drive/MyDrive/spr_2026/checkpoints/finetune-focal'
DATA_DIR = '/content'

# Dataset — upload manual do zip se necessário
!unzip -o /content/spr-2026-mammography-report-classification.zip -d /content

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
train_df['label'] = train_df['target']

NUM_LABELS = train_df['label'].nunique()
print(f'Train: {train_df.shape}, Test: {test_df.shape}')
print(f'Classes: {NUM_LABELS}')
print(f'\nDistribuição:')
print(train_df['label'].value_counts().sort_index())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archive:  /content/spr-2026-mammography-report-classification.zip
  inflating: /content/submission.csv  
  inflating: /content/test.csv       
  inflating: /content/train.csv      
Train: (18272, 4), Test: (4, 2)
Classes: 7

Distribuição:
label
0      610
1      693
2    15968
3      713
4      214
5       29
6       45
Name: count, dtype: int64


## Etapa 3 — Tokenizar

In [5]:
# ===== ETAPA 3: TOKENIZAR =====
from transformers import AutoTokenizer
from datasets import Dataset, ClassLabel

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
MAX_LEN = 256

def tokenize(examples):
    return tokenizer(examples['text'], truncation=True, max_length=MAX_LEN, padding='max_length')

# Preparar datasets
train_data = Dataset.from_dict({
    'text': train_df['report'].tolist(),
    'label': train_df['label'].tolist()
})
test_data = Dataset.from_dict({
    'text': test_df['report'].tolist(),
})

train_tokenized = train_data.map(tokenize, batched=True, remove_columns=['text'])
test_tokenized = test_data.map(tokenize, batched=True, remove_columns=['text'])

train_tokenized.set_format('torch')
test_tokenized.set_format('torch')

# Split stratified
train_tokenized = train_tokenized.cast_column('label', ClassLabel(num_classes=NUM_LABELS))
split = train_tokenized.train_test_split(test_size=0.1, seed=42, stratify_by_column='label')
train_split = split['train']
val_split = split['test']

print(f'✅ Train: {len(train_split)}, Val: {len(val_split)}, Test: {len(test_tokenized)}')

Map:   0%|          | 0/18272 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18272 [00:00<?, ? examples/s]

✅ Train: 16444, Val: 1828, Test: 4


## Etapa 4 — Definir Focal Loss e Trainer

In [6]:
# ===== ETAPA 4: FOCAL LOSS + TRAINER =====
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, num_classes=7):
        super().__init__()
        self.gamma = gamma
        self.alpha = torch.tensor(alpha, dtype=torch.float32) if alpha is not None else None
        self.num_classes = num_classes

    def forward(self, logits, targets):
        probs = F.softmax(logits, dim=-1)
        targets_one_hot = F.one_hot(targets, num_classes=self.num_classes).float()
        pt = (probs * targets_one_hot).sum(dim=-1)
        focal_weight = (1 - pt) ** self.gamma
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        loss = focal_weight * ce_loss
        if self.alpha is not None:
            alpha_weight = self.alpha.to(logits.device)[targets]
            loss = alpha_weight * loss
        return loss.mean()

class FocalLossTrainer(Trainer):
    def __init__(self, focal_loss_fn, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal_loss_fn = focal_loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.focal_loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Métricas
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    f1_macro = f1_score(labels, preds, average='macro')
    f1_per_class = f1_score(labels, preds, average=None)
    metrics = {'f1_macro': f1_macro}
    for i, f1 in enumerate(f1_per_class):
        metrics[f'f1_class_{i}'] = f1
    return metrics

# Pesos das classes
class_counts = train_df['label'].value_counts().sort_index().values
class_weights = 1.0 / class_counts
class_weights = class_weights / class_weights.sum() * len(class_weights)

print(f'Pesos das classes: {dict(zip(range(len(class_weights)), class_weights.round(3)))}')
print('✅ Focal Loss e Trainer definidos')

Pesos das classes: {0: np.float64(0.174), 1: np.float64(0.153), 2: np.float64(0.007), 3: np.float64(0.149), 4: np.float64(0.496), 5: np.float64(3.661), 6: np.float64(2.36)}
✅ Focal Loss e Trainer definidos


---
# 🅰️ PARTE A — Threshold Tuning por Classe

**Ideia:** Em vez de usar `argmax` direto nas probabilidades, encontrar o limiar ótimo para cada classe que maximiza o F1 individual. Isso não requer retreinar — usa o modelo que já temos.


## A1 — Carregar modelo treinado e gerar probabilidades

In [7]:
# ===== A1: CARREGAR MELHOR MODELO =====
# Procura o melhor checkpoint salvo no Drive
import glob

checkpoints = sorted(glob.glob(f'{CHECKPOINT_DIR}/checkpoint-*'))
if checkpoints:
    best_ckpt = checkpoints[-1]  # último salvo (pode ser o melhor se load_best_model_at_end=True)
    print(f'Checkpoint encontrado: {best_ckpt}')
else:
    best_ckpt = CHECKPOINT_DIR
    print(f'Usando diretório principal: {best_ckpt}')

# Carregar modelo
model = AutoModelForSequenceClassification.from_pretrained(
    best_ckpt,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
)
model.to(device)
model.eval()
print(f'✅ Modelo carregado: {sum(p.numel() for p in model.parameters()):,} params')

Checkpoint encontrado: /content/drive/MyDrive/spr_2026/checkpoints/finetune-focal/checkpoint-5140


Loading weights:   0%|          | 0/393 [00:00<?, ?it/s]

✅ Modelo carregado: 334,403,591 params


In [10]:
# ===== A2: GERAR PROBABILIDADES NO VALIDATION SET =====
from torch.utils.data import DataLoader

val_dataloader = DataLoader(val_split, batch_size=32, shuffle=False)

all_logits = []
all_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        all_logits.append(outputs.logits.cpu())
        all_labels.append(labels)

all_logits = torch.cat(all_logits)
all_labels = torch.cat(all_labels).numpy()
all_probs = F.softmax(all_logits, dim=-1).numpy()

print(f'✅ Probabilidades geradas: {all_probs.shape}')
print(f'Labels: {all_labels.shape}')

preds_argmax = np.argmax(all_probs, axis=-1)
f1_baseline = f1_score(all_labels, preds_argmax, average='macro')
print(f'\n📊 F1 Macro (argmax baseline): {f1_baseline:.5f}')

✅ Probabilidades geradas: (1828, 7)
Labels: (1828,)

📊 F1 Macro (argmax baseline): 0.77986


## A2 — Encontrar thresholds ótimos por classe

In [11]:
# ===== A3: THRESHOLD TUNING =====
from sklearn.metrics import precision_recall_curve

def find_best_thresholds(probs, labels, num_classes):
    """
    Para cada classe, encontra o threshold que maximiza o F1.
    Usa abordagem one-vs-rest.
    """
    best_thresholds = []

    for c in range(num_classes):
        binary_labels = (labels == c).astype(int)
        class_probs = probs[:, c]

        best_f1 = 0
        best_thresh = 0.5

        # Testar thresholds de 0.05 a 0.95
        for thresh in np.arange(0.05, 0.96, 0.01):
            preds = (class_probs >= thresh).astype(int)
            if preds.sum() == 0:
                continue
            f1 = f1_score(binary_labels, preds, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh

        best_thresholds.append(best_thresh)
        count = (labels == c).sum()
        print(f'Classe {c}: threshold={best_thresh:.2f}, F1={best_f1:.4f} (n={count})')

    return best_thresholds

print('🔍 Buscando thresholds ótimos por classe...\n')
best_thresholds = find_best_thresholds(all_probs, all_labels, NUM_LABELS)
print(f'\nThresholds: {[f"{t:.2f}" for t in best_thresholds]}')

🔍 Buscando thresholds ótimos por classe...

Classe 0: threshold=0.38, F1=0.8261 (n=61)
Classe 1: threshold=0.77, F1=0.9362 (n=69)
Classe 2: threshold=0.12, F1=0.9810 (n=1598)
Classe 3: threshold=0.61, F1=0.6067 (n=71)
Classe 4: threshold=0.73, F1=0.7317 (n=21)
Classe 5: threshold=0.42, F1=1.0000 (n=3)
Classe 6: threshold=0.43, F1=0.9091 (n=5)

Thresholds: ['0.38', '0.77', '0.12', '0.61', '0.73', '0.42', '0.43']


In [12]:
# ===== A4: APLICAR THRESHOLDS E COMPARAR =====
def predict_with_thresholds(probs, thresholds):
    """
    Prediz usando thresholds por classe.
    Se nenhuma classe passa o threshold, usa argmax.
    Se múltiplas passam, usa a de maior probabilidade entre as que passaram.
    """
    preds = []
    for i in range(len(probs)):
        # Checar quais classes passam o threshold
        passed = []
        for c in range(len(thresholds)):
            if probs[i, c] >= thresholds[c]:
                passed.append(c)

        if len(passed) == 0:
            # Nenhuma passou -> usa argmax
            preds.append(np.argmax(probs[i]))
        elif len(passed) == 1:
            preds.append(passed[0])
        else:
            # Múltiplas passaram -> maior probabilidade entre elas
            best = max(passed, key=lambda c: probs[i, c])
            preds.append(best)

    return np.array(preds)

# Predições com thresholds otimizados
preds_threshold = predict_with_thresholds(all_probs, best_thresholds)
f1_threshold = f1_score(all_labels, preds_threshold, average='macro')

print('='*60)
print('📊 COMPARAÇÃO: Argmax vs Threshold Tuning')
print('='*60)
print(f'F1 Macro (argmax):     {f1_baseline:.5f}')
print(f'F1 Macro (threshold):  {f1_threshold:.5f}')
diff = f1_threshold - f1_baseline
sinal = '+' if diff > 0 else ''
print(f'Diferença:             {sinal}{diff:.5f}')
print()

# Detalhamento por classe
print('F1 por classe:')
f1_argmax_per = f1_score(all_labels, preds_argmax, average=None)
f1_thresh_per = f1_score(all_labels, preds_threshold, average=None)
print(f'{"Classe":<10} {"Argmax":<12} {"Threshold":<12} {"Diff":<10}')
for c in range(NUM_LABELS):
    d = f1_thresh_per[c] - f1_argmax_per[c]
    s = '+' if d > 0 else ''
    print(f'{c:<10} {f1_argmax_per[c]:<12.4f} {f1_thresh_per[c]:<12.4f} {s}{d:.4f}')

📊 COMPARAÇÃO: Argmax vs Threshold Tuning
F1 Macro (argmax):     0.77986
F1 Macro (threshold):  0.84896
Diferença:             +0.06911

F1 por classe:
Classe     Argmax       Threshold    Diff      
0          0.8120       0.8382       +0.0262
1          0.8903       0.9362       +0.0458
2          0.9614       0.9738       +0.0125
3          0.5438       0.6188       +0.0750
4          0.6182       0.6667       +0.0485
5          0.8000       1.0000       +0.2000
6          0.8333       0.9091       +0.0758


---
# 🅱️ PARTE B — Grid Search de Gamma no Focal Loss

**Ideia:** Retreinar o modelo com diferentes valores de gamma (1.0, 2.0, 3.0).
- gamma baixo (1.0): menos foco nos exemplos difíceis
- gamma alto (3.0): mais foco nos exemplos difíceis/raros

O modelo é carregado do zero (BERTimbau-Medical) a cada teste.


In [13]:
# ===== B1: GRID SEARCH DE GAMMA =====
gammas_to_test = [1.0, 2.0, 3.0]
results = {}

for gamma in gammas_to_test:
    print(f'\n{"="*60}')
    print(f'🔄 Treinando com gamma={gamma}')
    print(f'{"="*60}')

    # Carregar modelo do zero (BERTimbau-Medical)
    model_g = AutoModelForSequenceClassification.from_pretrained(
        MODEL_PATH,
        num_labels=NUM_LABELS,
        ignore_mismatched_sizes=True,
    )
    model_g.to(device)

    # Focal Loss com gamma atual
    focal = FocalLoss(gamma=gamma, alpha=class_weights.tolist(), num_classes=NUM_LABELS)

    # Training args
    output_dir = f'/content/drive/MyDrive/spr_2026/checkpoints/gamma_{gamma}'
    args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        logging_steps=100,
        eval_strategy='epoch',
        save_strategy='epoch',
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        fp16=True,
        dataloader_num_workers=2,
        report_to='none',
        seed=42,
    )

    trainer_g = FocalLossTrainer(
        focal_loss_fn=focal,
        model=model_g,
        args=args,
        train_dataset=train_split,
        eval_dataset=val_split,
        compute_metrics=compute_metrics,
    )

    # Treinar
    train_result = trainer_g.train()

    # Avaliar
    eval_result = trainer_g.evaluate()
    f1 = eval_result['eval_f1_macro']

    results[gamma] = {
        'f1_macro': f1,
        'train_loss': train_result.training_loss,
        'per_class': {i: eval_result.get(f'eval_f1_class_{i}', 0) for i in range(NUM_LABELS)},
        'trainer': trainer_g,
    }

    print(f'\n✅ gamma={gamma}: F1 Macro = {f1:.5f}')

    # Liberar memória
    if gamma != gammas_to_test[-1]:
        del model_g
        torch.cuda.empty_cache()

print('\n\n' + '='*60)
print('📊 RESULTADOS DO GRID SEARCH')
print('='*60)


🔄 Treinando com gamma=1.0


Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: /content/drive/MyDrive/spr_2026/models/bertimbau-medical
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
bert.pooler.dense.weight                   | MISSING    | 
bert.pooler.dense.bias                     | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing fro

Epoch,Training Loss,Validation Loss,F1 Macro,F1 Class 0,F1 Class 1,F1 Class 2,F1 Class 3,F1 Class 4,F1 Class 5,F1 Class 6
1,0.02525,0.027522,0.542756,0.670391,0.873418,0.925403,0.319328,0.344086,0.0,0.666667
2,0.020813,0.01477,0.693307,0.783217,0.873418,0.947368,0.442478,0.506667,0.5,0.8
3,0.009291,0.01147,0.733226,0.780488,0.907895,0.95459,0.461538,0.561404,0.666667,0.8
4,0.006532,0.012159,0.764665,0.84375,0.901961,0.966118,0.566038,0.680851,0.666667,0.727273
5,0.001775,0.0135,0.790603,0.861538,0.907895,0.966473,0.561905,0.680851,0.666667,0.888889


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La


✅ gamma=1.0: F1 Macro = 0.79060

🔄 Treinando com gamma=2.0


Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: /content/drive/MyDrive/spr_2026/models/bertimbau-medical
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
bert.pooler.dense.weight                   | MISSING    | 
bert.pooler.dense.bias                     | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing fro

Epoch,Training Loss,Validation Loss,F1 Macro,F1 Class 0,F1 Class 1,F1 Class 2,F1 Class 3,F1 Class 4,F1 Class 5,F1 Class 6
1,0.021056,0.027615,0.541973,0.728395,0.802326,0.929455,0.34632,0.44186,0.0,0.545455
2,0.020072,0.014494,0.622443,0.614525,0.877419,0.925042,0.407692,0.465753,0.4,0.666667
3,0.011321,0.012205,0.707891,0.769231,0.901961,0.956266,0.512397,0.615385,0.4,0.8
4,0.00934,0.013334,0.660037,0.823529,0.867925,0.964447,0.564356,0.6,0.0,0.8
5,0.001401,0.01485,0.738146,0.827068,0.890323,0.967118,0.568627,0.625,0.4,0.888889


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La


✅ gamma=2.0: F1 Macro = 0.73815

🔄 Treinando com gamma=3.0


Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: /content/drive/MyDrive/spr_2026/models/bertimbau-medical
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
bert.pooler.dense.weight                   | MISSING    | 
bert.pooler.dense.bias                     | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing fro

Epoch,Training Loss,Validation Loss,F1 Macro,F1 Class 0,F1 Class 1,F1 Class 2,F1 Class 3,F1 Class 4,F1 Class 5,F1 Class 6
1,0.018159,0.02301,0.519343,0.662921,0.846626,0.92178,0.342629,0.361446,0.0,0.5
2,0.01837,0.011496,0.561943,0.622754,0.871795,0.91954,0.380623,0.472222,0.0,0.666667
3,0.00502,0.009471,0.724588,0.75,0.8625,0.953876,0.462151,0.576923,0.666667,0.8
4,0.003739,0.008922,0.62197,0.81203,0.8625,0.96039,0.518868,0.472727,0.0,0.727273
5,0.000645,0.010485,0.705575,0.861538,0.878981,0.965428,0.550725,0.54902,0.333333,0.8


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La


✅ gamma=3.0: F1 Macro = 0.72377


📊 RESULTADOS DO GRID SEARCH


## B2 — Comparar resultados

In [14]:
# ===== B2: COMPARAR GAMMAS =====
print(f'{"Gamma":<10} {"F1 Macro":<12} {"Loss":<12}')
print('-'*34)
best_gamma = None
best_f1 = 0

for gamma, res in sorted(results.items()):
    marker = ''
    if res['f1_macro'] > best_f1:
        best_f1 = res['f1_macro']
        best_gamma = gamma
    print(f'{gamma:<10} {res["f1_macro"]:<12.5f} {res["train_loss"]:<12.4f}')

print(f'\n🏆 Melhor gamma: {best_gamma} (F1 Macro: {best_f1:.5f})')

# Detalhamento por classe do melhor
print(f'\nF1 por classe (gamma={best_gamma}):')
for c in range(NUM_LABELS):
    f1_c = results[best_gamma]['per_class'][c]
    print(f'  Classe {c}: {f1_c:.4f}')

Gamma      F1 Macro     Loss        
----------------------------------
1.0        0.79060      0.0182      
2.0        0.73815      0.0148      
3.0        0.72377      0.0119      

🏆 Melhor gamma: 1.0 (F1 Macro: 0.79060)

F1 por classe (gamma=1.0):
  Classe 0: 0.8615
  Classe 1: 0.9079
  Classe 2: 0.9665
  Classe 3: 0.5619
  Classe 4: 0.6809
  Classe 5: 0.6667
  Classe 6: 0.8889


---
# 🏆 Parte C — Combinar: Melhor Gamma + Threshold Tuning

Aplica threshold tuning no modelo com o melhor gamma.


In [15]:
# ===== C1: THRESHOLD NO MELHOR GAMMA =====
best_trainer = results[best_gamma]['trainer']

# Gerar probabilidades do melhor modelo
val_preds = best_trainer.predict(val_split)
best_probs = F.softmax(torch.tensor(val_preds.predictions), dim=-1).numpy()
best_labels = val_preds.label_ids

# Threshold tuning
print(f'🔍 Threshold tuning no melhor modelo (gamma={best_gamma})...\n')
best_thresholds_v2 = find_best_thresholds(best_probs, best_labels, NUM_LABELS)

# Aplicar
preds_combined = predict_with_thresholds(best_probs, best_thresholds_v2)
f1_combined = f1_score(best_labels, preds_combined, average='macro')

print(f'\n{"="*60}')
print(f'📊 RESULTADO FINAL COMBINADO')
print(f'{"="*60}')
print(f'Baseline original (gamma=2.0, argmax):  0.78665')
print(f'Melhor gamma ({best_gamma}) + argmax:          {results[best_gamma]["f1_macro"]:.5f}')
print(f'Melhor gamma ({best_gamma}) + threshold:       {f1_combined:.5f}')
print(f'Score anterior no Kaggle:               0.79696')
print(f'Meta (top leaderboard):                 ~0.84')

🔍 Threshold tuning no melhor modelo (gamma=1.0)...

Classe 0: threshold=0.59, F1=0.8730 (n=61)
Classe 1: threshold=0.95, F1=0.9412 (n=69)
Classe 2: threshold=0.10, F1=0.9822 (n=1598)
Classe 3: threshold=0.60, F1=0.6105 (n=71)
Classe 4: threshold=0.55, F1=0.7111 (n=21)
Classe 5: threshold=0.15, F1=0.8571 (n=3)
Classe 6: threshold=0.31, F1=0.8889 (n=5)

📊 RESULTADO FINAL COMBINADO
Baseline original (gamma=2.0, argmax):  0.78665
Melhor gamma (1.0) + argmax:          0.79060
Melhor gamma (1.0) + threshold:       0.80193
Score anterior no Kaggle:               0.79696
Meta (top leaderboard):                 ~0.84


## Etapa Final — Gerar submissão com o melhor modelo

In [17]:
# ===== SUBMISSION =====
# Gerar predições no test set com o melhor modelo + thresholds
test_preds = best_trainer.predict(test_tokenized)
test_probs = F.softmax(torch.tensor(test_preds.predictions), dim=-1).numpy()

# Aplicar thresholds otimizados
test_final = predict_with_thresholds(test_probs, best_thresholds_v2)

# Criar submission
submission = pd.read_csv(f'{DATA_DIR}/submission.csv')
print(f'Template: {submission.shape}')
print(f'Colunas: {submission.columns.tolist()}')
print(f'Predições: {len(test_final)}')
print(f'\nTemplate:')
print(submission.head())

# Preencher só as primeiras linhas que temos predição
# O test público tem menos exemplos que o submission final
submission.loc[:len(test_final)-1, 'label'] = test_final

# Salvar
SUBMISSION_PATH = f'/content/submission_gamma{best_gamma}_threshold.csv'
submission.to_csv(SUBMISSION_PATH, index=False)

DRIVE_SUB = f'/content/drive/MyDrive/spr_2026/submissions/submission_gamma{best_gamma}_threshold.csv'
os.makedirs(os.path.dirname(DRIVE_SUB), exist_ok=True)
submission.to_csv(DRIVE_SUB, index=False)

print(f'\n✅ Submissão salva!')
print(f'   Local: {SUBMISSION_PATH}')
print(f'   Drive: {DRIVE_SUB}')

Template: (10, 2)
Colunas: ['ID', 'target']
Predições: 4

Template:
      ID  target
0   Acc0      -1
1   Acc2      -1
2   Acc4      -1
3  Acc10      -1
4  Acc11      -1

✅ Submissão salva!
   Local: /content/submission_gamma1.0_threshold.csv
   Drive: /content/drive/MyDrive/spr_2026/submissions/submission_gamma1.0_threshold.csv


In [18]:
# ===== RESUMO FINAL =====
print('='*60)
print('📊 RESUMO COMPLETO DOS EXPERIMENTOS')
print('='*60)

print(f'\n{"Experimento":<40} {"F1 Macro":<12}')
print('-'*52)
print(f'{"Baseline Kaggle (sem MLM)":<40} {"0.79696":<12}')
print(f'{"MLM + gamma=2.0 + argmax":<40} {"0.78665":<12}')

if f1_threshold > f1_baseline:
    print(f'{"MLM + gamma=2.0 + threshold":<40} {f1_threshold:<12.5f}')

for gamma, res in sorted(results.items()):
    print(f'{"MLM + gamma=" + str(gamma) + " + argmax":<40} {res["f1_macro"]:<12.5f}')

print(f'{"MLM + gamma=" + str(best_gamma) + " + threshold":<40} {f1_combined:<12.5f}')

print(f'\n🎯 Meta: ~0.84')
print(f'🏆 Melhor resultado: gamma={best_gamma} + threshold = {f1_combined:.5f}')
print('='*60)

📊 RESUMO COMPLETO DOS EXPERIMENTOS

Experimento                              F1 Macro    
----------------------------------------------------
Baseline Kaggle (sem MLM)                0.79696     
MLM + gamma=2.0 + argmax                 0.78665     
MLM + gamma=2.0 + threshold              0.84896     
MLM + gamma=1.0 + argmax                 0.79060     
MLM + gamma=2.0 + argmax                 0.73815     
MLM + gamma=3.0 + argmax                 0.72377     
MLM + gamma=1.0 + threshold              0.80193     

🎯 Meta: ~0.84
🏆 Melhor resultado: gamma=1.0 + threshold = 0.80193
