# SPR 2026 - FLAN-T5 (Seq2Seq)

Classificação de relatórios de mamografia em categorias BI-RADS usando FLAN-T5 como modelo seq2seq (a classe é gerada como texto pelo decoder).

**Modelo:** google/flan-t5-small ou google/flan-t5-base

**Formato:** Code Competition (Kaggle) / Google Colab

In [None]:
# ============================================================
# SETUP - Environment, data download and Google Drive
# ============================================================
# Resolves DATA_DIR (where train.csv/test.csv are read from) and OUTPUT_DIR
# (where checkpoints and the submission are written) for the current runtime.
import os
import sys

# Runtime detection: Kaggle is recognised by the /kaggle/input directory,
# Colab by the google.colab module already being loaded in this interpreter.
IS_KAGGLE = os.path.exists('/kaggle/input')
IS_COLAB = 'google.colab' in sys.modules

print(f"Ambiente: {'Kaggle' if IS_KAGGLE else 'Colab' if IS_COLAB else 'Local'}")

if IS_KAGGLE:
    # Competition files are read from the Kaggle input directory;
    # outputs go to /kaggle/working.
    DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
    OUTPUT_DIR = '/kaggle/working'
elif IS_COLAB:
    # Mount Google Drive so outputs are saved there
    from google.colab import drive
    drive.mount('/content/drive')
    
    DRIVE_OUTPUT = '/content/drive/MyDrive/SPR_2026_outputs'
    os.makedirs(DRIVE_OUTPUT, exist_ok=True)
    OUTPUT_DIR = DRIVE_OUTPUT
    
    # Download the Kaggle competition data
    !pip install kaggle transformers accelerate sentencepiece -q
    # Credentials are read from the Colab secrets named KAGGLE_USERNAME / KAGGLE_KEY
    # and exported as environment variables for the shell commands below.
    from google.colab import userdata
    os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
    os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')
    # Write the same credentials to ~/.kaggle/kaggle.json with owner-only permissions.
    !mkdir -p ~/.kaggle
    !echo '{"username":"'$KAGGLE_USERNAME'","key":"'$KAGGLE_KEY'"}' > ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    # Fetch the competition archive into the current directory and unpack it into ./data
    !kaggle competitions download -c spr-2026-mammography-report-classification -q
    !mkdir -p data && unzip -o -q spr-2026-mammography-report-classification.zip -d data/
    DATA_DIR = 'data'
else:
    # Local run: paths are relative to the notebook's working directory.
    DATA_DIR = '../data'
    OUTPUT_DIR = '../submissions'
    os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"DATA_DIR: {DATA_DIR}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import warnings
warnings.filterwarnings('ignore')

# Run configuration: reproducibility seed, checkpoint name, sequence limits
# and optimisation hyperparameters.
SEED = 42
MODEL_NAME = 'google/flan-t5-small'  # alternative: 'google/flan-t5-base'
MAX_SOURCE_LEN, MAX_TARGET_LEN = 512, 8
BATCH_SIZE, EPOCHS = 8, 5
LR = 3e-4

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

# Seed NumPy and PyTorch random number generators.
np.random.seed(SEED)
torch.manual_seed(SEED)

## 1. Carregar Dados

In [None]:
# Training data must be present in DATA_DIR.
train_path = os.path.join(DATA_DIR, 'train.csv')
train_df = pd.read_csv(train_path)
print(f"Train: {train_df.shape}")

# The test file is optional here; test_df stays None when it is absent.
test_path = os.path.join(DATA_DIR, 'test.csv')
test_df = pd.read_csv(test_path) if os.path.exists(test_path) else None
if test_df is None:
    print("test.csv não disponível - será carregado no runtime Kaggle")
else:
    print(f"Test: {test_df.shape}")

## 2. Dataset Seq2Seq

In [None]:
# Instruction prefix prepended to every report before tokenization.
PROMPT = "Classifique o relatório de mamografia em BI-RADS (0-6): "


class T5Dataset(Dataset):
    """Map-style dataset yielding tokenized (prompt + report, label) pairs.

    Each source text is prefixed with ``PROMPT``, tokenized and padded or
    truncated to ``max_source_len``. When labels are given, each label is
    converted with ``str()``, tokenized to ``max_target_len`` and its padding
    positions are replaced by ``-100``.

    Args:
        texts: Indexable sequence of report texts (converted with ``str()``).
        labels: Optional indexable sequence of labels aligned with ``texts``.
            When ``None``, items carry no ``'labels'`` entry.
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is called
            with ``max_length``, ``truncation``, ``padding`` and
            ``return_tensors='pt'``.
        max_source_len: Fixed token length of the encoder input.
        max_target_len: Fixed token length of the decoder target.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_source_len=512, max_target_len=8):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_source_len = max_source_len
        self.max_target_len = max_target_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and, if labelled, ``labels``."""
        source = PROMPT + str(self.texts[idx])
        source_enc = self.tokenizer(
            source,
            max_length=self.max_source_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        # squeeze(0) drops only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also collapse a length-1 sequence to a scalar.
        item = {
            'input_ids': source_enc['input_ids'].squeeze(0),
            'attention_mask': source_enc['attention_mask'].squeeze(0),
        }

        if self.labels is not None:
            target = str(self.labels[idx])
            target_enc = self.tokenizer(
                target,
                max_length=self.max_target_len,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            )
            labels = target_enc['input_ids'].squeeze(0)
            # Replace padding ids with -100 so padded positions are excluded
            # from the loss (the ignore index of PyTorch cross-entropy).
            labels[labels == self.tokenizer.pad_token_id] = -100
            item['labels'] = labels

        return item

# Load the tokenizer for the checkpoint named by MODEL_NAME; the same instance
# is passed to the datasets, the data collator and predict_batch.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

## 3. Treinar Modelo

In [None]:
# Hold out 10% of the labelled reports for validation, stratified by target
# so both partitions keep the same class proportions.
split_parts = train_test_split(
    train_df['report'].tolist(),
    train_df['target'].tolist(),
    test_size=0.1,
    stratify=train_df['target'],
    random_state=SEED,
)
train_texts, val_texts, train_labels, val_labels = split_parts

# Wrap both partitions with the shared tokenizer and length limits.
train_ds = T5Dataset(
    train_texts, train_labels, tokenizer,
    max_source_len=MAX_SOURCE_LEN, max_target_len=MAX_TARGET_LEN,
)
val_ds = T5Dataset(
    val_texts, val_labels, tokenizer,
    max_source_len=MAX_SOURCE_LEN, max_target_len=MAX_TARGET_LEN,
)

print(f"Train: {len(train_ds)}, Val: {len(val_ds)}")

In [None]:
import inspect

# Load the pretrained seq2seq checkpoint to fine-tune.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer
# transformers releases; pick whichever name the installed version accepts so
# the cell runs on both old (Kaggle image) and new (fresh Colab install) versions.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = 'eval_strategy' if 'eval_strategy' in _ta_params else 'evaluation_strategy'

# T5-family checkpoints are prone to overflow/NaN losses under fp16 mixed
# precision. Use bf16 only on GPUs with native support (compute capability
# >= 8, i.e. Ampere or newer) and fall back to full precision elsewhere.
_use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

args = TrainingArguments(
    output_dir=f'{OUTPUT_DIR}/flan_t5',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    learning_rate=LR,
    weight_decay=0.01,
    warmup_ratio=0.1,
    bf16=_use_bf16,
    save_strategy='epoch',
    # Requires evaluation and save strategies to match (both 'epoch' here).
    load_best_model_at_end=True,
    report_to='none',
    seed=SEED,
    **{_eval_strategy_key: 'epoch'},
)

# Collates dataset items into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
)

trainer.train()

## 4. Avaliar no Validation

In [None]:
def predict_batch(texts, model, tokenizer, max_len=512, default_label=2):
    """Generate one integer class prediction per input text.

    Each text is prefixed with ``PROMPT``, tokenized (truncated to
    ``max_len`` tokens) and passed to ``model.generate`` one at a time.
    The decoded output is parsed as an integer.

    Args:
        texts: Iterable of report texts; each is converted with ``str()`` so
            non-string values (e.g. missing reports) do not raise.
        model: Generation-capable model exposing ``eval()``, ``generate()``
            and a ``device`` attribute.
        tokenizer: Tokenizer used to encode inputs and decode outputs.
        max_len: Maximum number of input tokens before truncation.
        default_label: Value appended when the generated text is not a
            valid integer.

    Returns:
        list[int]: Predictions in the same order as ``texts``.
    """
    model.eval()
    # Send inputs to wherever the model actually lives instead of relying on
    # a module-level device variable.
    target_device = model.device
    preds = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(
                PROMPT + str(text),
                max_length=max_len,
                truncation=True,
                return_tensors='pt',
            ).to(target_device)
            outputs = model.generate(**inputs, max_new_tokens=4)
            decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
            try:
                preds.append(int(decoded.strip()))
            except ValueError:
                # Only a failed parse falls back to the default; any other
                # error (OOM, interrupt, bugs) must still surface.
                preds.append(default_label)
    return preds

# Place the fine-tuned model on the selected device, predict the held-out
# reports and report the macro-averaged F1 against their true labels.
model = model.to(device)
val_preds = predict_batch(val_texts, model, tokenizer)
val_f1 = f1_score(y_true=val_labels, y_pred=val_preds, average='macro')
print(f"Validation F1-Macro: {val_f1:.4f}")

## 5. Gerar Submissão

In [None]:
# Build the submission only when a test set was loaded earlier.
if test_df is None:
    print("Test não disponível - submeta no Kaggle")
else:
    test_reports = test_df['report'].tolist()
    test_preds = predict_batch(test_reports, model, tokenizer)

    # One row per test ID with its predicted class, written under OUTPUT_DIR.
    submission = pd.DataFrame({'ID': test_df['ID'], 'target': test_preds})
    submission_path = os.path.join(OUTPUT_DIR, 'submission.csv')
    submission.to_csv(submission_path, index=False)
    print(f"Submissão salva!")
    # Predicted class distribution, ordered by class label.
    print(submission['target'].value_counts().sort_index())

In [None]:
# On Colab, offer the generated submission as a browser download.
if IS_COLAB and test_df is not None:
    from google.colab import files
    # The submission was written under OUTPUT_DIR (the Drive folder on Colab),
    # not the current working directory, so download it from that path.
    files.download(os.path.join(OUTPUT_DIR, 'submission.csv'))