# SPR 2026 Mammography Report Classification
## Kaggle Submission Notebook

Este notebook treina um modelo BERT e gera a submissão para o Kaggle.

In [None]:
# Instalar dependências (rodar apenas se necessário)
# !pip install transformers accelerate datasets scikit-learn -q

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
import warnings
warnings.filterwarnings('ignore')

# Settings
SEED = 42  # shared by set_seed, the K-fold splitter and TrainingArguments
N_FOLDS = 5  # number of stratified cross-validation folds
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"  # BERTimbau
MAX_LENGTH = 512  # token length every report is truncated/padded to
BATCH_SIZE = 16  # per-device training batch size (evaluation uses twice this)
EPOCHS = 3  # training epochs per fold
LR = 2e-5  # learning rate passed to the Trainer

# Kaggle paths
INPUT_DIR = "/kaggle/input/spr-2026-mammography-report-classification"
OUTPUT_DIR = "/kaggle/working"

# Device
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

In [None]:
# Set seeds
def set_seed(seed=42):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU generator plus all CUDA devices) with the same value, so
    repeated runs start from the same random state.

    Args:
        seed: Integer seed applied to each generator. Defaults to 42.
    """
    # Imported here so the helper does not depend on a top-of-file import.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Harmless on machines without CUDA: the call is silently ignored there.
    torch.cuda.manual_seed_all(seed)

# Apply the global seed before any data loading or model construction below.
set_seed(SEED)

## 1. Carregar Dados

In [None]:
# Load the training split, then report its size and per-class row counts
train_path = os.path.join(INPUT_DIR, "train.csv")
train_df = pd.read_csv(train_path)
print(f"Train shape: {train_df.shape}")
class_counts = train_df['target'].value_counts().sort_index()
print(f"Target distribution:\n{class_counts}")

In [None]:
# Load the test split; the file is only present during the evaluation run,
# so test_df stays None when it cannot be found
test_path = os.path.join(INPUT_DIR, "test.csv")
test_df = pd.read_csv(test_path) if os.path.exists(test_path) else None

if test_df is None:
    print("Test file not found - expected in evaluation runtime")
else:
    print(f"Test shape: {test_df.shape}")

## 2. Dataset

In [None]:
class MammographyDataset(Dataset):
    """Torch dataset that tokenizes one report each time an item is read.

    Args:
        texts: Indexable collection of report texts; items are converted
            with ``str()`` before tokenization.
        labels: Optional indexable collection of integer class ids aligned
            with ``texts``. When ``None`` (inference), items carry no
            ``"labels"`` entry.
        tokenizer: Callable applied to each text with ``truncation``,
            ``max_length``, ``padding`` and ``return_tensors`` keyword
            arguments; it must return a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors that have a leading batch axis.
        max_length: Length every encoded report is truncated/padded to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reports in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the report at ``idx`` and return its model inputs.

        Returns:
            dict: ``"input_ids"`` and ``"attention_mask"`` as 1-D tensors,
            plus ``"labels"`` (a ``torch.long`` scalar) when labels were
            supplied.
        """
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever it has length 1, handing the
        # collator a 0-d tensor instead of a 1-D one.
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

## 3. Treinamento

In [None]:
def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions hold one
            score per class along axis 1.

    Returns:
        dict: ``{"f1_macro": score}``.
    """
    class_scores, true_labels = eval_pred
    # The highest-scoring class in each row is the predicted label
    predicted_labels = np.asarray(class_scores).argmax(axis=1)
    macro_f1 = f1_score(true_labels, predicted_labels, average="macro")
    return {"f1_macro": macro_f1}

# Tokenizer for the pretrained checkpoint, reused by every dataset below
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Stratified, shuffled K-fold splitter with a fixed random state
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# One trained model is collected per fold
models = []
# Out-of-fold scores: one row per training report, one column per class (7)
oof_predictions = np.zeros(shape=(len(train_df), 7))

In [None]:
# Cross-validated fine-tuning: for each stratified fold, train a fresh model,
# store its out-of-fold scores in `oof_predictions`, print the fold macro-F1
# and keep the trained model in `models` for the test-time ensemble.

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# pick whichever keyword the installed TrainingArguments actually defines.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# fp16 mixed precision needs a GPU; asking for it on a CPU-only runtime makes
# TrainingArguments fail, so only enable it when CUDA is present.
use_fp16 = torch.cuda.is_available()

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['target'])):
    print(f"\n{'='*50}")
    print(f"Fold {fold + 1}/{N_FOLDS}")
    print(f"{'='*50}")

    # Split data
    train_texts = train_df.iloc[train_idx]['report'].tolist()
    train_labels = train_df.iloc[train_idx]['target'].tolist()
    val_texts = train_df.iloc[val_idx]['report'].tolist()
    val_labels = train_df.iloc[val_idx]['target'].tolist()

    # Create datasets
    train_dataset = MammographyDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
    val_dataset = MammographyDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)

    # Load a fresh pretrained model for this fold (7 output classes)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=7,
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"{OUTPUT_DIR}/fold_{fold}",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE * 2,
        learning_rate=LR,
        weight_decay=0.01,
        warmup_ratio=0.1,
        fp16=use_fp16,
        save_strategy="epoch",
        # Without a limit, every epoch of every fold leaves a checkpoint in
        # OUTPUT_DIR. The best checkpoint is still retained for
        # load_best_model_at_end.
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        report_to="none",
        seed=SEED,
        **{eval_strategy_kwarg: "epoch"},
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train
    trainer.train()

    # Get OOF predictions (raw class scores for the held-out rows)
    preds = trainer.predict(val_dataset)
    oof_predictions[val_idx] = preds.predictions

    # Evaluate
    val_pred_labels = np.argmax(preds.predictions, axis=1)
    fold_f1 = f1_score(val_labels, val_pred_labels, average="macro")
    print(f"Fold {fold + 1} F1-Macro: {fold_f1:.4f}")

    # Park the finished model on the CPU so models from earlier folds do not
    # occupy GPU memory while later folds train; a Trainer built for
    # inference moves it back to its own device.
    model.to("cpu")
    models.append(model)

In [None]:
# Overall out-of-fold score: take the top-scoring class for every training
# row and compare against the true targets with macro-averaged F1
oof_pred_labels = oof_predictions.argmax(axis=1)
oof_f1 = f1_score(
    y_true=train_df['target'],
    y_pred=oof_pred_labels,
    average="macro",
)
print(f"\nOverall OOF F1-Macro: {oof_f1:.4f}")

## 4. Inferência e Submissão

In [None]:
# Test-time inference: score the test reports with every fold model, average
# the per-class scores across models, and write the arg-max class per report
# to submission.csv. Skipped when the test file was not available.
if test_df is not None:
    print("Generating predictions for test set...")

    # Create test dataset (no labels at inference time)
    test_dataset = MammographyDataset(
        test_df['report'].tolist(),
        labels=None,
        tokenizer=tokenizer,
        max_length=MAX_LENGTH,
    )

    # Explicit prediction-only arguments. A bare Trainer(model=...) falls back
    # to default TrainingArguments: a default output directory, a smaller
    # evaluation batch, and a default `report_to` that may switch on whatever
    # logging integrations are installed in the runtime.
    inference_args = TrainingArguments(
        output_dir=f"{OUTPUT_DIR}/inference",
        per_device_eval_batch_size=BATCH_SIZE * 2,
        report_to="none",
    )

    # Ensemble predictions: one score matrix per fold model
    all_predictions = []

    for fold, model in enumerate(models):
        print(f"Predicting with model {fold + 1}/{len(models)}")
        model.eval()

        trainer = Trainer(model=model, args=inference_args)
        preds = trainer.predict(test_dataset)
        all_predictions.append(preds.predictions)

    # Average the raw scores over models, then pick the best class per report
    test_predictions = np.mean(all_predictions, axis=0)
    test_pred_labels = np.argmax(test_predictions, axis=1)

    # Create submission
    submission = pd.DataFrame({
        'ID': test_df['ID'],
        'target': test_pred_labels,
    })

    submission.to_csv('submission.csv', index=False)
    print(f"\nSubmission saved!")
    print(f"Submission shape: {submission.shape}")
    print(f"Prediction distribution:\n{submission['target'].value_counts().sort_index()}")
else:
    print("Test file not available - run this in Kaggle evaluation environment")

In [None]:
# Preview submission
# Runs only when submission.csv exists in the current working directory,
# which is where the inference cell writes it; the file is re-read from disk
# and its first rows are printed.
if os.path.exists('submission.csv'):
    submission = pd.read_csv('submission.csv')
    print(submission.head())