# SPR 2026 - TF-IDF + TabPFN-2.5 (Offline)

## ✅ Compatível com competições que exigem Internet OFF

**Este notebook usa TabPFN-2.5 do Kaggle Models (Prior Labs).**

### Configuração:

1. **NÃO precisa de internet** - funciona offline
2. Add Input → **Models** → pesquise `tabpfn` → selecione **TabPFN-2.5** (Prior Labs - 1 Variation)
3. Settings → Accelerator → **GPU T4 x2** (recomendado)
4. Run All

### Vantagens do TabPFN-2.5:
- Transformer-based foundation model para dados tabulares
- Suporta mais amostras e features do que a v1
- Melhor performance em classificação

---

In [None]:
# =============================================================================
# SPR 2026 - TFIDF + TABPFN-2.5 (OFFLINE - KAGGLE MODELS)
# =============================================================================
# REQUER: Modelo "TabPFN-2.5" adicionado ao notebook (Prior Labs)
# =============================================================================

import subprocess
import sys
import os

banner = "=" * 60
print(banner)
print("SPR 2026 - TF-IDF + TabPFN-2.5 (OFFLINE)")
print(banner)

# ==== LOCATE THE TABPFN-2.5 MODEL ====
print("\n[0/5] Detectando TabPFN-2.5...")

# Show everything attached to the notebook so a missing input is easy to spot.
print("    Inputs disponíveis:")
for input_name in os.listdir('/kaggle/input'):
    print(f"      - {input_name}")

# Candidate mount points, checked in order; the first one that exists wins.
TABPFN_PATHS = [
    '/kaggle/input/tabpfn-2.5/pytorch/default/1',
    '/kaggle/input/tabpfn-2.5',
    '/kaggle/input/tabpfn-v2/pytorch/default/1',
    '/kaggle/input/tabpfn-v2-models/pytorch/default/1',
    '/kaggle/input/tabpfn-v2',
    '/kaggle/input/tabpfn-v2-models',
]

TABPFN_PATH = next(
    (candidate for candidate in TABPFN_PATHS if os.path.exists(candidate)),
    None,
)

if TABPFN_PATH is not None:
    print(f"    ✓ TabPFN encontrado em: {TABPFN_PATH}")
    # Preview at most ten entries of the model directory.
    for entry in os.listdir(TABPFN_PATH)[:10]:
        print(f"        └── {entry}")
else:
    # Diagnostic only: print every file whose name or folder mentions
    # "tabpfn", then abort, because none of the known paths matched.
    print("\n    Procurando TabPFN recursivamente...")
    for folder, _subdirs, filenames in os.walk('/kaggle/input'):
        folder_matches = 'tabpfn' in folder.lower()
        for filename in filenames:
            if folder_matches or 'tabpfn' in filename.lower():
                print(f"      Encontrado: {os.path.join(folder, filename)}")
    raise FileNotFoundError(
        "TabPFN-2.5 não encontrado!\n"
        "Adicione: Add Input → Models → pesquise 'tabpfn' → TabPFN-2.5 (Prior Labs)"
    )

In [None]:
# ==== INSTALL / IMPORT TABPFN-2.5 ====
# The package is taken from the local Kaggle Models mount (TABPFN_PATH);
# nothing may be fetched from the network.

# Make any plain Python sources shipped in the model directory importable.
sys.path.insert(0, TABPFN_PATH)

# Collect every wheel below the model directory. Sorting makes the choice
# reproducible instead of depending on os.walk's directory order.
wheel_files = sorted(
    os.path.join(root, f)
    for root, dirs, files in os.walk(TABPFN_PATH)
    for f in files
    if f.endswith('.whl')
)

if wheel_files:
    # Prefer a wheel that is actually named after the package; fall back to
    # the first wheel found when none is.
    tabpfn_wheels = [
        w for w in wheel_files
        if os.path.basename(w).lower().startswith('tabpfn')
    ]
    wheel_to_install = (tabpfn_wheels or wheel_files)[0]
    print(f"    Instalando de: {wheel_to_install}")

    # --no-index keeps pip away from PyPI (the notebook runs with internet
    # off); --find-links lets it resolve dependencies from the bundled wheels.
    pip_cmd = [sys.executable, "-m", "pip", "install", wheel_to_install,
               "-q", "--no-index"]
    for wheel_dir in sorted({os.path.dirname(w) for w in wheel_files}):
        pip_cmd += ["--find-links", wheel_dir]
    subprocess.run(pip_cmd, check=True)

# Import the classifier, preferring the local `tabpfn` package.
try:
    from tabpfn import TabPFNClassifier
    print("    ✓ TabPFN importado com sucesso!")
except ImportError as e:
    print(f"    Erro ao importar: {e}")
    # Fall back to `tabpfn_client`.
    # NOTE(review): tabpfn_client is presumably the hosted-API client and may
    # need network access at prediction time — confirm it is usable offline.
    try:
        from tabpfn_client import TabPFNClassifier
        print("    ✓ TabPFN client importado!")
    except ImportError as client_error:
        # Only import failures are handled here, and the underlying cause is
        # chained so the traceback shows why the fallback failed as well.
        raise ImportError("Não foi possível importar TabPFN") from client_error

In [None]:
# ==== IMPORTS ====
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
import torch
import warnings
warnings.filterwarnings('ignore')

# ==== SETTINGS ====
# One seed for NumPy's global RNG; the same value is passed as random_state
# to the SVD and to the stratified splitter later in the notebook.
SEED = 42
np.random.seed(SEED)

# Folder where the competition CSV files are mounted.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Size limits used by the TabPFN stage.
SVD_COMPONENTS = 100     # dense feature count after the SVD reduction
MAX_TRAIN_SIZE = 3000    # rows per fit; larger training sets are subsampled
N_ENSEMBLE_CONFIGS = 16  # ensemble size requested from each classifier

# Run on CUDA when torch reports a usable device, otherwise on CPU.
USE_GPU = torch.cuda.is_available()
print(f"    ✓ GPU disponível: {USE_GPU}")

In [None]:
# ==== LOAD DATA ====
# Read both competition CSVs from DATA_DIR and report their shapes.
print("\n[1/5] Carregando dados...")
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split_name}.csv')
    for split_name in ('train', 'test')
)
print(f"    Train: {train.shape} | Test: {test.shape}")

In [None]:
# ==== TF-IDF ====
# Turn the free-text reports into a sparse TF-IDF matrix. The vocabulary is
# learned on the training reports only and then reused for the test reports.
print("\n[2/5] Aplicando TF-IDF...")
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

# TfidfVectorizer rejects NaN documents, so missing reports are replaced by an
# empty string and every value is coerced to str before vectorising. This is a
# no-op when the column is already fully populated text.
train_reports = train['report'].fillna('').astype(str)
test_reports = test['report'].fillna('').astype(str)

X_train_tfidf = tfidf.fit_transform(train_reports)
X_test_tfidf = tfidf.transform(test_reports)
y_train = train['target'].values
print(f"    TF-IDF esparso: {X_train_tfidf.shape}")

In [None]:
# ==== SVD - FEATURE REDUCTION ====
# Project the sparse TF-IDF matrix onto SVD_COMPONENTS dense components.
n_tfidf_features = X_train_tfidf.shape[1]
print(f"\n[3/5] Aplicando SVD: {n_tfidf_features} → {SVD_COMPONENTS} features...")

svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SEED)
# The decomposition is fitted on the training matrix only; the test matrix is
# projected with the already-fitted components.
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

explained_share = svd.explained_variance_ratio_.sum()
print(f"    Variância explicada: {explained_share:.2%}")
print(f"    ✅ Shape denso: {X_train_svd.shape}")

# Standardise each component using statistics computed on the training rows.
scaler = StandardScaler()
scaler.fit(X_train_svd)
X_train = scaler.transform(X_train_svd)
X_test = scaler.transform(X_test_svd)

In [None]:
# ==== TABPFN-2.5 ====
# Fit TabPFN on the scaled SVD features and predict the test set. Training
# sets larger than MAX_TRAIN_SIZE are handled by averaging the probabilities
# of several classifiers, each fitted on a stratified subsample.
import inspect

print(f"\n[4/5] Executando TabPFN-2.5 ({'GPU' if USE_GPU else 'CPU'})...")
device = 'cuda' if USE_GPU else 'cpu'


def _find_local_checkpoint(search_root):
    """Return a classifier checkpoint path found under *search_root*, or None.

    Heuristic: only ``*.ckpt`` files whose name contains "classifier" are
    considered; a name containing "default" is preferred, then the shortest
    name, then alphabetical order.
    NOTE(review): the naming of the bundled checkpoints is an assumption —
    confirm against the contents of the attached Kaggle model.
    """
    candidates = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(search_root)
        for name in names
        if name.endswith('.ckpt') and 'classifier' in name.lower()
    ]
    if not candidates:
        return None

    def _rank(path):
        base = os.path.basename(path).lower()
        return ('default' not in base, len(base), base)

    return min(candidates, key=_rank)


def _make_classifier():
    """Build a TabPFNClassifier using only keywords its constructor accepts.

    The ensemble-size keyword was renamed between releases
    (``N_ensemble_configurations`` in v1, ``n_estimators`` in 2.x), so the
    constructor signature is inspected instead of hard-coding one spelling.
    When the constructor takes ``model_path`` and a local checkpoint was
    found, it is passed so the weights are loaded from disk rather than
    downloaded.
    """
    accepted = inspect.signature(TabPFNClassifier.__init__).parameters
    kwargs = {'device': device}
    if 'n_estimators' in accepted:
        kwargs['n_estimators'] = N_ENSEMBLE_CONFIGS
    elif 'N_ensemble_configurations' in accepted:
        kwargs['N_ensemble_configurations'] = N_ENSEMBLE_CONFIGS
    if 'model_path' in accepted and local_checkpoint is not None:
        kwargs['model_path'] = local_checkpoint
    return TabPFNClassifier(**kwargs)


local_checkpoint = _find_local_checkpoint(TABPFN_PATH)
if local_checkpoint is not None:
    print(f"    Checkpoint local: {local_checkpoint}")

# Use subsampling when the training set exceeds the per-fit limit.
if len(X_train) > MAX_TRAIN_SIZE:
    print(f"    Dataset grande ({len(X_train)}), usando ensemble com subsampling...")
    n_ensembles = 5
    all_preds = []

    # Every label present in the full training set, in sorted order. The
    # probability columns of each fitted model are mapped onto this order.
    all_classes = np.unique(y_train)

    splitter = StratifiedShuffleSplit(
        n_splits=n_ensembles,
        train_size=MAX_TRAIN_SIZE,
        random_state=SEED
    )

    for i, (train_idx, _) in enumerate(splitter.split(X_train, y_train)):
        print(f"    Ensemble {i+1}/{n_ensembles}...")

        X_subset = X_train[train_idx]
        y_subset = y_train[train_idx]

        model = _make_classifier()
        model.fit(X_subset, y_subset)
        preds = model.predict_proba(X_test)

        # Place each model's columns at the position of its own classes, so a
        # subsample that lacks a class still yields an array of the same
        # shape (that class simply gets probability 0 from this model).
        model_classes = getattr(model, 'classes_', all_classes)
        aligned = np.zeros((preds.shape[0], len(all_classes)))
        aligned[:, np.searchsorted(all_classes, model_classes)] = preds
        all_preds.append(aligned)

    # Average the probabilities, then translate the winning column back into
    # the actual class label (argmax alone yields a column index, which only
    # equals the label when labels happen to be 0..K-1).
    avg_probs = np.mean(all_preds, axis=0)
    predictions = all_classes[np.argmax(avg_probs, axis=1)]
else:
    print(f"    Treinando em {len(X_train)} amostras...")
    model = _make_classifier()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

print("    ✓ TabPFN-2.5 executado!")

In [None]:
# ==== SUBMISSION ====
# Pair every test ID with its predicted label and write the CSV to disk.
print("\n[5/5] Gerando submissão...")
submission = pd.DataFrame({'ID': test['ID']}).assign(target=predictions)
submission.to_csv('submission.csv', index=False)

separator = "=" * 60
print(separator)
print("✅ CONCLUÍDO - submission.csv criado!")
print(separator)

# Count of predictions per label, ordered by label, as a quick sanity check.
print("\nDistribuição das predições:")
label_counts = submission['target'].value_counts()
print(label_counts.sort_index())