# SPR 2026 - TF-IDF + TabPFN (OTIMIZADO)

**TabPFN: Prior-Data Fitted Network**

Modelo transformer pré-treinado para classificação tabular que funciona sem treinamento!

**Otimizações aplicadas:**
1. ✅ TruncatedSVD para reduzir dimensionalidade (100 componentes)
2. ✅ Ensemble com subsampling se dataset > 10k amostras
3. ✅ Aceleração por GPU (`device='cuda'` quando disponível; caso contrário, CPU)

**Limitações do TabPFN:**
- Máximo 10.000 amostras de treino
- Máximo 500 features (usamos SVD com 100)
- Máximo 10 classes ✅ (temos 7)

**Tempo esperado:** ~2-5 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2** (recomendado; sem GPU o código usa CPU)
3. Add Data → Datasets → `Prior Labs / TabPFN v2 Weights`
---

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
import torch
import warnings
warnings.filterwarnings('ignore')

# TabPFN import: try several candidate locations.
# Each entry is a Kaggle dataset mount point that may hold the tabpfn package;
# all of them are put on sys.path before the import is attempted.
import sys
TABPFN_PATHS = [
    '/kaggle/input/tabpfn-v2-weights',
    '/kaggle/input/tabpfn-weights',
    '/kaggle/input/tabpfnv2'
]
for path in TABPFN_PATHS:
    sys.path.insert(0, path)

try:
    from tabpfn import TabPFNClassifier
except ImportError:
    print('ERRO: TabPFN não encontrado!')
    print('Adicione o dataset: Prior Labs / TabPFN v2 Weights')
    # Fail fast: the modelling cell cannot run without TabPFNClassifier, and
    # continuing would only surface later as an unrelated-looking NameError.
    raise
else:
    print('TabPFN carregado!')

# ---- Run configuration ----
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
SEED = 42
SVD_COMPONENTS = 100     # author's note: TabPFN works better with fewer features
MAX_TRAIN_SIZE = 10_000  # author's note: TabPFN's training-set size limit

# Seed NumPy's global random state with the shared seed.
np.random.seed(SEED)

# Record whether CUDA is visible to torch and report it.
USE_GPU = torch.cuda.is_available()
print(f'GPU disponível: {USE_GPU}')

In [None]:
# Load the train and test tables (train.csv, then test.csv) from DATA_DIR.
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

# Report the (rows, columns) shape of each table.
for label, frame in (('Train', train), ('Test', test)):
    print(f'{label}: {frame.shape}')

In [None]:
# Compact TF-IDF representation of the report text (input to the SVD step).
tfidf_params = {
    'max_features': 5000,    # kept small so the later SVD is cheaper (author's note)
    'ngram_range': (1, 2),   # unigrams and bigrams
    'min_df': 2,
    'max_df': 0.95,
    'sublinear_tf': True,
}
tfidf = TfidfVectorizer(**tfidf_params)

# Fit the vocabulary on the training reports only, then reuse it for test.
train_reports = train['report']
test_reports = test['report']
X_train_tfidf = tfidf.fit_transform(train_reports)
X_test_tfidf = tfidf.transform(test_reports)

# Training labels as a plain array.
y_train = train['target'].values

print(f'TF-IDF shape: {X_train_tfidf.shape}')

In [None]:
# Dimensionality reduction for TabPFN: project the TF-IDF matrix down to
# SVD_COMPONENTS columns (the notebook's notes cite a 500-feature limit and
# choose 100 for efficiency).
n_tfidf_features = X_train_tfidf.shape[1]
print(f'Aplicando SVD: {n_tfidf_features} → {SVD_COMPONENTS} features...')

# The projection is learned from the training matrix and reused for test.
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SEED)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

explained_ratio = svd.explained_variance_ratio_.sum()
print(f'Variância explicada: {explained_ratio:.2%}')

# Standardise each SVD component using training-set statistics only
# (the author marks this step as important for TabPFN).
scaler = StandardScaler().fit(X_train_svd)
X_train = scaler.transform(X_train_svd)
X_test = scaler.transform(X_test_svd)

print(f'Shape final: {X_train.shape}')

In [None]:
# TabPFN classification. The author's note: it works without training.
# Up to MAX_TRAIN_SIZE rows a single model is fitted on everything; above
# that, several models are fitted on stratified subsamples and their class
# probabilities are averaged.
import inspect

device = 'cuda' if USE_GPU else 'cpu'
print(f'Usando device: {device}')


def _make_tabpfn(n_configs):
    """Return a TabPFNClassifier on ``device`` with ``n_configs`` internal ensemble members.

    The ensemble-size keyword differs between TabPFN releases
    (``N_ensemble_configurations`` in v1, ``n_estimators`` in v2 -- confirm
    against the installed package), so it is chosen from the constructor
    signature. The original keyword is used when ``n_estimators`` is absent.
    """
    accepted = inspect.signature(TabPFNClassifier).parameters
    if 'n_estimators' in accepted:
        size_kwarg = 'n_estimators'
    else:
        size_kwarg = 'N_ensemble_configurations'
    return TabPFNClassifier(device=device, **{size_kwarg: n_configs})


if len(X_train) <= MAX_TRAIN_SIZE:
    # Small enough: use every training row directly.
    print(f'Dataset pequeno ({len(X_train)} amostras), usando todos os dados...')

    # More configurations = a stronger internal ensemble (author's note).
    model = _make_tabpfn(16)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

else:
    # Too large: ensemble over stratified subsamples of MAX_TRAIN_SIZE rows.
    print(f'Dataset grande ({len(X_train)} amostras), usando ensemble com subsampling...')

    n_ensembles = 5
    all_preds = []

    # Global, sorted label set: the column layout of the averaged probabilities.
    class_labels = np.unique(y_train)

    splitter = StratifiedShuffleSplit(
        n_splits=n_ensembles,
        train_size=MAX_TRAIN_SIZE,
        random_state=SEED
    )

    for i, (train_idx, _) in enumerate(splitter.split(X_train, y_train)):
        print(f'Ensemble {i+1}/{n_ensembles}...')

        X_subset = X_train[train_idx]
        y_subset = y_train[train_idx]

        model = _make_tabpfn(8)
        model.fit(X_subset, y_subset)
        preds = model.predict_proba(X_test)

        # predict_proba columns follow model.classes_. Place them into the
        # global layout so a class missing from one subsample contributes
        # zeros instead of shifting the columns of the other classes.
        aligned = np.zeros((preds.shape[0], len(class_labels)))
        aligned[:, np.searchsorted(class_labels, model.classes_)] = preds
        all_preds.append(aligned)

    # Average the probabilities, then map the winning column back to its
    # label: argmax alone yields a column index, not a class value.
    avg_probs = np.mean(all_preds, axis=0)
    predictions = class_labels[np.argmax(avg_probs, axis=1)]

print('Modelo pronto!')

In [None]:
# Assemble the submission: the test IDs alongside the predicted targets.
submission = test[['ID']].assign(target=predictions)

# Write without the index so the file holds only the ID and target columns.
submission.to_csv('submission.csv', index=False)
print('submission.csv criado!')

# Show how many test rows were assigned to each class, ordered by class.
class_counts = submission['target'].value_counts().sort_index()
print(class_counts)