# SPR 2026 - TF-IDF + TabPFN-2.5 (Offline)

## Compatível com competições que exigem Internet OFF

**Este notebook usa TabPFN-2.5 do Kaggle Models (Prior Labs).**

### Configuração:

1. **NÃO precisa de internet** - funciona offline
2. Add Input -> Models -> pesquise tabpfn -> selecione TabPFN-2.5 (Prior Labs)
3. Settings -> Accelerator -> GPU T4 x2 (recomendado; sem GPU o notebook usa CPU automaticamente)
4. Run All

---

In [None]:
import subprocess, sys, os
# Locate the TabPFN-2.5 model files attached to the notebook under
# /kaggle/input and expose their directory as TABPFN_PATH.
print('='*60)
print('SPR 2026 - TF-IDF + TabPFN-2.5 (OFFLINE)')
print('='*60)
print('\n[0/5] Detectando TabPFN-2.5...')
print('    Inputs disponiveis:')
for item in os.listdir('/kaggle/input'):
    print(f'      - {item}')

# Preferred mount points, most specific first; the first existing one wins.
TABPFN_PATHS = [
    '/kaggle/input/tabpfn-2.5/pytorch/default/1',
    '/kaggle/input/tabpfn-2.5',
]
TABPFN_PATH = next((path for path in TABPFN_PATHS if os.path.exists(path)), None)

if TABPFN_PATH is None:
    # The mount path depends on the slug/framework/version chosen in
    # "Add Input", so search for it instead of failing outright.
    print('Procurando TabPFN recursivamente...')
    candidates = []
    for root, dirs, files in os.walk('/kaggle/input'):
        for f in files:
            if 'tabpfn' in f.lower() or 'tabpfn' in root.lower():
                print(f'  Encontrado: {os.path.join(root, f)}')
                if root not in candidates:
                    candidates.append(root)
    if not candidates:
        raise FileNotFoundError('TabPFN-2.5 nao encontrado! Add Input -> Models -> TabPFN-2.5')
    # Use the shallowest matching directory so nested files (wheels,
    # checkpoints) stay reachable from TABPFN_PATH via os.walk.
    TABPFN_PATH = min(candidates, key=lambda p: p.count(os.sep))

print(f'    TabPFN encontrado em: {TABPFN_PATH}')
for f in os.listdir(TABPFN_PATH)[:10]:
    print(f'        - {f}')

In [None]:
# Make TabPFN importable: expose TABPFN_PATH on sys.path and, when the model
# bundle ships wheels, install the tabpfn wheel without touching the network.
sys.path.insert(0, TABPFN_PATH)
wheel_files = sorted(
    os.path.join(root, f)
    for root, dirs, files in os.walk(TABPFN_PATH)
    for f in files
    if f.endswith('.whl')
)
if wheel_files:
    # Prefer the tabpfn package itself; os.walk order is arbitrary, so the
    # first wheel found could otherwise be a bundled dependency.
    tabpfn_wheels = [
        w for w in wheel_files
        if os.path.basename(w).lower().startswith('tabpfn-')
    ]
    wheel_to_install = (tabpfn_wheels or wheel_files)[0]
    wheel_dirs = sorted({os.path.dirname(w) for w in wheel_files})
    print(f'    Instalando de: {wheel_to_install}')
    # --no-index keeps pip from contacting PyPI (Internet is OFF);
    # --find-links lets it resolve dependencies from the bundled wheels.
    pip_cmd = [sys.executable, '-m', 'pip', 'install', '--no-index']
    pip_cmd += [f'--find-links={d}' for d in wheel_dirs]
    pip_cmd += [wheel_to_install, '-q']
    subprocess.run(pip_cmd, check=True)
try:
    from tabpfn import TabPFNClassifier
    print('    TabPFN importado!')
except ImportError:
    # NOTE(review): tabpfn_client appears to be the hosted-API client, which
    # would need network access - confirm this fallback is wanted offline.
    from tabpfn_client import TabPFNClassifier
    print('    TabPFN client importado!')

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
import torch
import warnings
warnings.filterwarnings('ignore')

# --- Reproducibility ---------------------------------------------------
SEED = 42
np.random.seed(SEED)

# --- Pipeline hyper-parameters -----------------------------------------
SVD_COMPONENTS = 100       # dense features kept after TruncatedSVD
MAX_TRAIN_SIZE = 3000      # rows per TabPFN fit before subsampling kicks in
N_ENSEMBLE_CONFIGS = 16    # ensemble size handed to TabPFNClassifier

# --- Environment -------------------------------------------------------
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
USE_GPU = torch.cuda.is_available()
print(f'    GPU disponivel: {USE_GPU}')

In [None]:
# Read the competition's train and test tables from DATA_DIR.
print('\n[1/5] Carregando dados...')
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split_name}.csv')
    for split_name in ('train', 'test')
)
print(f'    Train: {train.shape} | Test: {test.shape}')

In [None]:
# Turn the free-text reports into a sparse TF-IDF matrix (unigrams and
# bigrams); the vocabulary is fitted on the training reports only.
print('\n[2/5] Aplicando TF-IDF...')
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.95, sublinear_tf=True)
# Missing reports are read by pandas as NaN (a float), which the vectorizer
# cannot tokenize; treat them as empty documents instead of crashing.
X_train_tfidf = tfidf.fit_transform(train['report'].fillna('').astype(str))
X_test_tfidf = tfidf.transform(test['report'].fillna('').astype(str))
y_train = train['target'].values
print(f'    TF-IDF esparso: {X_train_tfidf.shape}')

In [None]:
# Compress the sparse TF-IDF matrix to SVD_COMPONENTS dense features, then
# standardize them using statistics computed on the training set only.
print(f'\n[3/5] Aplicando SVD: {X_train_tfidf.shape[1]} -> {SVD_COMPONENTS} features...')
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SEED)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)
_explained_total = svd.explained_variance_ratio_.sum()
print(f'    Variancia explicada: {_explained_total:.2%}')

scaler = StandardScaler().fit(X_train_svd)
X_train, X_test = (scaler.transform(matrix) for matrix in (X_train_svd, X_test_svd))

In [None]:
# Fit TabPFN on the scaled SVD features and predict the test labels. When the
# training set exceeds MAX_TRAIN_SIZE, several models are fitted on stratified
# subsamples and their class probabilities are averaged.
print(f'\n[4/5] Executando TabPFN-2.5...')
device = 'cuda' if USE_GPU else 'cpu'


def _find_classifier_checkpoint(base_dir):
    """Return the path of a classifier checkpoint under *base_dir*, or None.

    Only ``*.ckpt`` files whose name contains "classifier" are considered;
    a file whose name contains "default" is preferred when several match.
    """
    found = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(base_dir)
        for name in names
        if name.endswith('.ckpt') and 'classifier' in name.lower()
    )
    if not found:
        return None
    defaults = [p for p in found if 'default' in os.path.basename(p).lower()]
    return (defaults or found)[0]


def _make_tabpfn():
    """Build a TabPFNClassifier with the keyword names the installed API accepts."""
    kwargs = {'device': device, 'n_estimators': N_ENSEMBLE_CONFIGS}
    if _tabpfn_checkpoint is not None:
        # Point at the attached weights so nothing has to be downloaded.
        kwargs['model_path'] = _tabpfn_checkpoint
    try:
        # TabPFN v2+ renamed the ensemble-size argument to `n_estimators`.
        return TabPFNClassifier(**kwargs)
    except TypeError:
        # Legacy (v1) constructor signature.
        return TabPFNClassifier(device=device, N_ensemble_configurations=N_ENSEMBLE_CONFIGS)


_tabpfn_checkpoint = _find_classifier_checkpoint(TABPFN_PATH)
if _tabpfn_checkpoint is not None:
    print(f'    Checkpoint: {_tabpfn_checkpoint}')

if len(X_train) > MAX_TRAIN_SIZE:
    print(f'    Dataset grande ({len(X_train)}), usando ensemble com subsampling...')
    n_ensembles = 5
    all_preds = []
    # Global, sorted label set: every model's probabilities are aligned to it
    # so averaging stays valid even if a subsample misses a rare class.
    all_classes = np.unique(y_train)
    splitter = StratifiedShuffleSplit(n_splits=n_ensembles, train_size=MAX_TRAIN_SIZE, random_state=SEED)
    for i, (train_idx, _) in enumerate(splitter.split(X_train, y_train)):
        print(f'    Ensemble {i+1}/{n_ensembles}...')
        X_subset = X_train[train_idx]
        y_subset = y_train[train_idx]
        model = _make_tabpfn()
        model.fit(X_subset, y_subset)
        preds = np.zeros((X_test.shape[0], len(all_classes)))
        preds[:, np.searchsorted(all_classes, model.classes_)] = model.predict_proba(X_test)
        all_preds.append(preds)
    avg_probs = np.mean(all_preds, axis=0)
    # argmax yields column positions; map them back to the actual labels.
    predictions = all_classes[np.argmax(avg_probs, axis=1)]
else:
    print(f'    Treinando em {len(X_train)} amostras...')
    model = _make_tabpfn()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
print('    TabPFN-2.5 executado!')

In [None]:
# Write the predictions next to their test IDs and show the class balance.
print('\n[5/5] Gerando submissao...')
submission = test[['ID']].assign(target=predictions)
submission.to_csv('submission.csv', index=False)

_banner = '='*60
print(_banner, 'CONCLUIDO - submission.csv criado!', _banner, sep='\n')

_target_counts = submission['target'].value_counts()
print(_target_counts.sort_index())