# SPR 2026 - TF-IDF + CatBoost (OTIMIZADO)

**Otimizações aplicadas:**
1. ✅ TruncatedSVD: até 10k features TF-IDF esparsas → 500 features densas
2. ✅ GPU acceleration: `task_type='GPU'` quando há GPU disponível (caso contrário, CPU)
3. ✅ Early stopping para evitar overfitting

**Tempo esperado:** ~5-10 min (vs 1h+ sem SVD!)

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2** (recomendado) ou **CPU**
3. **IMPORTANTE:** Execute "Run All" após commit para garantir versão atualizada
---

In [None]:
# =============================================================================
# SPR 2026 - TFIDF + CATBOOST COM SVD (CÓDIGO CONSOLIDADO)
# =============================================================================
# Execute TODA esta célula para garantir que SVD seja aplicado corretamente!
# Verifique no LOG: "Shape denso: (N, 500)" confirma SVD funcionando
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import torch
import warnings
warnings.filterwarnings('ignore')

# Banner written at the top of the run log.
rule = "=" * 60
print(rule)
print("SPR 2026 - TF-IDF + CatBoost COM SVD")
print(rule)

# ==== SETTINGS ====
# Seed reused for NumPy and for every seeded estimator in this script.
SEED = 42
# Number of dense components requested from TruncatedSVD.
SVD_COMPONENTS = 500
# Directory the train/test CSV files are read from.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

np.random.seed(SEED)
# CUDA availability (as reported by PyTorch) selects GPU vs CPU training.
USE_GPU = torch.cuda.is_available()

print(f"✓ GPU disponível: {USE_GPU}")

# ==== LOAD DATA ====
# Read the train and test CSV files found under DATA_DIR.
print("\n[1/5] Carregando dados...")
train_csv = f'{DATA_DIR}/train.csv'
test_csv = f'{DATA_DIR}/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
print(f"    Train: {train.shape} | Test: {test.shape}")

# ==== TF-IDF ====
# Vectorize the report text into a sparse TF-IDF matrix. The vocabulary is
# fitted on the training reports only and then reused for the test reports.
print("\n[2/5] Aplicando TF-IDF...")
tfidf = TfidfVectorizer(
    max_features=10000,   # cap the vocabulary at 10,000 terms
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=2,             # drop terms seen in fewer than 2 reports
    max_df=0.95,          # drop terms seen in more than 95% of reports
    sublinear_tf=True,    # use 1 + log(tf) instead of raw term counts
)
# pandas loads empty CSV cells as NaN, and TfidfVectorizer raises on NaN
# documents. Treat missing reports as empty text so a blank row in either
# split cannot abort the run.
train_reports = train['report'].fillna('').astype(str)
test_reports = test['report'].fillna('').astype(str)
X_train_tfidf = tfidf.fit_transform(train_reports)
X_test_tfidf = tfidf.transform(test_reports)
y_train = train['target'].values
print(f"    TF-IDF esparso: {X_train_tfidf.shape}")

# ==== SVD ====
# Project the sparse TF-IDF matrix onto a small number of dense components.
# TruncatedSVD rejects n_components larger than the number of input features,
# so cap the request at the fitted vocabulary size (matters on small corpora).
n_tfidf_features = X_train_tfidf.shape[1]
n_components = min(SVD_COMPONENTS, n_tfidf_features)
print(f"\n[3/5] Aplicando SVD: {n_tfidf_features} → {n_components} features...")
svd = TruncatedSVD(n_components=n_components, random_state=SEED)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)
print(f"    Variância explicada: {svd.explained_variance_ratio_.sum():.2%}")
# Report the column count actually requested instead of a hard-coded 500, so
# the message stays correct when SVD_COMPONENTS is changed or capped.
print(f"    ✅ Shape denso: {X_train_svd.shape} ← CONFIRME QUE TEM {n_components} COLUNAS!")

# Standardize the SVD components; the scaler is fitted on the training matrix
# and the same transform is applied to the test matrix.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_svd)
X_test = scaler.transform(X_test_svd)

# ==== HOLD-OUT SPLIT FOR EARLY STOPPING ====
# Keep 10% of the training rows, stratified by label, as the validation set.
holdout_split = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=SEED,
    stratify=y_train,
)
X_tr, X_val, y_tr, y_val = holdout_split

# ==== CATBOOST ====
# Train on the 90% split and monitor the held-out 10% for early stopping.
device_kind = 'GPU' if USE_GPU else 'CPU'
print(f"\n[4/5] Treinando CatBoost ({device_kind})...")

catboost_params = dict(
    iterations=300,
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=3,
    auto_class_weights='Balanced',
    task_type=device_kind,
    # Pin training to device 0 on GPU; pass None on CPU.
    devices='0' if USE_GPU else None,
    random_seed=SEED,
    verbose=50,
    early_stopping_rounds=30,
)
model = CatBoostClassifier(**catboost_params)

validation_set = (X_val, y_val)
model.fit(X_tr, y_tr, eval_set=validation_set, verbose=50)
best_iter = model.best_iteration_
print(f"    ✓ Melhor iteração: {best_iter}")
print("    ✓ Modelo treinado!")

# ==== SUBMISSION ====
# Predict labels for the test matrix and write them, keyed by ID, to
# submission.csv in the working directory.
print("\n[5/5] Gerando submissão...")
predictions = model.predict(X_test)

# Flatten the prediction array to one dimension before building the frame.
submission = pd.DataFrame({'ID': test['ID'], 'target': predictions.flatten()})
submission.to_csv('submission.csv', index=False)

divider = "=" * 60
print(divider)
print("✅ CONCLUÍDO - submission.csv criado!")
print(divider)

# Count of predictions per label, ordered by label value.
print("\nDistribuição das predições:")
target_counts = submission['target'].value_counts()
print(target_counts.sort_index())