# SPR 2026 - TF-IDF + XGBoost (OTIMIZADO)

**Otimizações aplicadas:**
1. ✅ TruncatedSVD para reduzir dimensionalidade (até 10k features TF-IDF esparsas → 500 features densas)
2. ✅ Hiperparâmetros mais leves (menos árvores, profundidade menor)
3. ✅ GPU acceleration (tree_method='gpu_hist')

**Tempo esperado:** ~2-5 min (vs 1h+ sem otimização)

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2** (para GPU) ou **CPU** (mais lento)
---

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_sample_weight
import xgboost as xgb
import torch
import warnings
warnings.filterwarnings('ignore')

# --- Global configuration -------------------------------------------------
# Seed shared by NumPy and every estimator that accepts a random_state.
SEED = 42
np.random.seed(SEED)

# Number of dense components kept after the TruncatedSVD reduction.
SVD_COMPONENTS = 500

# Kaggle input directory holding train.csv and test.csv.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Detect a CUDA device once; later cells use this flag to pick the backend.
USE_GPU = torch.cuda.is_available()

print('Bibliotecas carregadas!')
print(f'GPU disponível: {USE_GPU}')

In [None]:
# Load the competition CSVs from the Kaggle input directory.
train_csv = f'{DATA_DIR}/train.csv'
test_csv = f'{DATA_DIR}/test.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Report (rows, columns) of each split as a quick sanity check.
print(f'Train: {train.shape}')
print(f'Test: {test.shape}')

In [None]:
# Feature pipeline: sparse TF-IDF -> dense TruncatedSVD components -> standardized.
# The vocabulary is capped so the SVD step stays fast.
tfidf = TfidfVectorizer(
    max_features=10000,  # reduced from 20k
    ngram_range=(1, 2),  # reduced from (1, 3)
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

# TfidfVectorizer raises ValueError on NaN documents, so missing reports are
# treated as empty text instead of aborting the whole run.
train_reports = train['report'].fillna('').astype(str)
test_reports = test['report'].fillna('').astype(str)

# Fit the vocabulary/IDF on train only, then reuse it for test.
X_train_tfidf = tfidf.fit_transform(train_reports)
X_test_tfidf = tfidf.transform(test_reports)
y_train = train['target'].values

print(f'TF-IDF shape (esparso): {X_train_tfidf.shape}')

# TruncatedSVD turns the sparse matrix into dense features. The default
# randomized solver needs n_components <= n_features, so clamp the request
# in case min_df/max_df pruning leaves a vocabulary smaller than SVD_COMPONENTS.
n_tfidf_features = X_train_tfidf.shape[1]
n_components = min(SVD_COMPONENTS, n_tfidf_features)

print(f'\nAplicando SVD: {n_tfidf_features} → {n_components} features...')
svd = TruncatedSVD(n_components=n_components, random_state=SEED)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

print(f'Variância explicada: {svd.explained_variance_ratio_.sum():.2%}')
print(f'Shape final (denso): {X_train_svd.shape}')

# Standardize each SVD component; statistics come from train only and are
# then applied unchanged to test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_svd)
X_test = scaler.transform(X_test_svd)

In [None]:
# Per-sample weights that compensate for class imbalance in y_train.
sample_weights = compute_sample_weight('balanced', y_train)

# Backend selection depends on the installed XGBoost:
#   * >= 2.0: the GPU is chosen with `device`; tree_method='gpu_hist' and
#     `gpu_id` are deprecated and `use_label_encoder` is no longer used.
#   * 1.x: only the legacy arguments exist, so keep the original ones.
xgb_major = int(xgb.__version__.split('.')[0])
if xgb_major >= 2:
    backend_params = {
        'tree_method': 'hist',
        'device': 'cuda:0' if USE_GPU else 'cpu',
    }
else:
    backend_params = {
        'tree_method': 'gpu_hist' if USE_GPU else 'hist',
        'gpu_id': 0 if USE_GPU else None,
        'use_label_encoder': False,
    }

# Lightweight XGBoost configuration (fewer trees than the earlier 300-tree setup).
model = xgb.XGBClassifier(
    n_estimators=200,       # reduced from 300
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=SEED,
    n_jobs=-1,
    eval_metric='mlogloss',
    **backend_params
)

# Report the backend actually passed to XGBoost.
backend_label = 'GPU' if USE_GPU else 'CPU'
print(f'Usando: {backend_label} (tree_method={backend_params["tree_method"]})')

# Optional cross-validation; left disabled to save run time.
# scores = cross_val_score(model, X_train, y_train, cv=3, scoring='f1_macro')
# print(f'CV F1-Macro: {scores.mean():.5f} (+/- {scores.std():.5f})')

# Fit the final model on the full training set with the balancing weights.
print('Treinando XGBoost...')
model.fit(X_train, y_train, sample_weight=sample_weights)
print('Modelo treinado!')

In [None]:
# Predict a class for every test report.
predictions = model.predict(X_test)

# Assemble the submission: the test IDs plus their predicted target.
submission = test[['ID']].assign(target=predictions)

submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)

print('submission.csv criado!')

# Show how many test rows were assigned to each class, ordered by label.
class_counts = submission['target'].value_counts()
print(class_counts.sort_index())