# SPR 2026 - TF-IDF + SVD + XGBoost

**OTIMIZAÇÃO: Reduzir dimensionalidade antes do boosting**

Problema: TF-IDF gera 15k+ features esparsas → XGBoost fica MUITO LENTO.

Solução: TruncatedSVD reduz para 500 features densas → XGBoost rápido!

---
**CONFIGURAÇÃO OFFLINE:**
1. No Kaggle, Settings → Internet → **OFF**
2. Tempo estimado: **< 5 min** (vs 1h+ sem SVD)
---

In [None]:
# Third-party stack: NumPy/pandas for data handling, scikit-learn for the
# TF-IDF -> SVD -> scaling feature pipeline, XGBoost for the classifier.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
import warnings
# NOTE(review): suppresses every Python warning from this point on, which
# also hides deprecation notices raised by the libraries above.
warnings.filterwarnings('ignore')

# Seed reused for NumPy, TruncatedSVD and the XGBoost classifier.
SEED = 42
# Number of dense components kept by TruncatedSVD; the TF-IDF matrix that
# feeds it is capped at 10,000 columns.
SVD_COMPONENTS = 500
# Kaggle input directory that holds train.csv and test.csv.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Seed NumPy's global random number generator.
np.random.seed(SEED)
print('Bibliotecas carregadas!')

In [None]:
# Load the training and test CSV files from the competition input directory.
train, test = (
    pd.read_csv(f'{DATA_DIR}/train.csv'),
    pd.read_csv(f'{DATA_DIR}/test.csv'),
)

# Print the (rows, columns) shape of each split, one per line.
print(f'Train: {train.shape}', f'Test: {test.shape}', sep='\n')

In [None]:
# TF-IDF over word unigrams and bigrams (same as before, but with fewer
# initial features).
#
# Missing reports (NaN) are replaced with empty strings first: TfidfVectorizer
# raises ValueError on a NaN document, whereas an empty string just produces
# an all-zero row. When no report is missing this is a no-op.
train_reports = train['report'].fillna('')
test_reports = test['report'].fillna('')

tfidf = TfidfVectorizer(
    max_features=10000,  # vocabulary cap (reduced from 15k)
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,  # ignore terms present in fewer than 2 documents
    max_df=0.95,  # ignore terms present in more than 95% of documents
    sublinear_tf=True,  # replace tf with 1 + log(tf)
)

# Vocabulary and IDF weights are learned from the training reports only and
# then applied unchanged to the test reports.
X_train_tfidf = tfidf.fit_transform(train_reports)
X_test_tfidf = tfidf.transform(test_reports)

print(f'TF-IDF shape (esparso): {X_train_tfidf.shape}')
# Sparsity = percentage of cells in the training matrix that are not stored.
print(f'Sparsity: {100 * (1 - X_train_tfidf.nnz / (X_train_tfidf.shape[0] * X_train_tfidf.shape[1])):.2f}%')

In [None]:
# TruncatedSVD (LSA) - dimensionality reduction of the sparse TF-IDF matrix.
#
# The requested size is clamped to the number of TF-IDF columns: with its
# default randomized solver TruncatedSVD raises ValueError when n_components
# exceeds n_features, which can happen if min_df/max_df pruning leaves a
# vocabulary smaller than SVD_COMPONENTS.
n_tfidf_features = X_train_tfidf.shape[1]
n_svd_components = min(SVD_COMPONENTS, n_tfidf_features)
print(f'Aplicando SVD: {n_tfidf_features} → {n_svd_components} features...')

svd = TruncatedSVD(n_components=n_svd_components, random_state=SEED)

# The projection is fitted on the training matrix only and reused for test.
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

# Sum of the per-component explained-variance ratios.
print(f'Variância explicada: {svd.explained_variance_ratio_.sum():.2%}')
print(f'Shape final (denso): {X_train_svd.shape}')

In [None]:
# Training labels as a plain array.
y_train = train['target'].values

# Standardise the SVD features (important for some models). The scaler is
# fitted on the training matrix only and then applied to both splits.
scaler = StandardScaler().fit(X_train_svd)
X_train_scaled = scaler.transform(X_train_svd)
X_test_scaled = scaler.transform(X_test_svd)

print('Features normalizadas!')

In [None]:
# Per-sample weights in 'balanced' mode, to compensate for imbalanced classes.
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# XGBoost classifier trained on the dense, standardised SVD features (the
# notebook's stated motivation for the SVD step is training speed).
xgb_params = dict(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=SEED,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='mlogloss',
)
model = xgb.XGBClassifier(**xgb_params)

print('Treinando XGBoost...')
model.fit(X_train_scaled, y_train, sample_weight=sample_weights)
print('Modelo treinado!')

In [None]:
# Predict a class for every test row and write the submission file.
predictions = model.predict(X_test_scaled)

# Two-column frame: the test IDs plus the predicted class.
submission = pd.DataFrame({'ID': test['ID']})
submission['target'] = predictions

submission.to_csv('submission.csv', index=False)

print('submission.csv criado!')
# Count of predictions per class, ordered by class label.
class_counts = submission['target'].value_counts().sort_index()
print(class_counts)