# SPR 2026 - TF-IDF + LightGBM (OTIMIZADO)

**Otimizações aplicadas:**
1. ✅ TruncatedSVD para reduzir dimensionalidade (até 10k features esparsas de TF-IDF → 500 features densas)
2. ✅ Hiperparâmetros mais leves (menos árvores, menos folhas)
3. ✅ Aceleração por GPU (device='gpu', usada apenas quando uma GPU é detectada)
4. ✅ Early stopping

**Tempo esperado:** ~2-5 min (vs 30min+ sem otimização)

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Settings → Accelerator → **GPU T4 x2** (para GPU) ou **CPU** (mais lento)
---

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import torch
import warnings
warnings.filterwarnings('ignore')

# --- Global configuration ---------------------------------------------------
# Location of the competition CSV files.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# One seed reused for NumPy, TruncatedSVD, the train/validation split and LightGBM.
SEED = 42
# Number of dense components kept after TruncatedSVD.
SVD_COMPONENTS = 500
# GPU detection is delegated to PyTorch; the flag later selects LightGBM's device.
USE_GPU = torch.cuda.is_available()

np.random.seed(SEED)

print('Bibliotecas carregadas!', f'GPU disponível: {USE_GPU}', sep='\n')

In [None]:
# Load the competition train and test tables from DATA_DIR.
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

# Report (rows, columns) of each table as a quick sanity check.
print(f'Train: {train.shape}', f'Test: {test.shape}', sep='\n')

In [None]:
# Feature pipeline: TF-IDF (sparse) -> TruncatedSVD (dense) -> StandardScaler.
# Every transformer is fitted on the training reports only and then applied
# to the test reports.

# Missing reports are replaced by empty strings: TfidfVectorizer raises a
# ValueError on NaN documents, so a single blank cell would abort the run.
train_reports = train['report'].fillna('').astype(str)
test_reports = test['report'].fillna('').astype(str)

# TF-IDF vectorizer (kept small to speed up the SVD below)
tfidf = TfidfVectorizer(
    max_features=10000,  # reduced from 20k
    ngram_range=(1, 2),  # reduced from (1,3)
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

X_train_tfidf = tfidf.fit_transform(train_reports)
X_test_tfidf = tfidf.transform(test_reports)
y_train = train['target'].values

print(f'TF-IDF shape (esparso): {X_train_tfidf.shape}')

# TruncatedSVD turns the sparse matrix into dense features.
# The component count is clamped below the fitted vocabulary size, because
# TruncatedSVD rejects n_components larger than the number of input features;
# with a large vocabulary this is simply SVD_COMPONENTS.
n_components = min(SVD_COMPONENTS, X_train_tfidf.shape[1] - 1)
print(f'\nAplicando SVD: {X_train_tfidf.shape[1]} → {n_components} features...')
svd = TruncatedSVD(n_components=n_components, random_state=SEED)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

print(f'Variância explicada: {svd.explained_variance_ratio_.sum():.2%}')
print(f'Shape final (denso): {X_train_svd.shape}')

# Standardize each SVD component using statistics from the training set
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_svd)
X_test = scaler.transform(X_test_svd)

In [None]:
# Hold out 10% of the training rows (stratified by class) as the validation
# set that drives early stopping.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=SEED, stratify=y_train
)

# LightGBM with deliberately light hyperparameters
model = lgb.LGBMClassifier(
    n_estimators=300,        # upper bound on trees; early stopping may stop sooner
    max_depth=8,             # reduced from 12
    learning_rate=0.1,       # increased
    num_leaves=31,           # reduced from 64
    min_child_samples=20,
    subsample=0.8,
    # Row subsampling only takes effect when the bagging frequency is non-zero;
    # with the default of 0, subsample=0.8 above would be silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    class_weight='balanced',
    device='gpu' if USE_GPU else 'cpu',
    random_state=SEED,
    n_jobs=-1,
    verbose=-1
)

print(f'Usando: {"GPU" if USE_GPU else "CPU"}')
print('Treinando LightGBM...')

# Stop once the validation metric has not improved for 30 rounds and log the
# metric every 50 rounds.
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=30, verbose=True),
        lgb.log_evaluation(period=50)
    ]
)

print(f'Melhor iteração: {model.best_iteration_}')
print('Modelo treinado!')

In [None]:
# Predict a class for every test report and write the submission file.
predictions = model.predict(X_test)

# Keep the test IDs and attach the predicted class as the 'target' column.
submission = test[['ID']].assign(target=predictions)
submission.to_csv('submission.csv', index=False)

print('submission.csv criado!')

# Distribution of predicted classes, ordered by class label.
class_counts = submission['target'].value_counts().sort_index()
print(class_counts)