# SPR 2026 - Ensemble Soft Voting v2

**Melhorias sobre v1 (0.78049):**

- ✅ Adiciona SGDClassifier ao ensemble (3 modelos)
- ✅ Pesos otimizados baseados em F1-Score público
- ✅ TF-IDF com mais features (25k)
- ⏳ Grid search de pesos (planejado; neste notebook os pesos são apenas os scores públicos normalizados)

**Objetivo: superar 0.78049**

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. **IMPORTANTE:** Execute "Run All" após commit
---

In [None]:
# =============================================================================
# SPR 2026 - ENSEMBLE SOFT VOTING v2
# =============================================================================
# Melhorias:
# - 3 modelos: LogReg + LinearSVC + SGDClassifier
# - Pesos baseados nos scores públicos
# - TF-IDF otimizado
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

# Seed passed as random_state to every estimator configured in this notebook.
SEED = 42
# Directory that holds the competition's train.csv and test.csv files.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Progress marker: step 1 of 6.
print('[1/6] Bibliotecas carregadas!')

In [None]:
# =============================================================================
# LOAD DATA
# =============================================================================
# Resolve both split paths first, then read them into DataFrames.
train_path = f'{DATA_DIR}/train.csv'
test_path = f'{DATA_DIR}/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Report the split shapes, followed by the row count of each target label
# (sorted by label value rather than by frequency).
print(f'[2/6] Train: {train_df.shape} | Test: {test_df.shape}')
print('\nDistribuição de classes:')
class_counts = train_df['target'].value_counts().sort_index()
print(class_counts)

In [None]:
# =============================================================================
# OPTIMIZED TF-IDF
# =============================================================================
# Word-level 1- to 3-grams with a larger vocabulary to capture more context.
tfidf = TfidfVectorizer(
    max_features=25000,      # raised from 20k
    ngram_range=(1, 3),
    min_df=2,                # ignore terms found in fewer than 2 reports
    max_df=0.95,             # ignore terms found in more than 95% of reports
    sublinear_tf=True,       # use 1 + log(tf) instead of raw term counts
    strip_accents='unicode',
    analyzer='word',
    # Tokens are either purely alphabetic words or standalone digit runs.
    # NOTE(review): accents are stripped before tokenization, so the accented
    # letters in the character class should never match — confirm before
    # simplifying the pattern.
    token_pattern=r'\b[a-záàâãéèêíïóôõúüçñ]+\b|\d+',
)

# TfidfVectorizer raises on NaN documents, so a single empty report (in train
# or in the hidden test set) would abort the run. Treat missing text as an
# empty document and force everything to str; clean data is unaffected.
train_text = train_df['report'].fillna('').astype(str)
test_text = test_df['report'].fillna('').astype(str)

# Vocabulary and IDF weights are learned from the training reports only; the
# test reports are projected onto that same vocabulary.
X_train = tfidf.fit_transform(train_text)
X_test = tfidf.transform(test_text)
y_train = train_df['target'].values
print(f'[3/6] TF-IDF shape: {X_train.shape}')

In [None]:
# =============================================================================
# INDIVIDUAL MODELS
# =============================================================================
# Known public scores, used directly as the raw voting weight of each model:
#   LinearSVC      0.77885
#   SGDClassifier  0.75019
#   LogReg         0.72935
w_svc, w_sgd, w_lr = 0.77885, 0.75019, 0.72935
total = w_svc + w_sgd + w_lr

# Rescale the raw scores so the weights add up to 1 (order: SVC, SGD, LR).
weights = [raw / total for raw in (w_svc, w_sgd, w_lr)]
print(f'Pesos normalizados: SVC={weights[0]:.3f}, SGD={weights[1]:.3f}, LR={weights[2]:.3f}')

# Base estimators
# Settings common to all three linear models.
shared_kwargs = dict(max_iter=2000, class_weight='balanced', random_state=SEED)

# The two hinge-loss models are wrapped in 3-fold CalibratedClassifierCV so
# they expose class probabilities.
linear_svc = LinearSVC(C=1.0, **shared_kwargs)
svc = CalibratedClassifierCV(linear_svc, cv=3)

hinge_sgd = SGDClassifier(loss='hinge', alpha=1e-4, **shared_kwargs)
sgd = CalibratedClassifierCV(hinge_sgd, cv=3)

# Logistic regression is used as-is, without a calibration wrapper.
lr = LogisticRegression(C=1.0, solver='lbfgs', **shared_kwargs)

print('[4/6] Modelos configurados!')

In [None]:
# =============================================================================
# VOTING CLASSIFIER
# =============================================================================
# Imported here so this cell brings in everything new that it needs.
from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# Soft voting: the weighted average of each model's predicted probabilities
# decides the class. `weights` follows the estimator order (SVC, SGD, LR).
ensemble = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('sgd', sgd),
        ('lr', lr),
    ],
    voting='soft',
    weights=weights,
)

# Cross-validation to estimate performance.
# The vectorizer is cloned and refitted inside every fold on the raw text.
# Scoring the pre-computed X_train instead would let vocabulary and IDF
# statistics from the validation fold leak into training and inflate the
# estimate.
print('Cross-validation (3-fold)...')
cv_pipeline = make_pipeline(clone(tfidf), ensemble)
cv_text = train_df['report'].fillna('').astype(str)
cv_scores = cross_val_score(cv_pipeline, cv_text, y_train, cv=3, scoring='f1_macro', n_jobs=-1)
print(f'CV F1-Macro: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})')

# Train on the full dataset, reusing the TF-IDF matrix fitted on all of train.
ensemble.fit(X_train, y_train)
print('[5/6] Ensemble treinado (SVC + SGD + LR)!')

In [None]:
# =============================================================================
# SUBMISSION
# =============================================================================
# Predicted class label for every test row.
predictions = ensemble.predict(X_test)

# Two-column frame: the test IDs plus the predicted target.
submission = test_df[['ID']].assign(target=predictions)

# Write the file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

print('[6/6] ✅ CONCLUÍDO: submission.csv')
print('\nDistribuição das predições:')
prediction_counts = submission['target'].value_counts().sort_index()
print(prediction_counts)