# SPR 2026 - Ensemble Ponderado (Soft Voting)

**VotingClassifier com pesos ajustáveis**

- ✅ Logistic Regression + LinearSVC
- ✅ Soft voting com probabilidades
- ✅ Tempo esperado: ~3-5 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. **IMPORTANTE:** Execute "Run All" após commit
---

In [None]:
# =============================================================================
# SPR 2026 - ENSEMBLE PONDERADO (SOFT VOTING)
# =============================================================================
# - Logistic Regression + LinearSVC
# - VotingClassifier com soft voting
# - Pesos ajustáveis baseados em F1-Score
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

# Global settings: the Kaggle input folder for this competition and the seed
# that is passed as random_state to the models further down.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
SEED = 42

print('[1/5] Bibliotecas carregadas!')

# =============================================================================
# LOAD DATA
# =============================================================================
# Read train.csv first, then test.csv, both from DATA_DIR.
train_df, test_df = (
    pd.read_csv(f'{DATA_DIR}/{split_name}.csv')
    for split_name in ('train', 'test')
)
print(f'[2/5] Train: {train_df.shape} | Test: {test_df.shape}')

# =============================================================================
# TF-IDF
# =============================================================================
# TF-IDF features over 1- to 3-grams, capped at 20,000 terms. Terms seen in
# fewer than 2 documents or in more than 95% of documents are dropped, and
# sublinear_tf replaces the raw term frequency with 1 + log(tf).
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

# pandas reads empty CSV cells as NaN and TfidfVectorizer raises ValueError on
# NaN documents, so blank reports are mapped to empty strings first. Rows that
# already hold text are unaffected.
train_text = train_df['report'].fillna('')
test_text = test_df['report'].fillna('')

# The vocabulary and IDF weights are learned from the training reports only;
# the test reports are transformed with that fitted vocabulary.
X_train = tfidf.fit_transform(train_text)
X_test = tfidf.transform(test_text)
y_train = train_df['target'].values
print(f'[3/5] TF-IDF shape: {X_train.shape}')

# =============================================================================
# VOTING CLASSIFIER
# =============================================================================
# Hyperparameters shared by both linear base models.
_linear_params = dict(C=1.0, max_iter=1000, class_weight='balanced', random_state=SEED)

lr = LogisticRegression(**_linear_params)
# LinearSVC does not expose predict_proba; wrapping it in
# CalibratedClassifierCV provides the probabilities that soft voting averages.
svc = CalibratedClassifierCV(LinearSVC(**_linear_params))

# (name, estimator) pairs and their voting weights, listed in the same order.
_members = [('lr', lr), ('svc', svc)]
_member_weights = [0.6, 0.4]  # tune according to each model's F1-score

ensemble = VotingClassifier(estimators=_members, voting='soft', weights=_member_weights)
ensemble.fit(X_train, y_train)
print('[4/5] Ensemble treinado (LR + LinearSVC)!')

# =============================================================================
# SUBMISSION
# =============================================================================
# Predicted class label for every row of the test feature matrix.
predictions = ensemble.predict(X_test)

# Two-column frame: the test IDs followed by the predicted labels.
submission = pd.DataFrame({'ID': test_df['ID']}).assign(target=predictions)
submission.to_csv('submission.csv', index=False)

print('[5/5] ✅ CONCLUÍDO: submission.csv')
# Number of predictions per class, ordered by class label.
label_counts = submission['target'].value_counts()
print(label_counts.sort_index())