# SPR 2026 - Ensemble v3 (SGD v3 + LinearSVC + LogReg Blend)

**Objetivo:** Combinar os 3 melhores modelos TF-IDF com pesos baseados em performance

**Modelos no blend:**
- SGDClassifier v3: 0.77036 (único que melhorou!)
- LinearSVC: 0.77885 (baseline estável)
- LogisticRegression: 0.72935 (baseline estável)

**Estratégia:** Weighted soft voting com pesos definidos manualmente a partir dos scores públicos (SGD 0.40, SVC 0.35, LR 0.25)

**Meta:** Alcançar 0.79+ F1-Macro

---

**CONFIGURAÇÃO KAGGLE:** Internet OFF

---

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from scipy.stats import loguniform
import warnings
warnings.filterwarnings('ignore')

# Opening banner for the run log.
print("=" * 60, "SPR 2026 - Ensemble v3 (SGD + SVC + LogReg Blend)", "=" * 60, sep="\n")

# Seed NumPy's global RNG; SEED is also the constant the model cells reuse.
SEED = 42
np.random.seed(SEED)

# Kaggle input directory that holds the competition CSV files.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# ========== ENSEMBLE WEIGHTS ==========
# Based on the public scores
WEIGHT_SGD = 0.40      # 0.77036 - the only model that improved
WEIGHT_SVC = 0.35      # 0.77885 - highest baseline
WEIGHT_LR = 0.25       # 0.72935 - baseline
# ======================================

print("Pesos: SGD={}, SVC={}, LR={}".format(WEIGHT_SGD, WEIGHT_SVC, WEIGHT_LR))

# Data: read the train split first, then the test split.
print("\n[1/6] Carregando dados...")
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)
print("Train: {} | Test: {}".format(train.shape, test.shape))

# Auto-detect column names
def find_col(df, candidates):
    """Return the first column of *df* that matches one of *candidates*.

    Candidates are tried in order. For each candidate an exact match is
    preferred; failing that, the first column whose name equals the
    candidate case-insensitively is returned.

    Args:
        df: Object exposing a ``columns`` collection (e.g. a DataFrame).
        candidates: Iterable of column names to look for, in priority order.

    Returns:
        The matching column label exactly as it appears in ``df.columns``,
        or ``None`` when no candidate matches.
    """
    for candidate in candidates:
        if candidate in df.columns:
            return candidate
        # Case-insensitive fallback only makes sense for string names.
        if not isinstance(candidate, str):
            continue
        wanted = candidate.lower()
        for col in df.columns:
            # Non-string labels (e.g. integer columns) have no .lower();
            # skip them instead of raising AttributeError.
            if isinstance(col, str) and col.lower() == wanted:
                return col
    return None

# Resolve the text, label and id column names from lists of common aliases.
TEXT_COL = find_col(train, ['report', 'text', 'laudo', 'texto', 'content'])
LABEL_COL = find_col(train, ['target', 'label', 'birads', 'classe', 'class'])
ID_COL = find_col(test, ['ID', 'id', 'Id', 'index', 'idx'])
print(f"Colunas: texto={TEXT_COL}, label={LABEL_COL}, id={ID_COL}")

# Fail fast: find_col returns None when nothing matches, and indexing a
# DataFrame with None would otherwise only fail later with an opaque KeyError
# (for the id column, not until every model has already been trained).
missing_cols = [
    role
    for role, col in (('texto', TEXT_COL), ('label', LABEL_COL), ('id', ID_COL))
    if col is None
]
if missing_cols:
    raise ValueError(
        f"Colunas não encontradas: {', '.join(missing_cols)}. "
        f"Train: {list(train.columns)} | Test: {list(test.columns)}"
    )

In [None]:
# TF-IDF
print("\n[2/6] TF-IDF...")
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

# A missing report is read by pandas as NaN, which the vectorizer rejects as
# an invalid document. Treat missing reports as empty strings so every row
# still gets a feature vector (and, for the test set, a prediction).
train_text = train[TEXT_COL].fillna('').astype(str)
test_text = test[TEXT_COL].fillna('').astype(str)

# The vocabulary and IDF weights are learned on the training texts only; the
# test texts are projected onto that same vocabulary.
X_train = tfidf.fit_transform(train_text)
X_test = tfidf.transform(test_text)
y_train = train[LABEL_COL].values
print(f"Shape: {X_train.shape}")

In [None]:
# Model 1: SGDClassifier (the v3 configuration that worked)
print("\n[3/6] Treinando SGDClassifier (config v3)...")

# Hyper-parameters gathered in one place; log-loss objective with an
# elastic-net penalty, balanced class weights and early stopping on a 10%
# validation split.
sgd_params = dict(
    loss='log_loss',
    penalty='elasticnet',
    alpha=0.0001,
    l1_ratio=0.3,
    class_weight='balanced',
    random_state=SEED,
    max_iter=2000,
    early_stopping=True,
    validation_fraction=0.1,
    n_jobs=-1,
)
sgd = SGDClassifier(**sgd_params)
sgd.fit(X_train, y_train)

# Class probabilities for the test set, consumed by the ensemble step.
sgd_proba = sgd.predict_proba(X_test)
print("  SGD treinado! Classes: {}".format(sgd.classes_))

In [None]:
# Model 2: LinearSVC with calibration (to obtain probabilities)
print("\n[4/6] Treinando LinearSVC + Calibração...")

# Base margin classifier; it is wrapped in a calibrator below because the
# ensemble step needs predict_proba output.
svc_base = LinearSVC(C=1.0, class_weight='balanced', random_state=SEED, max_iter=5000)

# Sigmoid calibration with 3 cross-validation folds.
svc = CalibratedClassifierCV(svc_base, cv=3, method='sigmoid').fit(X_train, y_train)

# Class probabilities for the test set, consumed by the ensemble step.
svc_proba = svc.predict_proba(X_test)
print("  SVC treinado! Classes: {}".format(svc.classes_))

In [None]:
# Model 3: LogisticRegression
print("\n[5/6] Treinando LogisticRegression...")

# Hyper-parameters gathered in one place; balanced class weights, lbfgs solver.
lr_params = dict(
    C=1.0,
    class_weight='balanced',
    random_state=SEED,
    max_iter=2000,
    solver='lbfgs',
    n_jobs=-1,
)
lr = LogisticRegression(**lr_params).fit(X_train, y_train)

# Class probabilities for the test set, consumed by the ensemble step.
lr_proba = lr.predict_proba(X_test)
print("  LR treinado! Classes: {}".format(lr.classes_))

In [None]:
# Ensemble: weighted soft voting
print("\n[6/6] Ensemble - Weighted Soft Voting...")

# The blend adds the probability matrices column by column and decodes the
# winner with sgd.classes_, so column j must mean the same class in all three
# models. Verify that explicitly instead of relying on it silently.
classes = sgd.classes_
if not all(np.array_equal(model.classes_, classes) for model in (svc, lr)):
    raise ValueError(
        "Modelos com ordenação de classes diferente; "
        "não é possível combinar as probabilidades."
    )

# Combine the per-model class probabilities using the configured weights.
ensemble_proba = (
    WEIGHT_SGD * sgd_proba +
    WEIGHT_SVC * svc_proba +
    WEIGHT_LR * lr_proba
)

# Final prediction: the class with the highest blended probability per row.
predictions = classes[np.argmax(ensemble_proba, axis=1)]

# Submission: take the output column names from sample_submission.csv when it
# exists; otherwise fall back to the detected id/label column names.
sample_path = f'{DATA_DIR}/sample_submission.csv'
if os.path.exists(sample_path):
    sample_sub = pd.read_csv(sample_path)
    SUB_ID = sample_sub.columns[0]
    SUB_LABEL = sample_sub.columns[1]
else:
    SUB_ID = ID_COL
    SUB_LABEL = LABEL_COL

submission = pd.DataFrame({SUB_ID: test[ID_COL], SUB_LABEL: predictions})
submission.to_csv('/kaggle/working/submission.csv', index=False)

print("="*60)
print("Ensemble v3 CONCLUÍDO!")
print("="*60)
print(f"\nPesos: SGD={WEIGHT_SGD}, SVC={WEIGHT_SVC}, LR={WEIGHT_LR}")
print("\nDistribuição das predições:")
print(submission[SUB_LABEL].value_counts().sort_index())
print("\n✅ submission.csv criado!")