# SPR 2026 - Ensemble v4 (Top TF-IDF + Tratamento + SMOTE)

**Scores dos modelos base:**
- LinearSVC: 0.77885
- SGDClassifier v3: 0.77036
- LogisticRegression: 0.72935

## Melhorias v4:
1. **Tratamento de dados:**
   - Normalização de termos médicos
   - Features para negações

2. **Aumentação:**
   - SMOTE para classes 5 e 6

3. **Ensemble:**
   - Weighted soft voting
   - Pesos fixos, ordenados pelo score público de cada modelo (0.40 / 0.35 / 0.25)
   - CalibratedClassifierCV para probabilidades

**Meta:** Superar 0.79+ F1-Macro

---
## CONFIGURAÇÃO KAGGLE:
1. **Add Input** → **Competition** → `spr-2026-mammography-report-classification`
2. **Settings** → Internet → **OFF**
---

In [None]:
# ===== SPR 2026 - ENSEMBLE v4 =====

import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

# Banner: rule, title, rule (one call, one line each).
print(
    "=" * 60,
    "SPR 2026 - Ensemble v4 (Top TF-IDF + Tratamento + SMOTE)",
    "=" * 60,
    sep="\n",
)

SEED = 42
DATA_DIR = '/kaggle/input/competitions/spr-2026-mammography-report-classification'

# ========== CHECK THE DATASET FIRST ==========
# Stop immediately, with setup instructions, when the competition data
# directory does not exist.
if not os.path.exists(DATA_DIR):
    print(
        "\n" + "=" * 60,
        "ERRO: Dataset não encontrado!",
        "=" * 60,
        "\nAdicione o dataset:",
        "Add Input -> Competition -> spr-2026-mammography-report-classification",
        sep="\n",
    )
    raise FileNotFoundError(f"Dataset não encontrado: {DATA_DIR}")
print(f"Dataset: {DATA_DIR}")

np.random.seed(SEED)

# ========== ENSEMBLE WEIGHTS ==========
# Hand-set from the public score noted beside each entry; they sum to 1.
WEIGHTS = dict(
    linearsvc=0.40,  # 0.77885 - best TF-IDF model
    sgd=0.35,        # 0.77036 - the only one that improved
    logreg=0.25,     # 0.72935 - kept for diversity
)
print(f"Pesos: {WEIGHTS}")

USE_SMOTE = True

In [None]:
# ========== FUNÇÕES DE TRATAMENTO ==========
print("\n[1/7] Funções de tratamento...")

def clean_text(text):
    """Return *text* as a trimmed, single-spaced string without control characters.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``None`` or
    NaN) become the empty string. Any other value is converted with ``str``.
    """
    if pd.isna(text):
        return ""
    # Collapse every whitespace run first so that newlines and tabs become
    # word separators instead of being deleted by the control-character pass.
    text = re.sub(r'\s+', ' ', str(text))
    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)
    # Deleting a control character that sat between two spaces leaves a
    # double space behind, so collapse spaces once more.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Ordered (pattern, replacement) rules used by normalize_medical_terms.
# Order matters: the second BI-RADS rule relies on the first one having
# already rewritten every spelling to "BIRADS".
# Every accented letter is optional so reports typed without diacritics
# ("distorcao", "ausencia", "nao") normalise to the same token as the
# accented spelling.
_MEDICAL_TERM_RULES = [
    # BI-RADS
    (re.compile(r'bi-?rads?', re.IGNORECASE), 'BIRADS'),
    (re.compile(r'birads\s*(\d)', re.IGNORECASE), r'BIRADS_\1'),
    # Quadrants
    (re.compile(r'qse|quadrante\s*superior\s*externo', re.IGNORECASE), 'QSE'),
    (re.compile(r'qsi|quadrante\s*superior\s*interno', re.IGNORECASE), 'QSI'),
    (re.compile(r'qie|quadrante\s*inferior\s*externo', re.IGNORECASE), 'QIE'),
    (re.compile(r'qii|quadrante\s*inferior\s*interno', re.IGNORECASE), 'QII'),
    # Findings
    (re.compile(r'micro-?calcifica[çc][õo]es', re.IGNORECASE), 'MICROCALCIFICACOES'),
    (re.compile(r'n[oó]dulo', re.IGNORECASE), 'NODULO'),
    (re.compile(r'assimetria\s*focal', re.IGNORECASE), 'ASSIMETRIA_FOCAL'),
    (re.compile(r'distor[çc][ãa]o\s*arquitetural', re.IGNORECASE), 'DISTORCAO_ARQUITETURAL'),
    # Negation phrases, all mapped to one shared marker token
    (
        re.compile(
            r'aus[êe]ncia\s*de|sem\s*evid[êe]ncia\s*de|n[ãa]o\s*se\s*observa',
            re.IGNORECASE,
        ),
        'NEGACAO_',
    ),
]


def normalize_medical_terms(text):
    """Rewrite known mammography terms in *text* to canonical tokens.

    Applies the rules in ``_MEDICAL_TERM_RULES`` in order (BI-RADS spellings,
    quadrant names, findings, negation phrases) and returns the result
    lower-cased, so the upper-case replacements end up lower-case in the
    output (e.g. ``"BI-RADS 4"`` -> ``"birads_4"``).

    Args:
        text: The string to normalise.

    Returns:
        The normalised, lower-cased string.
    """
    for pattern, replacement in _MEDICAL_TERM_RULES:
        text = pattern.sub(replacement, text)
    return text.lower()

def preprocess_text(text):
    """Run *text* through ``clean_text`` and then ``normalize_medical_terms``."""
    return normalize_medical_terms(clean_text(text))

print("Funções definidas!")

In [None]:
# ========== LOAD DATA ==========
print("\n[2/7] Carregando dados...")

# Read train.csv, then test.csv, from the competition directory.
train, test = (
    pd.read_csv(f'{DATA_DIR}/{filename}')
    for filename in ('train.csv', 'test.csv')
)
print(f"Train: {train.shape} | Test: {test.shape}")

# Auto-detectar colunas
def find_col(df, candidates):
    """Return the first name in *candidates* that is a column of *df*.

    Candidates are tried in the order given; ``None`` is returned when none
    of them is present in ``df.columns``.
    """
    return next((name for name in candidates if name in df.columns), None)

TEXT_COL = find_col(train, ['report', 'text', 'laudo'])
LABEL_COL = find_col(train, ['target', 'label', 'birads'])
ID_COL = find_col(test, ['ID', 'id', 'Id'])
print(f"Colunas: texto={TEXT_COL}, label={LABEL_COL}, id={ID_COL}")

# find_col returns None when no candidate matches. Fail here with a clear
# message instead of letting a later DataFrame lookup raise "KeyError: None".
_undetected = [
    role
    for role, column in (('texto', TEXT_COL), ('label', LABEL_COL), ('id', ID_COL))
    if column is None
]
if _undetected:
    raise KeyError(
        f"Colunas não encontradas: {_undetected} | "
        f"train={list(train.columns)} | test={list(test.columns)}"
    )
# The text column is detected on train but is also read from test below.
if TEXT_COL not in test.columns:
    raise KeyError(f"Coluna de texto '{TEXT_COL}' ausente no test: {list(test.columns)}")

# Preprocessing
train['text_processed'] = train[TEXT_COL].apply(preprocess_text)
test['text_processed'] = test[TEXT_COL].apply(preprocess_text)
print("Tratamento aplicado!")

In [None]:
# ========== TF-IDF ==========
print("\n[3/7] TF-IDF...")

# Vectoriser settings kept in one place: unigrams + bigrams, capped
# vocabulary, document-frequency cut-offs and sublinear term frequency.
tfidf_settings = {
    'max_features': 20000,
    'ngram_range': (1, 2),
    'min_df': 2,
    'max_df': 0.95,
    'sublinear_tf': True,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# The vocabulary is learned from the training texts only; the test texts
# are projected onto it.
train_docs = train['text_processed']
test_docs = test['text_processed']
X_train = tfidf.fit_transform(train_docs)
X_test = tfidf.transform(test_docs)
y_train = train[LABEL_COL].values
print(f"Shape: {X_train.shape}")

In [None]:
# ========== SMOTE ==========
print("\n[4/7] SMOTE...")

# Default: train on the original data. Replaced below only when SMOTE runs.
X_train_final, y_train_final = X_train, y_train

if USE_SMOTE:
    # Keep the try body to the one statement that can raise ImportError, so
    # genuine SMOTE errors are not mistaken for a missing package.
    try:
        from imblearn.over_sampling import SMOTE
    except ImportError:
        print("imblearn não disponível")
    else:
        SMOTE_CLASSES = (5, 6)   # minority classes to oversample
        SMOTE_TARGET = 500       # desired sample count per oversampled class
        SMOTE_MAX_NEIGHBORS = 3  # upper bound for k_neighbors

        labels, counts = np.unique(y_train, return_counts=True)
        class_counts = dict(zip(labels.tolist(), counts.tolist()))

        # Only request classes SMOTE can actually handle: present in the data,
        # with at least 2 samples (it interpolates between neighbours) and
        # still below the target. Per the imbalanced-learn docs, fit_resample
        # otherwise raises ValueError (absent class, or target below count).
        strategy = {
            cls: SMOTE_TARGET
            for cls in SMOTE_CLASSES
            if 2 <= class_counts.get(cls, 0) < SMOTE_TARGET
        }

        if strategy:
            # k_neighbors must be smaller than the smallest resampled class.
            smallest = min(class_counts[cls] for cls in strategy)
            k_neighbors = min(SMOTE_MAX_NEIGHBORS, smallest - 1)
            smote = SMOTE(sampling_strategy=strategy, random_state=SEED, k_neighbors=k_neighbors)
            X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
            print(f"SMOTE: {X_train.shape[0]} -> {X_train_smote.shape[0]} amostras")
            X_train_final, y_train_final = X_train_smote, y_train_smote
        else:
            print(f"SMOTE ignorado: nenhuma classe elegível em {SMOTE_CLASSES} (contagens: {class_counts})")

In [None]:
# ========== TRAIN MODELS ==========
print("\n[5/7] Treinando modelos...")

# Settings shared by all three base estimators.
common_kwargs = {'class_weight': 'balanced', 'random_state': SEED}

# LinearSVC wrapped in a sigmoid calibrator (5-fold) so that it can provide
# predict_proba for the soft-voting step.
print("LinearSVC...")
svc_base = LinearSVC(C=1.0, max_iter=5000, dual=True, **common_kwargs)
svc = CalibratedClassifierCV(svc_base, cv=5, method='sigmoid')
svc.fit(X_train_final, y_train_final)

# SGDClassifier (v3 configuration): logistic loss with elastic-net penalty
# and early stopping on a 10% validation split.
print("SGDClassifier...")
sgd_kwargs = {
    'loss': 'log_loss',
    'penalty': 'elasticnet',
    'alpha': 0.0001,
    'l1_ratio': 0.3,
    'max_iter': 2000,
    'early_stopping': True,
    'validation_fraction': 0.1,
    'n_jobs': -1,
}
sgd = SGDClassifier(**sgd_kwargs, **common_kwargs)
sgd.fit(X_train_final, y_train_final)

# LogisticRegression
print("LogisticRegression...")
logreg = LogisticRegression(C=1.0, max_iter=2000, n_jobs=-1, **common_kwargs)
logreg.fit(X_train_final, y_train_final)

print("Todos os modelos treinados!")

In [None]:
# ========== ENSEMBLE VOTING ==========
print("\n[6/7] Ensemble soft voting...")

# Per-model class probabilities for the test set
probs_svc = svc.predict_proba(X_test)
probs_sgd = sgd.predict_proba(X_test)
probs_logreg = logreg.predict_proba(X_test)

# predict_proba columns follow each estimator's classes_ order. The weighted
# sum below is only meaningful when all three models agree on that order.
class_labels = svc.classes_
if not all(np.array_equal(class_labels, model.classes_) for model in (sgd, logreg)):
    raise ValueError("Os modelos do ensemble não compartilham a mesma ordem de classes")

# Weighted average
probs_ensemble = (
    WEIGHTS['linearsvc'] * probs_svc +
    WEIGHTS['sgd'] * probs_sgd +
    WEIGHTS['logreg'] * probs_logreg
)

# argmax yields a column index, not a label. Map it through classes_ so the
# output is correct even when the labels are not exactly 0..n_classes-1.
predictions = class_labels[np.argmax(probs_ensemble, axis=1)]
print(f"Predições geradas: {len(predictions)}")

In [None]:
# ========== SUBMISSION ==========
print("\n[7/7] Gerando submission...")

# Two-column frame: the test ids next to the ensemble predictions, with the
# prediction column named after the training label column.
submission_path = 'submission.csv'
submission_columns = {ID_COL: test[ID_COL], LABEL_COL: predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv(submission_path, index=False)

print(f"\nSubmission salva: {submission_path}")
print(submission.head())

# Predicted-class counts, ordered by class, as a quick sanity check.
prediction_counts = pd.Series(predictions).value_counts().sort_index()
print("\nDistribuição:")
print(prediction_counts)