# SPR 2026 - SGDClassifier v5 (Tratamento + SMOTE + RandomizedSearch)

**Histórico:**
- v2: 0.72305
- v3: 0.77036 (ElasticNet realmente foi melhor!)

## Melhorias v5:
1. **Tratamento de dados:**
   - Normalização de termos médicos (BI-RADS, quadrantes)
   - Features para negações

2. **SMOTE:**
   - Oversampling das classes 5 e 6 (alvo: 500 amostras cada)

3. **RandomizedSearchCV:**
   - Busca de hiperparâmetros otimizada
   - Foco em `alpha` e `l1_ratio` (também alterna `loss` entre `log_loss` e `modified_huber`)

**Meta:** superar 0.78 de F1-Macro

---
## CONFIGURAÇÃO KAGGLE:
1. **Add Input** → **Competition** → `spr-2026-mammography-report-classification`
2. **Settings** → Internet → **OFF**
---

In [None]:
# ===== SPR 2026 - SGD v5 =====

import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer
from scipy.stats import loguniform
import warnings
warnings.filterwarnings('ignore')

# ----- Run banner -----
_RULE = "=" * 60
print(_RULE, "SPR 2026 - SGDClassifier v5", _RULE, sep="\n")

# ----- Global configuration -----
SEED = 42
DATA_DIR = '/kaggle/input/competitions/spr-2026-mammography-report-classification'

# ----- Fail fast when the competition data is not attached -----
# Every later cell reads from DATA_DIR, so stop here with instructions
# instead of failing later inside pandas.
if not os.path.exists(DATA_DIR):
    print(
        "\n" + _RULE,
        "ERRO: Dataset não encontrado!",
        _RULE,
        "\nAdicione o dataset:",
        "Add Input -> Competition -> spr-2026-mammography-report-classification",
        sep="\n",
    )
    raise FileNotFoundError(f"Dataset não encontrado: {DATA_DIR}")
print(f"Dataset: {DATA_DIR}")

# Seed NumPy's global RNG and enable the oversampling step.
np.random.seed(SEED)
USE_SMOTE = True

In [None]:
# ========== TEXT PREPROCESSING FUNCTIONS ==========
# Progress marker for step 1 of 6.
print("\n[1/6] Funções de tratamento...")

def clean_text(text):
    """Return *text* as a trimmed, single-spaced string without control characters.

    Args:
        text: Raw cell value. Missing values (anything ``pd.isna`` reports as
            null) yield an empty string; other values are converted with
            ``str``.

    Returns:
        The text with non-whitespace control characters (U+0000-U+001F and
        U+007F-U+009F) removed, every run of whitespace collapsed to a single
        space, and leading/trailing whitespace stripped.
    """
    if pd.isna(text):
        return ""
    text = str(text)
    # Remove control characters *before* collapsing whitespace; doing it in
    # the opposite order leaves a double space behind in inputs such as
    # "a \x00 b". The lookahead keeps whitespace-like controls (tab, newline,
    # ...) so they still become a word separator in the next step.
    text = re.sub(r'(?!\s)[\x00-\x1f\x7f-\x9f]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Ordered (compiled pattern, replacement) rules; all match case-insensitively.
# Order matters: the BI-RADS spelling is canonicalised before the rule that
# glues the category digit onto it.
_MEDICAL_TERM_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # BI-RADS
        (r'bi-?rads?', 'BIRADS'),
        (r'birads\s*(\d)', r'BIRADS_\1'),
        # Quadrants
        (r'qse|quadrante\s*superior\s*externo', 'QSE'),
        (r'qsi|quadrante\s*superior\s*interno', 'QSI'),
        (r'qie|quadrante\s*inferior\s*externo', 'QIE'),
        (r'qii|quadrante\s*inferior\s*interno', 'QII'),
        # Findings
        (r'micro-?calcifica[çc][õo]es', 'MICROCALCIFICACOES'),
        (r'n[oó]dulo', 'NODULO'),
        (r'assimetria\s*focal', 'ASSIMETRIA_FOCAL'),
        # Negations. Accent-less spellings are accepted, like the findings
        # rules above. The boundary after "de" stops the rule from eating the
        # start of a following word ("ausência densa"), and the verb suffix is
        # consumed so "não se observam" does not leave a stray "m" glued to
        # the marker.
        (
            r'\b(?:aus[êe]ncia\s*de\b'
            r'|sem\s*evid[êe]ncia\s*de\b'
            r'|n[ãa]o\s*se\s*observa\w*)',
            'NEGACAO_',
        ),
    )
)


def normalize_medical_terms(text):
    """Replace spelling variants of key report terms with canonical tokens.

    Args:
        text: Report text (a ``str``).

    Returns:
        The text, lower-cased, where BI-RADS mentions become ``birads`` /
        ``birads_<digit>``, quadrant names become ``qse``/``qsi``/``qie``/
        ``qii``, selected findings get an accent-free spelling, and negation
        phrases become the marker ``negacao_``.
    """
    for pattern, replacement in _MEDICAL_TERM_RULES:
        text = pattern.sub(replacement, text)
    return text.lower()

def preprocess_text(text):
    """Run the full text pipeline: cleaning, then medical-term normalisation."""
    return normalize_medical_terms(clean_text(text))

# Progress message emitted once the preprocessing helpers above are defined.
print("Funções definidas!")

In [None]:
# ========== LOAD DATA ==========
print("\n[2/6] Carregando dados...")

# Read train.csv and test.csv (in that order) from the competition folder.
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)
print(f"Train: {train.shape} | Test: {test.shape}")

# Column auto-detection
def find_col(df, candidates):
    """Return the first name in *candidates* that is a column of *df*.

    Candidates are tried in the order given; ``None`` is returned when none
    of them is present.
    """
    return next((name for name in candidates if name in df.columns), None)

TEXT_COL = find_col(train, ['report', 'text', 'laudo'])
LABEL_COL = find_col(train, ['target', 'label', 'birads'])
ID_COL = find_col(test, ['ID', 'id', 'Id'])
print(f"Colunas: texto={TEXT_COL}, label={LABEL_COL}, id={ID_COL}")

# find_col returns None when no candidate matches; indexing a DataFrame with
# None then fails with an opaque "KeyError: None". Report exactly which
# columns could not be detected, and what is available, instead.
_undetected = [
    role
    for role, col in (('texto', TEXT_COL), ('label', LABEL_COL), ('id', ID_COL))
    if col is None
]
if _undetected:
    raise KeyError(
        f"Colunas não detectadas: {', '.join(_undetected)} | "
        f"train={list(train.columns)} | test={list(test.columns)}"
    )
# The text column is detected on train but is also read from test below.
if TEXT_COL not in test.columns:
    raise KeyError(
        f"Coluna de texto '{TEXT_COL}' ausente em test: {list(test.columns)}"
    )

# Preprocessing: apply the same text pipeline to both splits.
train['text_processed'] = train[TEXT_COL].apply(preprocess_text)
test['text_processed'] = test[TEXT_COL].apply(preprocess_text)
print("Texto tratado!")

# TF-IDF over unigrams and bigrams, capped at 20k features.
print("\n[3/6] TF-IDF (20k features)...")
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

# The vocabulary is fitted on the training reports only and reused for test.
X_train = tfidf.fit_transform(train['text_processed'])
X_test = tfidf.transform(test['text_processed'])
y_train = train[LABEL_COL].values
print(f"Shape: {X_train.shape}")

In [None]:
# ========== SMOTE ==========
print("\n[4/6] SMOTE...")

# Minimum number of samples wanted per oversampled class, and the number of
# neighbours SMOTE interpolates between.
SMOTE_TARGETS = {5: 500, 6: 500}
SMOTE_K_NEIGHBORS = 3


def _smote_targets(y):
    """Return the SMOTE ``sampling_strategy`` dict valid for the labels *y*.

    A class is oversampled only when it is present with more than
    ``SMOTE_K_NEIGHBORS`` samples (SMOTE needs that many neighbours) and with
    fewer samples than its target. A fixed dict would raise for an absent
    class or for a class that already reached the target.
    """
    labels, counts = np.unique(y, return_counts=True)
    available = dict(zip(labels.tolist(), counts.tolist()))
    return {
        label: target
        for label, target in SMOTE_TARGETS.items()
        if SMOTE_K_NEIGHBORS < available.get(label, 0) < target
    }


# Oversampling must NOT be applied before cross-validation: synthetic points
# interpolated from samples that later land in a validation fold leak into the
# training folds and inflate the CV score. The sampler is therefore placed
# inside an imblearn Pipeline, which resamples only the data passed to fit().
sampler = None
if USE_SMOTE:
    try:
        from imblearn.over_sampling import SMOTE
        from imblearn.pipeline import Pipeline as ImbPipeline
    except ImportError:
        print("imblearn não disponível, usando dados originais")
    else:
        sampler = SMOTE(
            sampling_strategy=_smote_targets,
            random_state=SEED,
            k_neighbors=SMOTE_K_NEIGHBORS,
        )
        print(f"SMOTE: alvo por classe {SMOTE_TARGETS}, aplicado dentro de cada fold do CV")

# The search always receives the original training data; any oversampling
# happens inside the pipeline during fit.
X_final = X_train
y_final = y_train

# ========== RANDOMIZED SEARCH ==========
print("\n[5/6] RandomizedSearchCV...")

base_sgd = SGDClassifier(
    class_weight='balanced',
    random_state=SEED,
    max_iter=2000,
    early_stopping=True,
    validation_fraction=0.1,
    n_jobs=-1
)

if sampler is not None:
    estimator = ImbPipeline([('smote', sampler), ('clf', base_sgd)])
    # Hyper-parameters of a pipeline step are addressed as "<step>__<name>".
    _prefix = 'clf__'
else:
    estimator = base_sgd
    _prefix = ''

param_dist = {
    f'{_prefix}alpha': loguniform(1e-5, 1e-2),
    f'{_prefix}l1_ratio': [0.0, 0.15, 0.3, 0.5, 0.7, 0.85, 1.0],
    f'{_prefix}penalty': ['elasticnet'],
    f'{_prefix}loss': ['log_loss', 'modified_huber']
}

f1_macro = make_scorer(f1_score, average='macro')

search = RandomizedSearchCV(
    estimator,
    param_distributions=param_dist,
    n_iter=20,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
    scoring=f1_macro,
    random_state=SEED,
    n_jobs=-1,
    verbose=1
)

search.fit(X_final, y_final)
print(f"\nMelhor score CV: {search.best_score_:.5f}")
print(f"Melhores params: {search.best_params_}")

In [None]:
# ========== SUBMISSION ==========
print("\n[6/6] Gerando submission...")

# Predict the test set with the best estimator found by the search.
best_model = search.best_estimator_
predictions = best_model.predict(X_test)

# Two-column frame: the test identifiers followed by the predicted labels.
submission = test[[ID_COL]].copy()
submission[LABEL_COL] = predictions

submission.to_csv('submission.csv', index=False)
print("\nSubmission salva: submission.csv")
print(submission.head())

# Predicted class distribution, ordered by class label, as a sanity check.
print("\nDistribuição:")
print(pd.Series(predictions).value_counts().sort_index())