# SPR 2026 - LinearSVC v3 (RandomizedSearch)

**Score baseline (v2):** 0.77885

**Melhorias:**
- RandomizedSearchCV para tuning
- Class weights balanceados
- Calibração para probabilidades
- Threshold tuning por classe
- SMOTE para classes 5/6 (opcional)
- Flag para remover classe 2

---
**CONFIGURAÇÃO KAGGLE:** Internet OFF
---

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
SEED = 42
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Experiment switches - adjust these between runs.
REMOVE_CLASS_2 = False       # drop training rows whose label is 2
USE_SMOTE = False            # oversample classes 5/6 with SMOTE
USE_THRESHOLD_TUNING = True  # apply per-class threshold adjustment when predicting
N_SEARCH_ITER = 20           # number of RandomizedSearchCV iterations

# Seed NumPy's global RNG so the run is repeatable.
np.random.seed(SEED)

_RULE = "=" * 60
print(_RULE)
print("SPR 2026 - LinearSVC v3 (RandomizedSearch)")
print(_RULE)

# Data: read the train and test CSVs from the competition input directory.
print("\n[1/6] Carregando dados...")
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)
print(f"Train: {train.shape} | Test: {test.shape}")

# Auto-detect column names
def find_col(df, candidates):
    """Return the first column of ``df`` that matches one of ``candidates``.

    Candidates are tried in order. For each candidate an exact match wins;
    otherwise the first column (in ``df.columns`` order) whose name equals the
    candidate ignoring case is returned.

    Args:
        df: Object exposing a ``columns`` collection (e.g. a pandas DataFrame).
        candidates: Iterable of column names to look for, highest priority
            first.

    Returns:
        The matching label exactly as it appears in ``df.columns``, or
        ``None`` when no candidate matches.
    """
    # Build the case-insensitive lookup once instead of rescanning the columns
    # for every candidate. Non-string labels (e.g. integer headers) have no
    # ``.lower()`` and can only ever match exactly, so they are left out here.
    by_lower = {}
    for col in df.columns:
        if isinstance(col, str):
            # setdefault keeps the first column when several differ only by case.
            by_lower.setdefault(col.lower(), col)

    for cand in candidates:
        if cand in df.columns:
            return cand
        if isinstance(cand, str):
            match = by_lower.get(cand.lower())
            if match is not None:
                return match
    return None

# Resolve the text, label and id column names from lists of common aliases.
TEXT_COL = find_col(train, ['report', 'text', 'laudo', 'texto', 'content'])
LABEL_COL = find_col(train, ['target', 'label', 'birads', 'classe', 'class'])
ID_COL = find_col(test, ['ID', 'id', 'Id', 'index', 'idx'])
print(f"Colunas: texto={TEXT_COL}, label={LABEL_COL}, id={ID_COL}")

# Fail fast: every one of these names is used unconditionally further down,
# and indexing a DataFrame with None only produces an opaque "KeyError: None".
_missing_cols = [
    role
    for role, col in (('texto', TEXT_COL), ('label', LABEL_COL), ('id', ID_COL))
    if col is None
]
if _missing_cols:
    raise ValueError(
        f"Colunas nao encontradas: {', '.join(_missing_cols)}. "
        f"Disponiveis - train: {list(train.columns)} | test: {list(test.columns)}"
    )

if REMOVE_CLASS_2:
    # Keep every row whose label differs from 2 and renumber the index.
    not_class_2 = train[LABEL_COL].ne(2)
    train = train.loc[not_class_2].reset_index(drop=True)
    print(f"Sem classe 2: {train.shape}")

# TF-IDF: word uni/bi-gram features fitted on the training reports only.
print("\n[2/6] TF-IDF...")
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1,2), min_df=2, max_df=0.95, sublinear_tf=True)
# Empty CSV cells are read by pandas as NaN, which TfidfVectorizer rejects as
# an invalid document; treat a missing report as an empty string instead.
train_text = train[TEXT_COL].fillna('').astype(str)
test_text = test[TEXT_COL].fillna('').astype(str)
X_train = tfidf.fit_transform(train_text)
X_test = tfidf.transform(test_text)
y_train = train[LABEL_COL].values
print(f"Shape: {X_train.shape}")

# Optional SMOTE oversampling of the rare classes 5 and 6.
if USE_SMOTE:
    # NOTE(review): resampling happens before the cross-validated search below,
    # so synthetic samples can land in validation folds and make the CV score
    # optimistic. An imblearn Pipeline would avoid that - consider it.
    try:
        # Keep the try body to the import alone so an unrelated ImportError
        # raised while resampling is not mistaken for a missing package.
        from imblearn.over_sampling import SMOTE
    except ImportError:
        print("imblearn nao disponivel")
    else:
        smote_target = 500  # requested sample count for each rare class
        labels, counts = np.unique(y_train, return_counts=True)
        class_counts = dict(zip(labels.tolist(), counts.tolist()))
        # SMOTE raises when a requested class is absent or when the requested
        # count is below the current one, and it needs at least two samples to
        # interpolate. Build the strategy from the classes actually present.
        strategy = {
            c: max(smote_target, class_counts[c])
            for c in (5, 6)
            if class_counts.get(c, 0) >= 2
        }
        if strategy:
            # k_neighbors must be smaller than the size of the smallest class.
            k_neighbors = min(3, min(class_counts[c] for c in strategy) - 1)
            smote = SMOTE(sampling_strategy=strategy, random_state=SEED, k_neighbors=k_neighbors)
            X_train, y_train = smote.fit_resample(X_train, y_train)
            print(f"SMOTE aplicado: {X_train.shape}")
        else:
            print("SMOTE ignorado: classes 5/6 ausentes ou com menos de 2 amostras")

# Randomized hyper-parameter search for the LinearSVC.
print(f"\n[3/6] RandomizedSearchCV ({N_SEARCH_ITER} iter)...")

# Stratified 5-fold split, shuffled with the global seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Class-weighted linear SVM whose C / loss / max_iter are sampled by the search.
base = LinearSVC(class_weight='balanced', random_state=SEED, dual=True)
param_dist = {
    'C': loguniform(0.01, 10),
    'loss': ['hinge', 'squared_hinge'],
    'max_iter': [1000, 2000, 3000],
}

search = RandomizedSearchCV(
    base,
    param_dist,
    n_iter=N_SEARCH_ITER,
    cv=cv,
    scoring='f1_macro',
    n_jobs=-1,
    random_state=SEED,
    verbose=1,
)
search.fit(X_train, y_train)

print(f"Best params: {search.best_params_}")
print(f"Best F1-macro: {search.best_score_:.4f}")

# Calibration: wrap the best estimator so it can output class probabilities.
print("\n[4/6] Calibrando modelo...")
calibrated = CalibratedClassifierCV(
    search.best_estimator_,
    method='sigmoid',
    cv=3,
)
calibrated.fit(X_train, y_train)

# Per-class threshold adjustment (or plain prediction when disabled).
if USE_THRESHOLD_TUNING:
    print("\n[5/6] Threshold tuning...")
    proba = calibrated.predict_proba(X_test)
    classes = calibrated.classes_
    # Classes 5/6 use a lower threshold (more sensitive).
    thresholds = {0:0.5, 1:0.5, 2:0.5, 3:0.5, 4:0.5, 5:0.35, 6:0.35}
    # Each class probability is scaled by 0.5 / threshold, so a threshold below
    # 0.5 boosts that class. Classes without an entry keep a factor of 1.
    # The factors do not depend on the row, so compute them once and apply them
    # to the whole matrix by broadcasting instead of looping row by row.
    scale = np.array([0.5 / thresholds.get(c, 0.5) for c in classes])
    predictions = classes[np.argmax(proba * scale, axis=1)]
else:
    print("\n[5/6] Predicao padrao...")
    predictions = calibrated.predict(X_test)

# Submission
print("\n[6/6] Gerando submissao...")

# Take the output column names from sample_submission.csv when it exists;
# otherwise fall back to the column names detected in the train/test data.
sample_path = f'{DATA_DIR}/sample_submission.csv'
if not os.path.exists(sample_path):
    SUB_ID, SUB_LABEL = ID_COL, LABEL_COL
else:
    sample_sub = pd.read_csv(sample_path)
    SUB_ID = sample_sub.columns[0]
    SUB_LABEL = sample_sub.columns[1]

# One row per test id with its predicted label.
submission = pd.DataFrame({SUB_ID: test[ID_COL], SUB_LABEL: predictions})
submission.to_csv('/kaggle/working/submission.csv', index=False)

_RULE_END = "=" * 60
print(_RULE_END)
print("CONCLUIDO!")
print(_RULE_END)

# Show how many test rows were assigned to each class.
print("\nDistribuicao:")
label_counts = submission[SUB_LABEL].value_counts()
print(label_counts.sort_index())