# SPR 2026 - LightGBM v3 (RandomizedSearch)

**Score baseline (v2):** 0.70273

**Melhorias:**
- RandomizedSearchCV para tuning
- Class weights balanceados
- TruncatedSVD + StandardScaler
- Early stopping
- Threshold tuning por classe
- SMOTE para classes 5/6 (opcional)
- Flag para remover classe 2

---

**CONFIGURACAO KAGGLE:** Internet OFF, GPU T4 x2

---

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from scipy.stats import randint, uniform
import lightgbm as lgb
import torch
import warnings
warnings.filterwarnings('ignore')

# ---------- Run configuration ----------
SEED = 42
SVD_COMPONENTS = 500
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Flags - adjust here
REMOVE_CLASS_2 = False
USE_SMOTE = False
USE_THRESHOLD_TUNING = True
N_SEARCH_ITER = 20

# Horizontal rule reused by the banners printed in this section.
RULE = "=" * 60
print(RULE, "SPR 2026 - LightGBM v3 (RandomizedSearch)", RULE, sep="\n")

# ---------- Check the dataset first ----------
# Fail fast, with instructions for the user, when the input directory is missing.
if not os.path.exists(DATA_DIR):
    print(
        "\n" + RULE,
        "ERRO: Dataset não encontrado!",
        RULE,
        "\nAdicione o dataset:",
        "Add Input → Competition → spr-2026-mammography-report-classification",
        sep="\n",
    )
    raise FileNotFoundError(f"Dataset não encontrado: {DATA_DIR}")
print(f"Dataset: {DATA_DIR}")

# Seed NumPy's global RNG and record whether CUDA is visible to torch;
# USE_GPU later selects the LightGBM device.
np.random.seed(SEED)
USE_GPU = torch.cuda.is_available()
print(f"GPU: {USE_GPU}")

# ---------- Data ----------
print("\n[1/7] Carregando dados...")
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
print(f"Train: {train.shape} | Test: {test.shape}")

# Auto-detect columns
def find_col(df, candidates):
    """Return the first column of ``df`` matching one of ``candidates``.

    Candidates are tried in order. For each candidate an exact match is
    preferred; otherwise the first column whose name equals the candidate
    ignoring case is returned.

    Args:
        df: Object exposing a ``columns`` collection (e.g. a DataFrame).
        candidates: Iterable of candidate column names (strings), ordered
            from most to least preferred.

    Returns:
        The matching column label exactly as it appears in ``df.columns``,
        or ``None`` when no candidate matches.
    """
    for c in candidates:
        if c in df.columns:
            return c
        wanted = c.lower()
        for col in df.columns:
            # Non-string labels (e.g. integer column names) have no .lower()
            # and can never match a textual candidate, so skip them.
            if isinstance(col, str) and col.lower() == wanted:
                return col
    return None

# Resolve the text / label / id column names from a list of known aliases.
TEXT_COL = find_col(train, ['report', 'text', 'laudo', 'texto', 'content'])
LABEL_COL = find_col(train, ['target', 'label', 'birads', 'classe', 'class'])
ID_COL = find_col(test, ['ID', 'id', 'Id', 'index', 'idx'])
print(f"Colunas: texto={TEXT_COL}, label={LABEL_COL}, id={ID_COL}")

# Fail fast: find_col returns None when nothing matches, and indexing a frame
# with None would otherwise only blow up later (for the id column, after the
# whole training run) with an unhelpful KeyError.
_undetected = [
    name
    for name, col in (('texto', TEXT_COL), ('label', LABEL_COL), ('id', ID_COL))
    if col is None
]
if _undetected:
    raise ValueError(
        f"Colunas nao detectadas: {_undetected}. "
        f"Train: {list(train.columns)} | Test: {list(test.columns)}"
    )

# Optionally drop every training row labelled as class 2.
if REMOVE_CLASS_2:
    train = train[train[LABEL_COL] != 2].reset_index(drop=True)
    print(f"Sem classe 2: {train.shape}")

# TF-IDF
print("\n[2/7] TF-IDF...")
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2), min_df=2, max_df=0.95, sublinear_tf=True)
# TfidfVectorizer rejects NaN documents, so coerce missing reports to empty
# strings (and everything else to str) before vectorizing.
train_text = train[TEXT_COL].fillna('').astype(str)
test_text = test[TEXT_COL].fillna('').astype(str)
# The vocabulary and IDF weights are learned on the training text only and
# then applied unchanged to the test text.
X_train_tfidf = tfidf.fit_transform(train_text)
X_test_tfidf = tfidf.transform(test_text)
y_train = train[LABEL_COL].values
print(f"TF-IDF: {X_train_tfidf.shape}")

# SVD: project the sparse TF-IDF matrix onto SVD_COMPONENTS dense features.
print(f"\n[3/7] SVD -> {SVD_COMPONENTS} features...")
svd = TruncatedSVD(random_state=SEED, n_components=SVD_COMPONENTS)
# Fitted on the training matrix; the test matrix is only transformed.
X_train_svd, X_test_svd = svd.fit_transform(X_train_tfidf), svd.transform(X_test_tfidf)
explained_total = svd.explained_variance_ratio_.sum()
print(f"Variancia: {explained_total:.2%}")

# Standardize the SVD features using statistics from the training set only.
scaler = StandardScaler().fit(X_train_svd)
X_train, X_test = scaler.transform(X_train_svd), scaler.transform(X_test_svd)

# Optional SMOTE oversampling of classes 5 and 6 up to 500 rows each.
# NOTE(review): resampling happens before the CV / hold-out splits performed
# further down, so synthetic rows can end up in validation folds and inflate
# the reported scores.
if USE_SMOTE:
    try:
        from imblearn.over_sampling import SMOTE
    except ImportError:
        print("imblearn nao disponivel")
    else:
        labels, counts = np.unique(y_train, return_counts=True)
        n_per_class = dict(zip(labels.tolist(), counts.tolist()))
        # Only target classes that are present, have at least 2 rows (SMOTE
        # needs a neighbour to interpolate with) and are below the target size;
        # a fixed {5: 500, 6: 500} raises ValueError otherwise.
        strategy = {c: 500 for c in (5, 6) if 2 <= n_per_class.get(c, 0) < 500}
        if strategy:
            # k_neighbors must be smaller than the smallest resampled class.
            k_neighbors = min(3, min(n_per_class[c] for c in strategy) - 1)
            smote = SMOTE(sampling_strategy=strategy, random_state=SEED, k_neighbors=k_neighbors)
            X_train, y_train = smote.fit_resample(X_train, y_train)
            print(f"SMOTE: {X_train.shape}")
        else:
            print("SMOTE ignorado: classes 5/6 ausentes ou fora do intervalo de amostras")

# RandomizedSearch: sample N_SEARCH_ITER hyper-parameter sets and score each
# with 5-fold stratified CV on macro F1.
print(f"\n[4/7] RandomizedSearchCV ({N_SEARCH_ITER} iter)...")
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(4, 12),
    'learning_rate': uniform(0.01, 0.2),
    'num_leaves': randint(15, 63),
    'min_child_samples': randint(10, 50),
    'subsample': uniform(0.6, 0.4),
    # LightGBM ignores `subsample` (bagging_fraction) unless bagging is
    # enabled via a non-zero `subsample_freq`; without this entry the
    # `subsample` dimension of the search has no effect. Being part of the
    # search space, it is also carried along in `search.best_params_`.
    'subsample_freq': [1],
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1),
}
base = lgb.LGBMClassifier(class_weight='balanced', device='gpu' if USE_GPU else 'cpu', random_state=SEED, n_jobs=-1, verbose=-1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# n_jobs=1 at the search level: parallelism is left to the estimator (n_jobs=-1).
search = RandomizedSearchCV(base, param_dist, n_iter=N_SEARCH_ITER, cv=cv, scoring='f1_macro', n_jobs=1, random_state=SEED, verbose=1)
search.fit(X_train, y_train)
print(f"Best params: {search.best_params_}")
print(f"Best F1-macro: {search.best_score_:.4f}")

# Retrain with early stopping on a stratified 10% hold-out.
print("\n[5/7] Retreinando com early stopping...")
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SEED
)
# Reuse the best searched parameters but replace n_estimators with 500;
# training stops once the validation metric has not improved for 30 rounds.
best_params = {**search.best_params_, 'n_estimators': 500}
final_model = lgb.LGBMClassifier(
    class_weight='balanced',
    device='gpu' if USE_GPU else 'cpu',
    random_state=SEED,
    n_jobs=-1,
    verbose=-1,
    **best_params,
)
fit_callbacks = [lgb.early_stopping(30, verbose=True), lgb.log_evaluation(50)]
final_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=fit_callbacks)
print(f"Best iteration: {final_model.best_iteration_}")

# Threshold adjustment: re-weight class probabilities before taking the argmax.
if USE_THRESHOLD_TUNING:
    print("\n[6/7] Threshold tuning...")
    proba = final_model.predict_proba(X_test)
    classes = final_model.classes_
    # Fixed per-class thresholds; values below 0.5 boost that class.
    thresholds = {0:0.5, 1:0.5, 2:0.5, 3:0.5, 4:0.5, 5:0.35, 6:0.35}
    # One multiplicative factor per probability column (0.5 / threshold).
    # Classes without an entry keep a factor of 1.0. Computed once and
    # broadcast over all rows instead of being recomputed in a per-row loop.
    scale = np.array([0.5 / thresholds.get(c, 0.5) for c in classes])
    predictions = classes[np.argmax(proba * scale, axis=1)]
else:
    print("\n[6/7] Predicao padrao...")
    predictions = final_model.predict(X_test)

# Submission
print("\n[7/7] Gerando submissao...")

# Take the output column names from sample_submission.csv when it exists;
# otherwise fall back to the detected id / label column names.
sample_path = f'{DATA_DIR}/sample_submission.csv'
SUB_ID, SUB_LABEL = ID_COL, LABEL_COL
if os.path.exists(sample_path):
    sample_sub = pd.read_csv(sample_path)
    SUB_ID, SUB_LABEL = sample_sub.columns[0], sample_sub.columns[1]

submission = pd.DataFrame({SUB_ID: test[ID_COL], SUB_LABEL: predictions})
submission.to_csv('/kaggle/working/submission.csv', index=False)

# Closing banner followed by the predicted class distribution.
print("="*60, "CONCLUIDO!", "="*60, "\nDistribuicao:", sep="\n")
label_counts = submission[SUB_LABEL].value_counts().sort_index()
print(label_counts)