# SPR 2026 - LightGBM v3 (RandomizedSearch)

**Score baseline (v2):** 0.70273

**Melhorias:**
- RandomizedSearchCV para tuning
- Class weights balanceados
- TruncatedSVD + StandardScaler
- Early stopping
- Threshold tuning por classe
- SMOTE para classes 5/6 (opcional)
- Flag para remover classe 2

---

**CONFIGURACAO KAGGLE:** Internet OFF, GPU T4 x2

---

In [None]:
# End-to-end training/inference script for the SPR 2026 mammography report
# classification task:
#   1. load train/test CSVs
#   2. TF-IDF on the `report` text column
#   3. TruncatedSVD + StandardScaler to get dense features
#   4. (optional) SMOTE oversampling of classes 5 and 6
#   5. RandomizedSearchCV over LightGBM hyper-parameters (macro-F1)
#   6. refit the best configuration with early stopping on a 10% hold-out
#   7. (optional) per-class probability re-weighting, then write submission.csv
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
from scipy.stats import randint, uniform
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

print("=" * 60)
print("SPR 2026 - LightGBM v3 (RandomizedSearch)")
print("=" * 60)

SEED = 42
SVD_COMPONENTS = 500
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
# torch is only used to detect whether a CUDA device is present.
USE_GPU = torch.cuda.is_available()
np.random.seed(SEED)

# FLAGS - adjust here
REMOVE_CLASS_2 = False
USE_SMOTE = False
USE_THRESHOLD_TUNING = True
N_SEARCH_ITER = 20
print(f"GPU: {USE_GPU}")

# ---------------------------------------------------------------- Data
print("\n[1/7] Carregando dados...")
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
print(f"Train: {train.shape} | Test: {test.shape}")
if REMOVE_CLASS_2:
    train = train[train['target'] != 2].reset_index(drop=True)
    print(f"Sem classe 2: {train.shape}")

# ---------------------------------------------------------------- TF-IDF
print("\n[2/7] TF-IDF...")
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)
# TfidfVectorizer rejects NaN documents; treat a missing report as empty text.
X_train_tfidf = tfidf.fit_transform(train['report'].fillna(''))
X_test_tfidf = tfidf.transform(test['report'].fillna(''))
y_train = train['target'].values
print(f"TF-IDF: {X_train_tfidf.shape}")

# ---------------------------------------------------------------- SVD
print(f"\n[3/7] SVD -> {SVD_COMPONENTS} features...")
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SEED)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)
print(f"Variancia: {svd.explained_variance_ratio_.sum():.2%}")

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_svd)
X_test = scaler.transform(X_test_svd)

# ---------------------------------------------------------------- Optional SMOTE
# NOTE(review): resampling happens before cross-validation and before the
# early-stopping split, so synthetic rows can end up in validation folds and
# make the reported scores optimistic when USE_SMOTE is enabled.
if USE_SMOTE:
    # Only the import is guarded: a failure inside the resampling itself should
    # surface instead of being reported as "imblearn not available".
    try:
        from imblearn.over_sampling import SMOTE
    except ImportError:
        print("imblearn nao disponivel")
    else:
        labels, counts = np.unique(y_train, return_counts=True)
        class_counts = dict(zip(labels.tolist(), counts.tolist()))
        # SMOTE raises if a requested class is absent or already has at least
        # the target number of rows, and k_neighbors=3 needs >= 4 real samples.
        smote_strategy = {
            label: 500
            for label in (5, 6)
            if 3 < class_counts.get(label, 0) < 500
        }
        if smote_strategy:
            smote = SMOTE(sampling_strategy=smote_strategy, random_state=SEED, k_neighbors=3)
            X_train, y_train = smote.fit_resample(X_train, y_train)
        print(f"SMOTE: {X_train.shape}")

# ---------------------------------------------------------------- RandomizedSearch
print(f"\n[4/7] RandomizedSearchCV ({N_SEARCH_ITER} iter)...")
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(4, 12),
    'learning_rate': uniform(0.01, 0.2),
    'num_leaves': randint(15, 63),
    'min_child_samples': randint(10, 50),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1),
}

# Settings shared by the search estimator and the final model.
# subsample_freq=1 is required for `subsample` to have any effect: LightGBM
# disables bagging while subsample_freq (bagging_freq) is at its default of 0,
# which would make the tuned `subsample` value a no-op.
lgbm_fixed_params = {
    'class_weight': 'balanced',
    'device': 'gpu' if USE_GPU else 'cpu',
    'random_state': SEED,
    'n_jobs': -1,
    'verbose': -1,
    'subsample_freq': 1,
}

base = lgb.LGBMClassifier(**lgbm_fixed_params)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
search = RandomizedSearchCV(
    base,
    param_dist,
    n_iter=N_SEARCH_ITER,
    cv=cv,
    scoring='f1_macro',
    n_jobs=1,  # LightGBM already parallelises internally (n_jobs=-1 above)
    random_state=SEED,
    verbose=1,
)
search.fit(X_train, y_train)
print(f"Best params: {search.best_params_}")
print(f"Best F1-macro: {search.best_score_:.4f}")

# ---------------------------------------------------------------- Refit with early stopping
print("\n[5/7] Retreinando com early stopping...")
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=SEED, stratify=y_train
)
best_params = search.best_params_.copy()
# Upper bound on boosting rounds; early stopping picks the actual count.
best_params['n_estimators'] = 500
final_model = lgb.LGBMClassifier(**best_params, **lgbm_fixed_params)
final_model.fit(
    X_tr,
    y_tr,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(30, verbose=True), lgb.log_evaluation(50)],
)
print(f"Best iteration: {final_model.best_iteration_}")

# ---------------------------------------------------------------- Threshold tuning
if USE_THRESHOLD_TUNING:
    print("\n[6/7] Threshold tuning...")
    proba = final_model.predict_proba(X_test)
    classes = final_model.classes_
    thresholds = {0: 0.5, 1: 0.5, 2: 0.5, 3: 0.5, 4: 0.5, 5: 0.35, 6: 0.35}
    # Each class probability is scaled by 0.5 / threshold, so a threshold below
    # 0.5 boosts that class; classes without an entry keep a factor of 1.
    class_scale = np.array([0.5 / thresholds.get(c, 0.5) for c in classes])
    predictions = classes[np.argmax(proba * class_scale, axis=1)]
else:
    print("\n[6/7] Predicao padrao...")
    predictions = final_model.predict(X_test)

# ---------------------------------------------------------------- Submission
print("\n[7/7] Gerando submissao...")
submission = pd.DataFrame({'ID': test['ID'], 'target': predictions})
submission.to_csv('submission.csv', index=False)

print("=" * 60)
print("CONCLUIDO!")
print("=" * 60)
print("\nDistribuicao:")
print(submission['target'].value_counts().sort_index())