# SPR 2026 - Stacking com Meta-Learner

**OOF predictions → Meta-Learner**

- ✅ K-Fold OOF para modelos base
- ✅ Logistic Regression como meta-learner
- ✅ Tempo esperado: ~5-10 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. **IMPORTANTE:** Execute "Run All" após commit
---

In [None]:
# =============================================================================
# SPR 2026 - STACKING COM META-LEARNER
# =============================================================================
# - K-Fold OOF para modelos base (Logistic Regression + Naive Bayes)
# - Meta-learner: Logistic Regression
# - TF-IDF como features base
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

# Global experiment settings shared by the whole script.
SEED = 42          # random seed for NumPy, the CV splitters and the models
N_FOLDS = 5        # number of stratified folds
NUM_CLASSES = 7    # NOTE(review): not referenced in the visible code - confirm it is still needed
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

np.random.seed(SEED)
print('[1/7] Bibliotecas carregadas!')

# =============================================================================
# LOAD DATA
# =============================================================================
# train.csv provides the 'report' text and the 'target' label; test.csv
# provides the 'report' text to be classified.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
y_train = train_df['target'].values
print(f'[2/7] Train: {train_df.shape} | Test: {test_df.shape}')

# =============================================================================
# TF-IDF
# =============================================================================
# Word uni- and bi-grams, capped at 15k features, with sublinear TF scaling.
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

# Missing reports are read by pandas as NaN, which the vectorizer cannot
# tokenize; treat them as empty documents so one blank row does not abort
# the whole run. The vocabulary is fitted on the training text only.
X_tfidf_train = tfidf.fit_transform(train_df['report'].fillna(''))
X_tfidf_test = tfidf.transform(test_df['report'].fillna(''))
print(f'[3/7] TF-IDF shape: {X_tfidf_train.shape}')

# =============================================================================
# FUNÇÃO OOF
# =============================================================================
def get_oof_predictions(model, X, y, X_test, n_folds=5, seed=42):
    """Generate out-of-fold (OOF) class scores to use as stacking features.

    The data is split with a shuffled ``StratifiedKFold``. For every fold the
    model is re-fitted on the training part and used to score the held-out
    part (filling the corresponding rows of the train matrix) and the full
    test set (accumulated as an average over folds).

    Args:
        model: Estimator exposing ``fit`` and either ``predict_proba`` or
            ``predict``. It is fitted in place, once per fold.
        X: Training feature matrix supporting row indexing with an index
            array (e.g. a NumPy array or a SciPy CSR matrix).
        y: NumPy array of training labels, one per row of ``X``.
        X_test: Test feature matrix with the same columns as ``X``.
        n_folds: Number of stratified folds.
        seed: Random state used to shuffle the folds.

    Returns:
        Tuple ``(oof_train, oof_test)`` of float arrays with shapes
        ``(X.shape[0], n_classes)`` and ``(X_test.shape[0], n_classes)``,
        where ``n_classes`` is the number of distinct values in ``y`` and
        columns follow the sorted order of those values. When the model has
        no ``predict_proba`` the scores are one-hot encoded hard predictions.
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    # Sorted distinct labels define the column layout of both outputs.
    classes = np.unique(y)
    n_classes = len(classes)
    oof_train = np.zeros((X.shape[0], n_classes))
    oof_test = np.zeros((X_test.shape[0], n_classes))
    test_rows = np.arange(X_test.shape[0])

    for train_idx, val_idx in skf.split(X, y):
        X_val = X[val_idx]
        model.fit(X[train_idx], y[train_idx])

        if hasattr(model, 'predict_proba'):
            # Map the classes this fold's model actually learned onto the
            # global column layout; a class absent from the fold keeps a
            # zero score instead of causing a shape mismatch.
            cols = np.searchsorted(classes, getattr(model, 'classes_', classes))
            oof_train[val_idx[:, None], cols] = model.predict_proba(X_val)
            oof_test[:, cols] += model.predict_proba(X_test) / n_folds
        else:
            # One-hot encode hard predictions by label position rather than
            # label value, so labels need not be exactly 0..n_classes-1.
            val_cols = np.searchsorted(classes, model.predict(X_val))
            oof_train[val_idx, val_cols] = 1.0
            test_cols = np.searchsorted(classes, model.predict(X_test))
            oof_test[test_rows, test_cols] += 1.0 / n_folds

    return oof_train, oof_test

# =============================================================================
# GENERATE OOF FEATURES FOR THE BASE MODELS
# =============================================================================
# N_FOLDS and SEED are passed explicitly so the base-model folds always follow
# the script-level settings instead of the helper's own defaults.
print('Gerando OOF para Logistic Regression...')
oof_lr_train, oof_lr_test = get_oof_predictions(
    LogisticRegression(C=1.0, max_iter=1000, class_weight='balanced', random_state=SEED),
    X_tfidf_train, y_train, X_tfidf_test,
    n_folds=N_FOLDS, seed=SEED
)
print('[4/7] OOF Logistic Regression gerado!')

print('Gerando OOF para Naive Bayes...')
oof_nb_train, oof_nb_test = get_oof_predictions(
    MultinomialNB(alpha=0.1),
    X_tfidf_train, y_train, X_tfidf_test,
    n_folds=N_FOLDS, seed=SEED
)
print('[5/7] OOF Naive Bayes gerado!')

# =============================================================================
# CONCATENATE META-FEATURES
# =============================================================================
# Each base model contributes one block of per-class score columns, placed
# side by side (Logistic Regression first, then Naive Bayes).
X_train_meta = np.hstack([oof_lr_train, oof_nb_train])
X_test_meta = np.hstack([oof_lr_test, oof_nb_test])
print(f'[6/7] Meta-features: train {X_train_meta.shape} | test {X_test_meta.shape}')

# =============================================================================
# META-LEARNER
# =============================================================================
meta_learner = LogisticRegression(C=1.0, max_iter=1000, random_state=SEED)

# Estimate the stacked model's macro-F1 with cross-validation over the OOF
# meta-features before fitting on all of them.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
scores = cross_val_score(meta_learner, X_train_meta, y_train, cv=skf, scoring='f1_macro')
print(f'Meta-Learner CV F1-Macro: {scores.mean():.4f} (+/- {scores.std():.4f})')

# Final fit on every training row's meta-features.
meta_learner.fit(X_train_meta, y_train)

# =============================================================================
# SUBMISSION
# =============================================================================
predictions = meta_learner.predict(X_test_meta)

submission = pd.DataFrame({
    'ID': test_df['ID'],
    'target': predictions
})

submission.to_csv('submission.csv', index=False)

print('[7/7] ✅ CONCLUÍDO: submission.csv')
# Predicted class distribution, as a quick sanity check of the output.
print(submission['target'].value_counts().sort_index())