# SPR 2026 - Stacking com Meta-Learner

**Experimento #7 - Prioridade Média**

Stacking: usar predições OOF dos melhores modelos como features para um meta-learner.

---
**CONFIGURAÇÃO:**
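Este notebook assume predições OOF (Out-of-Fold) salvas. Como exemplo executável, as células abaixo também geram predições OOF a partir de modelos simples baseados em TF-IDF.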

**Pré-requisitos:**
1. Treinar modelos base com K-Fold CV
2. Salvar predições OOF (train) e predições test
3. Usar OOF como features para meta-learner
---

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Root directory the train/test CSV files are read from.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Shared notebook settings: random seed, CV fold count, class count.
SEED, N_FOLDS, NUM_CLASSES = 42, 5, 7

# Seed NumPy's global random number generator.
np.random.seed(SEED)

In [None]:
# Load the train table and pull out its labels, then load the test table.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
y_train = train_df['target'].values
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Report both table shapes, one per line.
print(f'Train: {train_df.shape}', f'Test: {test_df.shape}', sep='\n')

In [None]:
# =================================================================
# CARREGAR FEATURES OOF DOS MODELOS BASE
# =================================================================
# Substitua pelos seus arquivos de OOF predictions

# Formato esperado:
# - oof_train_*.npy: (n_train_samples, n_classes) probabilidades OOF
# - oof_test_*.npy: (n_test_samples, n_classes) probabilidades médias no test

# Exemplo:
# oof_train_bertimbau = np.load('oof_train_bertimbau.npy')
# oof_train_deberta = np.load('oof_train_deberta.npy')
# oof_train_xlmr = np.load('oof_train_xlmroberta.npy')

# oof_test_bertimbau = np.load('oof_test_bertimbau.npy')
# oof_test_deberta = np.load('oof_test_deberta.npy')
# oof_test_xlmr = np.load('oof_test_xlmroberta.npy')

# # Concatenar features
# X_train_meta = np.hstack([oof_train_bertimbau, oof_train_deberta, oof_train_xlmr])
# X_test_meta = np.hstack([oof_test_bertimbau, oof_test_deberta, oof_test_xlmr])

# print(f'Meta-features train: {X_train_meta.shape}')
# print(f'Meta-features test: {X_test_meta.shape}')

In [None]:
# =================================================================
# EXEMPLO: Criar features de modelos simples (TF-IDF based)
# =================================================================

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# TF-IDF over unigrams and bigrams, capped at 15000 features, with
# sublinear term-frequency scaling.
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)

# A missing report (NaN) would make the vectorizer raise, so missing values
# are treated as empty documents. The vocabulary is fitted on train only and
# reused for test.
X_tfidf_train = tfidf.fit_transform(train_df['report'].fillna(''))
X_tfidf_test = tfidf.transform(test_df['report'].fillna(''))

print(f'TF-IDF shape: {X_tfidf_train.shape}')

In [None]:
# Build out-of-fold (OOF) predictions for a base model
def get_oof_predictions(model, X, y, X_test, n_folds=5, seed=42):
    """Generate out-of-fold (OOF) predictions for stacking.

    The model is refit on each stratified training split. Its predictions on
    the held-out split fill the matching rows of the OOF matrix, and its
    predictions on ``X_test`` are averaged over the folds. Models without
    ``predict_proba`` contribute one-hot encoded hard predictions instead.

    Args:
        model: Classifier exposing ``fit`` and ``predict`` (and optionally
            ``predict_proba``). It is fitted in place and is left fitted on
            the last fold's training split.
        X: Training features supporting ``X.shape[0]`` and row selection with
            an integer index array (``X[idx]``).
        y: Training labels, one per row of ``X``. Labels may be any sortable
            values; they do not need to be the integers ``0..n_classes-1``.
        X_test: Test features accepted by ``model.predict``.
        n_folds: Number of stratified folds.
        seed: Random state for the shuffled fold assignment.

    Returns:
        Tuple ``(oof_train, oof_test)`` of float arrays shaped
        ``(X.shape[0], n_classes)`` and ``(X_test.shape[0], n_classes)``.
        Columns follow the sorted unique values of ``y``.
    """
    y = np.asarray(y)
    classes = np.unique(y)  # sorted; defines the output column order
    n_classes = len(classes)

    oof_train = np.zeros((X.shape[0], n_classes))
    oof_test = np.zeros((X_test.shape[0], n_classes))
    test_rows = np.arange(X_test.shape[0])

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for train_idx, val_idx in skf.split(X, y):
        model.fit(X[train_idx], y[train_idx])

        if hasattr(model, 'predict_proba'):
            # predict_proba columns follow the classes seen in this fold's
            # training split; map them onto the global column order so a
            # fold missing a class does not misalign or mis-shape the output.
            fold_classes = getattr(model, 'classes_', classes)
            cols = np.searchsorted(classes, fold_classes)
            oof_train[np.ix_(val_idx, cols)] = model.predict_proba(X[val_idx])
            oof_test[:, cols] += model.predict_proba(X_test) / n_folds
        else:
            # No probabilities available: one-hot encode the hard predictions,
            # looking up each predicted label's column rather than assuming
            # labels are 0..n_classes-1.
            val_cols = np.searchsorted(classes, model.predict(X[val_idx]))
            oof_train[val_idx, val_cols] = 1.0
            test_cols = np.searchsorted(classes, model.predict(X_test))
            oof_test[test_rows, test_cols] += 1.0 / n_folds

    return oof_train, oof_test

In [None]:
# Generate OOF predictions for the base models.
# N_FOLDS and SEED are passed explicitly so the base-model folds follow the
# notebook configuration instead of the helper's hard-coded defaults.
print('Gerando OOF para Logistic Regression...')
oof_lr_train, oof_lr_test = get_oof_predictions(
    LogisticRegression(C=1.0, max_iter=1000, class_weight='balanced', random_state=SEED),
    X_tfidf_train, y_train, X_tfidf_test,
    n_folds=N_FOLDS, seed=SEED,
)

print('Gerando OOF para Naive Bayes...')
oof_nb_train, oof_nb_test = get_oof_predictions(
    MultinomialNB(alpha=0.1),
    X_tfidf_train, y_train, X_tfidf_test,
    n_folds=N_FOLDS, seed=SEED,
)

print('OOF gerados!')

In [None]:
# Stack the base models' outputs side by side: one column group per model,
# in the same order for train and test.
base_train_blocks = [oof_lr_train, oof_nb_train]
base_test_blocks = [oof_lr_test, oof_nb_test]

X_train_meta = np.hstack(base_train_blocks)
X_test_meta = np.hstack(base_test_blocks)

print(
    f'Meta-features train: {X_train_meta.shape}',
    f'Meta-features test: {X_test_meta.shape}',
    sep='\n',
)

In [None]:
# Meta-learner: logistic regression trained on the stacked meta-features.
meta_learner = LogisticRegression(C=1.0, max_iter=1000, random_state=SEED)

# Estimate the meta-learner's macro-F1 with stratified cross-validation.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
scores = cross_val_score(
    meta_learner,
    X_train_meta,
    y_train,
    scoring='f1_macro',
    cv=skf,
)
print(f'Meta-Learner CV F1-Macro: {scores.mean():.4f} (+/- {scores.std():.4f})')

# Final fit on all training meta-features.
meta_learner.fit(X_train_meta, y_train)

In [None]:
# Predict test labels with the fitted meta-learner.
predictions = meta_learner.predict(X_test_meta)

# Assemble the submission table: test IDs plus predicted targets.
submission = pd.DataFrame({'ID': test_df['ID']})
submission['target'] = predictions

# Write the file without the index column.
submission.to_csv('submission.csv', index=False)
print('submission.csv criado!')

# Show how many test rows were assigned to each class, ordered by class.
target_counts = submission['target'].value_counts().sort_index()
print(target_counts)