# SPR 2026 - Ensemble (TF-IDF + Word2Vec)

**Votação majoritária de 3 modelos:**
- TF-IDF + Logistic Regression
- TF-IDF + LightGBM
- Word2Vec + LightGBM

- ✅ Não requer modelos externos
- ✅ Tempo esperado: ~5-10 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. **IMPORTANTE:** Execute "Run All" após commit
---

In [None]:
# =============================================================================
# SPR 2026 - ENSEMBLE: TF-IDF + WORD2VEC (VOTAÇÃO MAJORITÁRIA)
# =============================================================================
# - Modelo 1: TF-IDF + Logistic Regression
# - Modelo 2: TF-IDF + LightGBM
# - Modelo 3: Word2Vec + LightGBM
# - Combinação: Votação majoritária (mode)
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from gensim.models import Word2Vec
from scipy.stats import mode
import re
import warnings
# Suppress every warning category so the notebook output stays compact.
warnings.filterwarnings('ignore')

# Global configuration shared by all sections of the script.
SEED = 42  # random seed reused for NumPy and for each model
EMBEDDING_DIM = 100  # dimensionality of the Word2Vec vectors
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'  # competition data directory
np.random.seed(SEED)
print('[1/7] Bibliotecas carregadas!')

# =============================================================================
# LOAD DATA
# =============================================================================
# Read the train and test CSVs from DATA_DIR. The labels are taken from the
# 'target' column of the training set as a plain NumPy array.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
y = train['target'].values
print(f'[2/7] Train: {train.shape} | Test: {test.shape}')

# =============================================================================
# MODEL 1: TF-IDF + LOGISTIC REGRESSION
# =============================================================================
# Unigram + bigram TF-IDF features feeding a class-balanced logistic regression.
tfidf1_params = dict(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)
tfidf1 = TfidfVectorizer(**tfidf1_params)

# The vocabulary and IDF weights are learned from the training reports only;
# the test reports are merely projected onto them.
X_tfidf_train = tfidf1.fit_transform(train['report'])
X_tfidf_test = tfidf1.transform(test['report'])

model_lr = LogisticRegression(
    C=1.0,
    max_iter=1000,
    class_weight='balanced',
    random_state=SEED,
    n_jobs=-1,
)
model_lr.fit(X_tfidf_train, y)
preds_lr = model_lr.predict(X_tfidf_test)
print(f'[3/7] Modelo 1 (TF-IDF + LR): {len(preds_lr)} predições')

# =============================================================================
# MODEL 2: TF-IDF + LIGHTGBM
# =============================================================================
# A second, wider TF-IDF space (up to trigrams, 20k features) feeding a
# class-balanced gradient-boosted tree classifier.
tfidf2_params = dict(
    max_features=20000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)
tfidf2 = TfidfVectorizer(**tfidf2_params)

# Fit on the training reports only, then project the test reports.
X_tfidf2_train = tfidf2.fit_transform(train['report'])
X_tfidf2_test = tfidf2.transform(test['report'])

lgbm_tfidf_params = dict(
    n_estimators=300,
    max_depth=12,
    learning_rate=0.05,
    num_leaves=64,
    class_weight='balanced',
    random_state=SEED,
    n_jobs=-1,
    verbose=-1,
)
model_lgbm = lgb.LGBMClassifier(**lgbm_tfidf_params)
model_lgbm.fit(X_tfidf2_train, y)
preds_lgbm = model_lgbm.predict(X_tfidf2_test)
print(f'[4/7] Modelo 2 (TF-IDF + LGBM): {len(preds_lgbm)} predições')

# =============================================================================
# MODELO 3: WORD2VEC + LIGHTGBM
# =============================================================================
# Matches every character that is NOT a lowercase ASCII letter, one of the
# listed accented letters, or whitespace. Compiled once at import time because
# preprocess() is applied to every report.
_NON_LETTER_RE = re.compile(r'[^a-záàâãéèêíïóôõöúçñ\s]')


def preprocess(text):
    """Tokenize a report into lowercase, letters-only words.

    The text is lowercased, every character outside the allowed letter set
    (digits, punctuation, other symbols) is replaced by a space, and the
    result is split on whitespace.

    Args:
        text: Raw report text. Non-string values -- e.g. the float NaN that
            pandas produces for an empty CSV cell, or None -- are treated as
            an empty document instead of raising AttributeError.

    Returns:
        A list of token strings; empty when the input has no letters or is
        not a string.
    """
    if not isinstance(text, str):
        return []
    return _NON_LETTER_RE.sub(' ', text.lower()).split()

# Tokenize every report once and keep the token lists as a new 'tokens' column
# on each DataFrame.
train['tokens'] = train['report'].apply(preprocess)
test['tokens'] = test['report'].apply(preprocess)

# The embedding model is trained on the train and test token lists together;
# no labels are involved in this step.
all_texts = train['tokens'].tolist() + test['tokens'].tolist()
# NOTE(review): gensim's documentation says `seed` alone does not give a fully
# deterministic run when workers > 1 (thread scheduling jitter) -- confirm
# whether exact reproducibility matters here; if so, workers=1 is required.
w2v = Word2Vec(sentences=all_texts, vector_size=EMBEDDING_DIM, window=5, min_count=2, workers=4, epochs=10, seed=SEED)

def text_to_embedding(tokens, model, dim):
    """Average the word vectors of a tokenized document.

    Args:
        tokens: Iterable of token strings for one document.
        model: Object exposing a ``wv`` mapping that supports ``in`` and
            ``[]`` lookups by token (e.g. a trained Word2Vec model).
        dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or ``np.zeros(dim)`` when none of them is found.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[tok] for tok in tokens if tok in keyed_vectors]
    if known:
        return np.mean(known, axis=0)
    # No in-vocabulary token: fall back to an all-zero embedding.
    return np.zeros(dim)

# One averaged Word2Vec vector per report, stacked into a 2-D feature matrix.
X_w2v_train = np.array([
    text_to_embedding(doc_tokens, w2v, EMBEDDING_DIM)
    for doc_tokens in train['tokens']
])
X_w2v_test = np.array([
    text_to_embedding(doc_tokens, w2v, EMBEDDING_DIM)
    for doc_tokens in test['tokens']
])

# Class-balanced LightGBM on the dense document embeddings.
lgbm_w2v_params = dict(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.05,
    class_weight='balanced',
    random_state=SEED,
    verbose=-1,
)
model_w2v = lgb.LGBMClassifier(**lgbm_w2v_params)
model_w2v.fit(X_w2v_train, y)
preds_w2v = model_w2v.predict(X_w2v_test)
print(f'[5/7] Modelo 3 (Word2Vec + LGBM): {len(preds_w2v)} predições')

# =============================================================================
# ENSEMBLE: MAJORITY VOTE
# =============================================================================
# Column order: model 1 (TF-IDF + LR), model 2 (TF-IDF + LGBM), model 3 (W2V + LGBM).
preds_matrix = np.column_stack([preds_lr, preds_lgbm, preds_w2v])

# Majority vote of three voters with an explicit tie-break:
#   * models 2 and 3 agree      -> their label has at least 2 votes and wins;
#   * otherwise, if model 1 agrees with either of them -> model 1's label wins;
#   * otherwise all three differ -> fall back to model 1's prediction.
# The last two cases both yield model 1's label, so a single np.where suffices.
# This replaces scipy.stats.mode, which resolves a three-way tie by returning
# the smallest label and therefore biases disputed rows toward the lowest class.
# NOTE(review): model 1 is used as the tie-breaker by position; if validation
# shows another model is stronger, make that one the fallback instead.
ensemble_preds = np.where(preds_lgbm == preds_w2v, preds_lgbm, preds_lr)
print(f'[6/7] Ensemble: {len(ensemble_preds)} predições')

# =============================================================================
# SUBMISSION
# =============================================================================
# Pair each test ID with its ensemble label (cast to int) in a two-column frame.
submission = pd.DataFrame({
    'ID': test['ID'],
    'target': ensemble_preds.astype(int)
})

# Write the file without the DataFrame index, then print how many rows were
# assigned to each predicted class, ordered by class label.
submission.to_csv('submission.csv', index=False)
print('[7/7] ✅ CONCLUÍDO: submission.csv')
print(submission['target'].value_counts().sort_index())

# Per-model class distribution: one line per model with the number of test
# predictions per class label (counts padded to at least 7 entries).
print('\nDistribuição por modelo:')
for row_label, row_preds in (
    ('LR:       ', preds_lr),
    ('LGBM:     ', preds_lgbm),
    ('W2V:      ', preds_w2v),
    ('Ensemble: ', ensemble_preds.astype(int)),
):
    print(f'{row_label}{np.bincount(row_preds, minlength=7)}')