# SPR 2026 - Word2Vec + SVM

**Word2Vec + Support Vector Machine**

SVM funciona bem com embeddings densos padronizados (média zero e variância unitária).

- ✅ Embeddings 100D + StandardScaler
- ✅ RBF kernel com class_weight='balanced'
- ✅ Tempo esperado: ~3-5 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. **IMPORTANTE:** Execute "Run All" após commit
---

In [None]:
# =============================================================================
# SPR 2026 - WORD2VEC + SVM (CÓDIGO CONSOLIDADO)
# =============================================================================

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from gensim.models import Word2Vec
import re
import warnings
warnings.filterwarnings('ignore')

# Banner printed at the start of the run.
print("="*60)
print("SPR 2026 - Word2Vec + SVM")
print("="*60)

# ==== SETTINGS ====
SEED = 42  # shared seed: used for NumPy here and handed to Word2Vec and SVC below
EMBEDDING_DIM = 100  # size of each Word2Vec vector (and of each report embedding)
# Directory holding the competition CSVs; the path only exists inside a Kaggle
# session with this dataset attached.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
np.random.seed(SEED)  # seed NumPy's global random number generator

# ==== LOAD DATA ====
print("\n[1/6] Carregando dados...")
# Read the labelled and unlabelled report tables from DATA_DIR.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
print(f"    Train: {train.shape} | Test: {test.shape}")

# ==== PREPROCESSING ====
print("\n[2/6] Preprocessando textos...")
def preprocess(text):
    """Lower-case a report and split it into alphabetic tokens.

    Every character other than ``a-z``, the accented letters listed in the
    pattern, and whitespace is replaced by a space before splitting, so
    digits and punctuation act as token separators.

    Args:
        text: Raw report text. Non-string values (for example the float NaN
            that pandas uses for an empty CSV cell, or ``None``) are treated
            as an empty report.

    Returns:
        The tokens in their original order; an empty list when there is no
        usable text.
    """
    if not isinstance(text, str):
        # Calling ``.lower()`` on NaN/None would raise AttributeError and
        # abort the whole ``apply``; an empty token list maps to a zero
        # embedding downstream instead.
        return []
    text = text.lower()
    text = re.sub(r'[^a-záàâãéèêíïóôõöúçñ\s]', ' ', text)
    return text.split()

# Tokenize every report once; the token lists are reused for Word2Vec training
# and for building the per-report embeddings.
train['tokens'] = train['report'].apply(preprocess)
test['tokens'] = test['report'].apply(preprocess)

# ==== WORD2VEC ====
print("\n[3/6] Treinando Word2Vec...")
# The embedding model is fitted on train and test reports together; only the
# text is used here, never the labels.
all_texts = train['tokens'].tolist() + test['tokens'].tolist()

w2v = Word2Vec(
    sentences=all_texts,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=2,  # words seen fewer than 2 times are left out of the vocabulary
    # With several worker threads the training order depends on OS thread
    # scheduling, so `seed` alone does not give repeatable vectors. A single
    # worker makes the run reproducible (gensim also asks for a fixed
    # PYTHONHASHSEED to be reproducible across interpreter launches).
    workers=1,
    epochs=10,
    seed=SEED,
)
print(f"    Vocabulário: {len(w2v.wv)} palavras")

# ==== GENERATE EMBEDDINGS ====
print("\n[4/6] Gerando embeddings (Mean Pooling)...")
def text_to_embedding(tokens, model, dim):
    """Mean-pool the word vectors of a tokenized report.

    Args:
        tokens: Iterable of word strings for one report.
        model: Object exposing a ``wv`` mapping that supports ``in`` and
            ``[]`` lookups by word (here, the trained Word2Vec model).
        dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or ``np.zeros(dim)`` when none of them is found.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[tok] for tok in tokens if tok in keyed_vectors]
    if known:
        return np.mean(known, axis=0)
    # No in-vocabulary token (or an empty report): fall back to the origin.
    return np.zeros(dim)

# Build one mean-pooled embedding per report and stack them into 2-D arrays
# (one row per report, EMBEDDING_DIM columns).
X_train = np.array([text_to_embedding(t, w2v, EMBEDDING_DIM) for t in train['tokens']])
X_test = np.array([text_to_embedding(t, w2v, EMBEDDING_DIM) for t in test['tokens']])
y_train = train['target'].values  # labels as a NumPy array
print(f"    X_train shape: {X_train.shape}")

# ==== STANDARDIZE ====
print("\n[5/6] Normalizando features...")
scaler = StandardScaler()
# Fit the scaling statistics on the training rows only, then apply the same
# transform to the test rows.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ==== SVM ====
print("\n[6/6] Treinando SVM...")
# RBF-kernel support vector classifier with balanced class weighting, seeded
# with the shared SEED.
model = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    class_weight='balanced',
    random_state=SEED
)
# Train on the standardized training embeddings and their labels.
model.fit(X_train_scaled, y_train)
print("    ✓ Modelo treinado!")

# ==== SUBMISSION ====
# Predict a class for every test report and write the ID/target pairs to
# submission.csv in the working directory, without the DataFrame index.
predictions = model.predict(X_test_scaled)
submission = pd.DataFrame({'ID': test['ID'], 'target': predictions})
submission.to_csv('submission.csv', index=False)

print("="*60)
print("✅ CONCLUÍDO - submission.csv criado!")
print("="*60)
# Show how many test reports were assigned to each class, ordered by label.
print("\nDistribuição das predições:")
print(submission['target'].value_counts().sort_index())