# SPR 2026 - Word2Vec + Classifiers

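Embeddings Word2Vec (média dos vetores das palavras de cada laudo) usados como features para classificadores de ML (Regressão Logística e LightGBM).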

**Formato:** Code Competition (Kaggle) / Google Colab

In [1]:
# ============================================================
# SETUP - Environment and Data
# ============================================================
import os
import sys

# Detect Colab FIRST (most reliable signal); Kaggle is only considered
# when the Colab module is not loaded, so the two flags never overlap.
IS_COLAB = 'google.colab' in sys.modules
IS_KAGGLE = (not IS_COLAB) and os.path.exists('/kaggle/input')

if IS_KAGGLE:
    _env_name = 'Kaggle'
elif IS_COLAB:
    _env_name = 'Colab'
else:
    _env_name = 'Local'
print(f"Ambiente: {_env_name}")

if IS_KAGGLE:
    # Kaggle: read-only competition input, writable working directory.
    DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
    OUTPUT_DIR = '/kaggle/working'
elif IS_COLAB:
    # Colab: data and outputs both live on the mounted Google Drive.
    from google.colab import drive
    drive.mount('/content/drive')

    DRIVE_BASE = '/content/drive/MyDrive/SPR_2026_outputs'
    DATA_DIR = f'{DRIVE_BASE}/data'
    OUTPUT_DIR = DRIVE_BASE

    # Fail early if the training data has not been copied to Drive yet.
    _train_csv = f'{DATA_DIR}/train.csv'
    if not os.path.exists(_train_csv):
        print("⚠️ Dados não encontrados no Drive!")
        print("Execute primeiro o notebook 00_download_data.ipynb")
        raise FileNotFoundError(f"Arquivo não encontrado: {_train_csv}")
else:
    # Local run: paths relative to the notebook; create the output folder.
    DATA_DIR = '../data'
    OUTPUT_DIR = '../submissions'
    os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"DATA_DIR: {DATA_DIR}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")

Ambiente: Colab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DATA_DIR: /content/drive/MyDrive/SPR_2026_outputs/data
OUTPUT_DIR: /content/drive/MyDrive/SPR_2026_outputs


In [4]:
!pip install gensim -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from gensim.models import Word2Vec
import re
import warnings
# Suppress every warning raised by the imported libraries for the rest of the session.
warnings.filterwarnings('ignore')

# Global configuration.
# SEED: shared random seed passed to Word2Vec, the classifiers and the CV splitter.
SEED = 42
# N_FOLDS: number of splits used by StratifiedKFold.
N_FOLDS = 5
# EMBEDDING_DIM: size of each Word2Vec vector (and of each document embedding).
EMBEDDING_DIM = 100
# Seed NumPy's global random generator.
np.random.seed(SEED)

ModuleNotFoundError: No module named 'gensim'

## 1. Carregar Dados

In [None]:
# The training set is mandatory: read it straight away.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
print(f"Train: {train.shape}")

# The test set is optional at this stage; `test` stays None when the file
# is absent so later cells can check for it.
test_path = os.path.join(DATA_DIR, 'test.csv')
test = pd.read_csv(test_path) if os.path.exists(test_path) else None
if test is None:
    print("test.csv não disponível - será carregado no runtime Kaggle")
else:
    print(f"Test: {test.shape}")

## 2. Pré-processamento

In [None]:
def preprocess(text):
    """Lower-case a report and split it into word tokens.

    Every character that is not a lowercase letter (ASCII or one of the
    accented letters listed in the pattern) or whitespace is replaced by a
    space, so digits and punctuation act as token separators.

    Args:
        text: Raw report text. Non-string values (e.g. the float NaN that
            pandas uses for a missing cell, or None) are treated as empty.

    Returns:
        list[str]: The tokens in order of appearance; empty for blank or
        missing input.
    """
    # Missing reports arrive as NaN/None; `.lower()` would raise on them.
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r'[^a-záàâãéèêíïóôõöúçñ\s]', ' ', text)
    return text.split()

# Tokenize every report; the test frame is handled only when it was loaded.
for _frame in (train, test):
    if _frame is not None:
        _frame['tokens'] = _frame['report'].map(preprocess)

# Show the first ten tokens of the first training report as a sanity check.
_first_tokens = train['tokens'].iloc[0]
print(f"Exemplo: {_first_tokens[:10]}")

## 3. Treinar Word2Vec

In [None]:
# Build the Word2Vec corpus from every tokenized report available
# (train, plus test when it was loaded). `.tolist()` returns a new list,
# so extending it does not touch the DataFrame.
all_texts = train['tokens'].tolist()
if test is not None:
    all_texts += test['tokens'].tolist()

w2v = Word2Vec(
    sentences=all_texts,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=2,
    # A single worker is required for `seed` to give repeatable embeddings:
    # with several worker threads, OS thread scheduling changes the order of
    # updates and the vectors differ from run to run. (Full determinism on
    # Python 3 additionally needs a fixed PYTHONHASHSEED.)
    workers=1,
    epochs=10,
    seed=SEED
)

print(f"Vocabulário: {len(w2v.wv)} palavras")

In [None]:
def text_to_embedding(tokens, model, dim):
    """Represent a document as the mean of its word vectors.

    Args:
        tokens: Iterable of word tokens for one document.
        model: Object exposing `wv`, which supports `in` and item lookup by
            token (here a trained Word2Vec model).
        dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens,
        or `np.zeros(dim)` when none of the tokens is in the vocabulary.
    """
    lookup = model.wv
    found = []
    for token in tokens:
        # Out-of-vocabulary tokens are skipped rather than zero-filled.
        if token in lookup:
            found.append(lookup[token])
    if found:
        return np.mean(found, axis=0)
    return np.zeros(dim)

# One averaged embedding per training report, stacked into a 2-D matrix.
_train_vectors = [
    text_to_embedding(doc_tokens, w2v, EMBEDDING_DIM)
    for doc_tokens in train['tokens']
]
X = np.array(_train_vectors)

# Labels aligned row-by-row with X.
y = train['target'].values

print(f"X shape: {X.shape}")

## 4. Treinar Classificadores

In [None]:
# Shared, seeded stratified splitter so both models see the same folds.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Candidate classifiers, keyed by display name (insertion order is kept).
models = {}
models['LogisticRegression'] = LogisticRegression(
    max_iter=1000, class_weight='balanced', random_state=SEED
)
models['LightGBM'] = lgb.LGBMClassifier(
    n_estimators=200, max_depth=10, class_weight='balanced', random_state=SEED, verbose=-1
)

# Per-fold macro-F1 scores for each candidate.
results = {}
for clf_name, clf in models.items():
    fold_scores = cross_val_score(clf, X, y, cv=skf, scoring='f1_macro', n_jobs=-1)
    results[clf_name] = fold_scores
    mean_f1 = fold_scores.mean()
    spread = fold_scores.std() * 2  # reported as +/- two standard deviations
    print(f"{clf_name}: F1-Macro = {mean_f1:.4f} (+/- {spread:.4f})")

# Select the candidate with the highest mean macro-F1 (first one wins ties).
best_name, _ = max(results.items(), key=lambda item: item[1].mean())
print(f"\nMelhor: {best_name}")

## 5. Gerar Submissão

In [None]:
# ============================================================
# Submission generation
# ============================================================
# Refit the best cross-validated model on the full training set.
best_model = models[best_name]
best_model.fit(X, y)

# (Re)load the test set and tokenize it the same way as the training data.
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
test['tokens'] = test['report'].map(preprocess)

# Embed each test report and predict its class.
_test_vectors = [
    text_to_embedding(doc_tokens, w2v, EMBEDDING_DIM)
    for doc_tokens in test['tokens']
]
X_test = np.array(_test_vectors)
predictions = best_model.predict(X_test)

# Assemble the submission frame: one row per test ID.
submission = pd.DataFrame({'ID': test['ID'], 'target': predictions})

# ALWAYS write submission.csv to the current directory (required by Kaggle).
submission.to_csv('submission.csv', index=False)
print("✅ submission.csv salvo no diretório atual")

# Outside Kaggle, also keep a copy in OUTPUT_DIR for persistence (Colab/Local).
if not IS_KAGGLE:
    submission_path = os.path.join(OUTPUT_DIR, 'submission_word2vec.csv')
    submission.to_csv(submission_path, index=False)
    print(f"✅ Cópia salva em: {submission_path}")

# Class distribution of the predictions, ordered by label.
_target_counts = submission['target'].value_counts().sort_index()
print(f"\nDistribuição das predições:")
print(_target_counts)

In [None]:
# Optional: on Colab, trigger a browser download of the submission file
# (only if it has actually been written to the current directory).
if IS_COLAB:
    if os.path.exists('submission.csv'):
        from google.colab import files
        files.download('submission.csv')