# SPR 2026 - Word Embeddings (Word2Vec, FastText, GloVe)
## Notebook para Google Colab

Abordagem usando word embeddings pré-treinados com classificadores.

**Embeddings considerados:**
- Word2Vec (NILC - Portuguese) — único efetivamente baixado e avaliado nas células abaixo
- FastText (Portuguese)
- GloVe (Portuguese)

## 1. Setup e Download dos Dados

In [None]:
# Install the packages this notebook needs: the Kaggle CLI, gensim and
# LightGBM (-q keeps pip's output quiet).
!pip install kaggle gensim lightgbm -q

In [None]:
# Configure the Kaggle API credentials.
#
# 1. Try to read KAGGLE_USERNAME / KAGGLE_KEY from the Colab secret store and
#    export them as environment variables.
# 2. If both values are available, persist them to ~/.kaggle/kaggle.json
#    (owner-only permissions) for the `kaggle` CLI used in the next cell.
import json
import os

try:
    from google.colab import userdata
    os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
    os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')
    print("Kaggle credentials loaded from Colab secrets")
except Exception as exc:
    # Best-effort: outside Colab the import fails, and inside Colab the secret
    # may be missing or access may be denied. `Exception` (instead of a bare
    # `except:`) lets KeyboardInterrupt / SystemExit propagate.
    # To configure manually, uncomment and fill in:
    # os.environ['KAGGLE_USERNAME'] = 'seu_username'
    # os.environ['KAGGLE_KEY'] = 'sua_key'
    print("Configure suas credenciais Kaggle manualmente")
    print(f"  ({type(exc).__name__}: {exc})")

kaggle_username = os.environ.get('KAGGLE_USERNAME')
kaggle_key = os.environ.get('KAGGLE_KEY')

if kaggle_username and kaggle_key:
    kaggle_dir = os.path.expanduser('~/.kaggle')
    os.makedirs(kaggle_dir, exist_ok=True)
    kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')
    # Create the file with mode 0600 from the start so the key is never
    # world-readable, and let json.dump handle quoting/escaping of the values.
    fd = os.open(kaggle_json, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w') as fh:
        json.dump({'username': kaggle_username, 'key': kaggle_key}, fh)
    # A pre-existing file keeps its old mode on open(); tighten it explicitly.
    os.chmod(kaggle_json, 0o600)
else:
    # Do not overwrite a kaggle.json the user may have provided by hand with
    # empty credentials.
    print("Kaggle credentials not set; leaving ~/.kaggle/kaggle.json untouched")

In [None]:
# Download the competition archive with the Kaggle CLI, then extract it into
# data/ (-o overwrites files left by a previous run without prompting).
!kaggle competitions download -c spr-2026-mammography-report-classification
!unzip -o spr-2026-mammography-report-classification.zip -d data/

In [None]:
# Download the Portuguese Word2Vec vectors (NILC) into embeddings/ and unzip them.
# Available variants: cbow_s50, cbow_s100, cbow_s300, skip_s50, skip_s100, skip_s300
# Only cbow_s100 is fetched here; the loading cell reads embeddings/cbow_s100.txt,
# presumably the file this archive extracts to (confirm after unzipping).
# NOTE(review): the URL is unquoted although it contains `?` and `=`; consider
# quoting it so the shell never treats `?` as a glob character.
!mkdir -p embeddings
!wget -nc http://143.107.183.175:22980/download.php?file=embeddings/word2vec/cbow_s100.zip -O embeddings/cbow_s100.zip
!unzip -o embeddings/cbow_s100.zip -d embeddings/

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from gensim.models import KeyedVectors
import lightgbm as lgb
import re
import warnings
# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Shared random_state for the CV splitter and the classifiers below.
SEED = 42
# Number of stratified cross-validation folds.
N_FOLDS = 5
# Seed NumPy's global random generator as well.
np.random.seed(SEED)

## 2. Carregar Dados e Embeddings

In [None]:
# Load the training split and report its dimensions.
train_df = pd.read_csv('data/train.csv')
print(f"Train shape: {train_df.shape}")

# The test split is optional: test_df stays None when the file is absent.
test_path = 'data/test.csv'
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = None

In [None]:
# Load the pre-trained Word2Vec vectors downloaded earlier into a gensim
# KeyedVectors object.
print("Carregando Word2Vec...")
w2v_model = KeyedVectors.load_word2vec_format('embeddings/cbow_s100.txt')
# Sanity check: vocabulary size and embedding dimensionality.
print(f"Vocabulário: {len(w2v_model)} palavras")
print(f"Dimensão: {w2v_model.vector_size}")

## 3. Pré-processamento e Vetorização

In [None]:
# Anything that is not a lowercase Portuguese letter or whitespace.
# Compiled once instead of on every call.
_NON_LETTER_RE = re.compile(r'[^a-záàâãéêíóôõúç\s]')


def preprocess_text(text):
    """Normalize a report and split it into lowercase word tokens.

    The text is lowercased, every character that is not a (Portuguese)
    letter or whitespace is replaced by a space, and the result is split on
    whitespace.

    Args:
        text: The raw report. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as "no text".

    Returns:
        list[str]: The tokens, in order; ``[]`` for empty or missing text.
    """
    # A missing value would otherwise be stringified into the bogus token
    # "none" / "nan". `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    lowered = str(text).lower()
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace, so no extra normalization is needed.
    return _NON_LETTER_RE.sub(' ', lowered).split()

def get_sentence_embedding(tokens, model, method='mean'):
    """Aggregate the word vectors of ``tokens`` into one fixed-size vector.

    Tokens that are not in ``model`` are skipped.

    Args:
        tokens: Iterable of word tokens.
        model: Embedding lookup supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute.
        method: Aggregation strategy:
            ``'mean'``   - element-wise mean of the word vectors;
            ``'max'``    - element-wise maximum;
            ``'concat'`` - mean and max concatenated (twice as wide).
            Any other value falls back to the mean.

    Returns:
        numpy.ndarray: 1-D vector of length ``model.vector_size``
        (``2 * model.vector_size`` for ``'concat'``). All zeros when no token
        is in the vocabulary.
    """
    vectors = [model[token] for token in tokens if token in model]

    if not vectors:
        # The fallback must be as wide as a regular result for this method;
        # otherwise stacking the 'concat' rows into a matrix produces a
        # ragged array.
        width = model.vector_size * 2 if method == 'concat' else model.vector_size
        return np.zeros(width)

    vectors = np.array(vectors)

    if method == 'max':
        return vectors.max(axis=0)
    if method == 'concat':
        return np.concatenate([vectors.mean(axis=0), vectors.max(axis=0)])
    # 'mean' and any unrecognized method.
    return vectors.mean(axis=0)

def texts_to_embeddings(texts, model, method='mean'):
    """Embed every text in ``texts`` and stack the results into one array.

    Each text is tokenized with ``preprocess_text`` and aggregated with
    ``get_sentence_embedding`` using ``model`` and ``method``; the per-text
    vectors are returned as a single NumPy array, one row per text.
    """
    return np.array([
        get_sentence_embedding(preprocess_text(text), model, method)
        for text in texts
    ])

In [None]:
# Convert the training reports to sentence embeddings, once per aggregation
# method, and keep each matrix in embeddings_cache for the following cells.
print("Convertendo textos para embeddings...")

# Aggregation strategies to compare.
methods = ['mean', 'max', 'concat']
embeddings_cache = {}

# The list of reports does not depend on the method: build it once instead of
# on every loop iteration.
train_reports = train_df['report'].tolist()

for method in methods:
    print(f"  Método: {method}")
    X = texts_to_embeddings(train_reports, w2v_model, method)
    embeddings_cache[method] = X
    print(f"    Shape: {X.shape}")

## 4. Treinar e Avaliar Modelos

In [None]:
# Labels and the stratified splitter shared by every cross-validation run.
y = train_df['target'].values
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

results = []


def _cv_macro_f1(estimator, features):
    """Return (mean, std) of the macro-F1 of ``estimator`` across the folds."""
    fold_scores = cross_val_score(estimator, features, y, cv=skf, scoring='f1_macro')
    return fold_scores.mean(), fold_scores.std()


# Evaluate both classifiers on every cached embedding matrix.
for method in methods:
    X = embeddings_cache[method]

    # Logistic Regression
    lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=SEED)
    f1_mean, f1_std = _cv_macro_f1(lr, X)
    results.append({'method': method, 'model': 'LogisticRegression', 'f1_macro': f1_mean, 'std': f1_std})
    # The printed interval is two standard deviations; the stored 'std' is one.
    print(f"{method} + LR: F1={f1_mean:.4f} (+/- {f1_std*2:.4f})")

    # LightGBM
    lgbm = lgb.LGBMClassifier(n_estimators=200, class_weight='balanced', random_state=SEED, verbose=-1)
    f1_mean, f1_std = _cv_macro_f1(lgbm, X)
    results.append({'method': method, 'model': 'LightGBM', 'f1_macro': f1_mean, 'std': f1_std})
    print(f"{method} + LGBM: F1={f1_mean:.4f} (+/- {f1_std*2:.4f})")

In [None]:
# Summary: rank every (aggregation method, classifier) pair by mean macro-F1,
# best first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='f1_macro', ascending=False)

print("\n" + "="*60)
print("RESUMO DOS RESULTADOS")
print("="*60)
print(results_df.to_string(index=False))

# After the descending sort the first row is the winning configuration.
best = results_df.iloc[0]
print(f"\nMelhor: {best['method']} + {best['model']} (F1={best['f1_macro']:.4f})")

## 5. Treinar Modelo Final e Gerar Submissão

In [None]:
# Take the winning aggregation method and its cached training matrix.
best_method = best['method']
X_train = embeddings_cache[best_method]

# Rebuild the winning classifier with the same hyperparameters used in the
# cross-validation cell; anything other than LogisticRegression is LightGBM.
if best['model'] != 'LogisticRegression':
    final_model = lgb.LGBMClassifier(n_estimators=200, class_weight='balanced', random_state=SEED, verbose=-1)
else:
    final_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=SEED)

# Fit on the full training set.
final_model.fit(X_train, y)
print("Modelo final treinado!")

In [None]:
# Build the submission file from the test split, when one was loaded.
if test_df is None:
    print("Test file não disponível")
else:
    # Embed the test reports with the same aggregation used for training.
    X_test = texts_to_embeddings(test_df['report'].tolist(), w2v_model, best_method)
    predictions = final_model.predict(X_test)

    submission = pd.DataFrame({'ID': test_df['ID'], 'target': predictions})
    submission.to_csv('submission_word2vec.csv', index=False)

    print("Submissão salva: submission_word2vec.csv")
    # Predicted class distribution, ordered by class label.
    print(submission['target'].value_counts().sort_index())

In [None]:
# Offer the submission file as a browser download when running in Colab;
# elsewhere just report where the file is.
try:
    from google.colab import files
    files.download('submission_word2vec.csv')
except ImportError:
    # Not running inside Colab: there is no download helper.
    print("Submissão disponível em: submission_word2vec.csv")
except Exception as exc:
    # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
    # propagate, and the actual failure is shown instead of being hidden.
    print(f"Download failed ({type(exc).__name__}: {exc})")
    print("Submissão disponível em: submission_word2vec.csv")