# SPR 2026 - Word2Vec + TF-IDF Weighted Mean

**Agregação ponderada por TF-IDF**

Em vez de uma média simples, cada embedding é ponderado pelo IDF da palavra; como cada ocorrência da palavra entra na soma, o peso efetivo de um termo no laudo é TF × IDF.
Isso dá mais peso a termos mais informativos e menos peso a palavras comuns.

---
**CONFIGURAÇÃO OFFLINE:**
1. No Kaggle, vá em Settings → Internet → **OFF**
2. Gensim e scikit-learn já estão pré-instalados
---

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import re
import warnings
warnings.filterwarnings('ignore')

# Single random seed shared by NumPy, Word2Vec and LogisticRegression.
SEED = 42
# Size of each Word2Vec vector (passed as `vector_size`) and therefore of
# every report embedding.
EMBEDDING_DIM = 100
# Kaggle input directory that holds train.csv and test.csv.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
# Seed NumPy's global random generator.
np.random.seed(SEED)
print('Bibliotecas carregadas!')

In [None]:
# Load the train and test splits from the competition directory
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

# Report the (rows, columns) shape of each split
for label, frame in (('Train', train), ('Test', test)):
    print(f'{label}: {frame.shape}')

In [None]:
# Preprocessing
# Matches every character that is neither a lowercase letter (plain or one of
# the listed accented forms) nor whitespace. Compiled once at definition time
# instead of being looked up on every call.
_NON_LETTER_RE = re.compile(r'[^a-záàâãéèêíïóôõöúçñ\s]')


def preprocess(text):
    """Return the list of cleaned, lowercase tokens of a report.

    The text is cleaned exactly as in ``preprocess_for_tfidf`` and then split
    on whitespace, so the token list and the cleaned string always agree.

    Args:
        text: Raw report text. Non-string values (e.g. the float NaN pandas
            uses for a missing cell, or None) are treated as an empty report.

    Returns:
        A list of tokens; empty when the report has no letters.
    """
    return preprocess_for_tfidf(text).split()


def preprocess_for_tfidf(text):
    """Return the report lowercased with every non-letter replaced by a space.

    Each removed character becomes exactly one space, so runs of punctuation
    or digits leave runs of spaces; original whitespace is kept as is.

    Args:
        text: Raw report text. Non-string values (e.g. the float NaN pandas
            uses for a missing cell, or None) are treated as an empty report.

    Returns:
        The cleaned string; ``''`` for non-string input.
    """
    # A missing report must not crash the whole `.apply`; it simply
    # contributes no text.
    if not isinstance(text, str):
        return ''
    return _NON_LETTER_RE.sub(' ', text.lower())

# Add two derived columns to each split: the token list (corpus for Word2Vec)
# and the cleaned string (input of the TF-IDF vectorizer).
for frame in (train, test):
    frame['tokens'] = frame['report'].apply(preprocess)
    frame['clean_text'] = frame['report'].apply(preprocess_for_tfidf)

# Show the first five tokens of the first training report as a sanity check
print(f'Exemplo tokens: {train["tokens"].iloc[0][:5]}')

In [None]:
# Train Word2Vec on the token lists of both splits
all_texts = train['tokens'].tolist() + test['tokens'].tolist()

# NOTE: `seed` alone does not make gensim deterministic. With several worker
# threads the OS scheduling changes the order of the updates, so the
# embeddings (and every prediction derived from them) would differ between
# runs. gensim documents that a reproducible run requires a single worker.
# NOTE(review): gensim also mentions PYTHONHASHSEED for reproducibility across
# interpreter launches; that must be set before the kernel starts.
w2v = Word2Vec(
    sentences=all_texts,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=2,  # ignore words seen fewer than 2 times in the corpus
    workers=1,
    epochs=10,
    seed=SEED,
)

print(f'Vocabulário W2V: {len(w2v.wv)} palavras')

In [None]:
# Build the IDF table used as per-word weights.
#
# The vectorizer has to cover the same vocabulary as Word2Vec; any embedded
# word that is missing here silently falls back to the default weight of 1.0
# in the aggregation step, which is the *lowest* possible IDF.
#   * min_df=1 / max_df=1.0: no document-frequency pruning. A document
#     frequency cut-off is not the same as Word2Vec's occurrence-count
#     cut-off: a word repeated inside a single report is embedded, and being
#     the rarest kind of word it must receive a high IDF, not the fallback.
#   * token_pattern keeps one-letter tokens (the default pattern requires two
#     or more characters). The cleaned text contains only letters and
#     whitespace, so these tokens coincide with the whitespace-split tokens.
tfidf = TfidfVectorizer(min_df=1, max_df=1.0, token_pattern=r'(?u)\b\w+\b')
all_clean_texts = train['clean_text'].tolist() + test['clean_text'].tolist()
tfidf.fit(all_clean_texts)

# Map word -> IDF (feature names and idf_ are aligned by position)
word2idf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))
print(f'Vocabulário TF-IDF: {len(word2idf)} palavras')

In [None]:
# IDF-weighted aggregation of word vectors
def text_to_embedding_tfidf_weighted(tokens, w2v_model, idf_dict, dim):
    """Return the IDF-weighted mean of the word vectors of ``tokens``.

    Every token occurrence that has a vector contributes once, so a repeated
    word counts as many times as it appears.

    Args:
        tokens: Iterable of word strings.
        w2v_model: Object whose ``wv`` attribute supports ``word in wv`` and
            ``wv[word]`` (e.g. a trained gensim Word2Vec model).
        idf_dict: Mapping word -> weight; words absent from it weigh 1.0.
        dim: Length of the zero vector returned when no token has a vector.

    Returns:
        A 1-D NumPy array: the weighted average of the found vectors, or
        ``np.zeros(dim)`` when none of the tokens is in the model.
    """
    keyed_vectors = w2v_model.wv
    known = [tok for tok in tokens if tok in keyed_vectors]
    if not known:
        return np.zeros(dim)

    matrix = np.array([keyed_vectors[tok] for tok in known])
    # Column vector so that each weight scales its whole embedding row.
    idf = np.array([idf_dict.get(tok, 1.0) for tok in known]).reshape(-1, 1)

    # Weighted mean: sum of weight * vector divided by the sum of weights
    return np.sum(matrix * idf, axis=0) / np.sum(idf)

In [None]:
# Build the feature matrices: one weighted embedding per report
def _embed_reports(token_lists):
    """Stack the weighted embedding of every token list into one array."""
    rows = [
        text_to_embedding_tfidf_weighted(tokens, w2v, word2idf, EMBEDDING_DIM)
        for tokens in token_lists
    ]
    return np.array(rows)

X_train = _embed_reports(train['tokens'])
X_test = _embed_reports(test['tokens'])
y_train = train['target'].values

print(f'X_train shape: {X_train.shape}')

In [None]:
# Fit a logistic regression on the report embeddings
lr_params = {
    'C': 1.0,
    'max_iter': 1000,
    'class_weight': 'balanced',  # reweight classes by inverse frequency
    'random_state': SEED,
    'n_jobs': -1,
}
model = LogisticRegression(**lr_params)

model.fit(X_train, y_train)
print('Modelo treinado!')

In [None]:
# Predict the test split and write the submission file
predictions = model.predict(X_test)

# Keep the test IDs and attach the predicted class as the `target` column
submission = test[['ID']].assign(target=predictions)
submission.to_csv('submission.csv', index=False)

print('submission.csv criado!')
# Predicted class distribution, ordered by class label
class_counts = submission['target'].value_counts()
print(class_counts.sort_index())