# SPR 2026 - Sentence Transformers

**SBERT: embeddings densos de alta qualidade**

- ✅ paraphrase-multilingual-MiniLM-L12-v2
- ✅ Embeddings 384D pré-treinados
- ✅ Tempo esperado: ~5-10 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Add Data → Models → `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`
3. **IMPORTANTE:** Execute "Run All" após commit
---

In [None]:
# =============================================================================
# SPR 2026 - SBERT: SENTENCE TRANSFORMERS + LIGHTGBM
# =============================================================================
# - paraphrase-multilingual-MiniLM-L12-v2 (offline)
# - Embeddings 384D
# - LightGBM classifier
# =============================================================================

import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

SEED = 42
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
# MODEL_PATH is auto-detected from /kaggle/input when a matching folder exists.

# Prefer a local copy of the model under /kaggle/input; otherwise fall back to
# the plain Hugging Face model name.
MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
base = '/kaggle/input'


def resolve_model_root(folder, markers=('modules.json', 'config.json')):
    """Return the first directory at or below *folder* holding a marker file.

    An attached model may keep its files in nested sub-folders rather than in
    the top-level folder, so the tree is walked parent-first and the first
    directory containing one of *markers* is returned. When no marker file is
    found anywhere, *folder* itself is returned unchanged.
    """
    for current, subdirs, files in os.walk(folder):
        subdirs.sort()  # os.walk order is otherwise filesystem-dependent
        if any(marker in files for marker in markers):
            return current
    return folder


def find_model_candidates(input_root, model_name):
    """List local model directories under *input_root* matching *model_name*.

    A top-level folder matches when its lower-cased name contains the
    lower-cased *model_name*, or contains both 'paraphrase' and 'minilm'.
    Entries are visited in sorted order so the result is deterministic, and
    each match is resolved to the directory that actually holds the model
    files. Returns an empty list when *input_root* is not a directory.
    """
    if not os.path.isdir(input_root):
        return []
    wanted = model_name.lower()
    found = []
    for entry in sorted(os.listdir(input_root)):
        path = os.path.join(input_root, entry)
        lowered = entry.lower()
        name_matches = wanted in lowered or ('paraphrase' in lowered and 'minilm' in lowered)
        if os.path.isdir(path) and name_matches:
            found.append(resolve_model_root(path))
    return found


model_candidates = find_model_candidates(base, MODEL_NAME)

# With a local copy, load strictly from disk; otherwise let the library
# resolve the model by name.
local_only = bool(model_candidates)
MODEL_PATH = model_candidates[0] if local_only else MODEL_NAME

np.random.seed(SEED)
print('[1/5] Bibliotecas carregadas!')
print('DATA_DIR ->', DATA_DIR)
print('MODEL_PATH ->', MODEL_PATH, '| local_only =', local_only)

# =============================================================================
# LOAD DATA
# =============================================================================
# Both CSV files are read from directly under DATA_DIR.
train_csv = f'{DATA_DIR}/train.csv'
test_csv = f'{DATA_DIR}/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)
print(f'[2/5] Train: {train.shape} | Test: {test.shape}')

# =============================================================================
# CARREGAR SENTENCE TRANSFORMER
# =============================================================================
try:
    model = SentenceTransformer(MODEL_PATH, local_files_only=local_only)
    print('[3/5] Modelo SentenceTransformer carregado de', MODEL_PATH)
except Exception as e:
    raise FileNotFoundError(
        f"Erro ao carregar o modelo '{MODEL_PATH}': {e}\nNo Kaggle: verifique Add Data → Models (modelo deve aparecer em /kaggle/input).\nPara baixar da HuggingFace ative Internet nas Settings."
,
,
,
,
,
,
,
,
,
,
,
4
5
,
,
,
,
,
,
,
,
,
5
5
,
,
,
,
,
,
,
,
,
,
,
,