# SPR 2026 - Sentence Transformers

**SBERT: embeddings densos de alta qualidade**

- ✅ paraphrase-multilingual-MiniLM-L12-v2
- ✅ Embeddings 384D pré-treinados
- ✅ Tempo esperado: ~5-10 min

---
**CONFIGURAÇÃO KAGGLE:**
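1. **Neste notebook:** Settings → Internet → **OFF** (o modelo é carregado offline a partir de `/kaggle/input`)
2. **ANTES de rodar este notebook:** rode `models/sbert/download_sbert.ipynb` com Internet **ON** e faça Save Version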
3. Add Data → **Your Work** → `download_sbert` (ou qualquer output com o modelo SBERT)
4. **IMPORTANTE:** Execute "Run All" após commit

> ⚠️ O modelo `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` **não está disponível** 
> diretamente no Kaggle Models. Use o notebook de download primeiro.
---

In [None]:
# =============================================================================
# SPR 2026 - SBERT: SENTENCE TRANSFORMERS + LIGHTGBM
# =============================================================================
# - paraphrase-multilingual-MiniLM-L12-v2 (offline)
# - Embeddings 384D
# - LightGBM classifier
# =============================================================================

import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

# Seed shared by NumPy and the LightGBM classifier further down.
SEED = 42
# Directory holding the competition's train.csv / test.csv.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# =============================================================================
# AUTO-DETECT THE MODEL UNDER /kaggle/input
# =============================================================================
MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
base = '/kaggle/input'

# Substrings looked for in a directory name after lower-casing it and
# stripping '-' and '_'.
SEARCH_TERMS = ('paraphrasemultilingualminilm', 'minilml12v2', 'sbertmultilingual')
# Files whose presence marks a directory as a model folder.
MODEL_MARKERS = ('config.json', 'modules.json')
# Weight-file extensions accepted (top-level candidate only) as a fallback.
WEIGHT_SUFFIXES = ('.bin', '.safetensors')


def _has_model_markers(folder):
    """Return True if *folder* contains config.json or modules.json."""
    return any(os.path.exists(os.path.join(folder, marker)) for marker in MODEL_MARKERS)


def _has_weight_files(folder):
    """Return True if *folder* directly contains a .bin/.safetensors file."""
    return any(
        entry.endswith(WEIGHT_SUFFIXES) and os.path.isfile(os.path.join(folder, entry))
        for entry in os.listdir(folder)
    )


def _subdirs(folder):
    """Return the immediate sub-directories of *folder*, sorted by name.

    Sorting makes the scan order (and therefore the selected model
    directory) reproducible; ``os.listdir`` returns entries in arbitrary
    order.
    """
    return [
        os.path.join(folder, entry)
        for entry in sorted(os.listdir(folder))
        if os.path.isdir(os.path.join(folder, entry))
    ]


def find_model_dir(root, search_terms=SEARCH_TERMS):
    """Locate the SBERT model directory below *root*.

    A directory directly under *root* is a candidate when its normalised
    name (lower-case, without '-' and '_') contains one of *search_terms*.
    For each candidate, in sorted order, the search checks:

    1. the candidate itself (marker files or weight files),
    2. each of its sub-directories (marker files only), and
    3. each sub-sub-directory (marker files only), visited depth-first.

    The first match wins and the search stops immediately. The previous
    inline loops only broke out of the innermost loop on a nested match,
    so a later directory could overwrite an earlier hit.

    Args:
        root: Directory to scan (e.g. '/kaggle/input').
        search_terms: Normalised substrings identifying candidate folders.

    Returns:
        The path of the first matching model directory, or None when
        *root* is not a directory or nothing matches.
    """
    if not os.path.isdir(root):
        return None

    for candidate in _subdirs(root):
        normalized = os.path.basename(candidate).lower().replace('-', '').replace('_', '')
        if not any(term in normalized for term in search_terms):
            continue

        # The candidate folder itself holds the model files.
        if _has_model_markers(candidate) or _has_weight_files(candidate):
            return candidate

        # Nested layouts: look one and two levels below the candidate.
        for child in _subdirs(candidate):
            if _has_model_markers(child):
                return child
            for grandchild in _subdirs(child):
                if _has_model_markers(grandchild):
                    return grandchild

    return None


# None when no model directory was found; checked before loading the model.
MODEL_PATH = find_model_dir(base)

# Seed NumPy's global random generator with the shared SEED.
np.random.seed(SEED)
print('[1/5] Bibliotecas carregadas!')
print('DATA_DIR ->', DATA_DIR)

# Report the outcome of the model auto-detection. When nothing was found,
# list what is available under `base` to help diagnose the missing input.
if not MODEL_PATH:
    print('⚠️ Modelo não encontrado em /kaggle/input')
    available_inputs = 'N/A'
    if os.path.exists(base):
        available_inputs = os.listdir(base)
    print('Pastas disponíveis:', available_inputs)
else:
    print('MODEL_PATH ->', MODEL_PATH, '(auto-detectado)')

# =============================================================================
# LOAD DATA
# =============================================================================
# Read train.csv first, then test.csv, from DATA_DIR.
train, test = (pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test'))
print(f'[2/5] Train: {train.shape} | Test: {test.shape}')

# =============================================================================
# LOAD SENTENCE TRANSFORMER
# =============================================================================
# Abort with setup instructions when auto-detection found no model directory.
if MODEL_PATH is None:
    missing_model_help = [
        f"Modelo '{MODEL_NAME}' não encontrado em /kaggle/input.",
        "No Kaggle:",
        "  1. Add Data → Datasets → procure 'sentence-transformers minilm multilingual'",
        "  2. Ou rode models/sbert/download_sbert.ipynb com Internet ON primeiro",
        "  3. Depois Add Data → Your Work → download_sbert",
    ]
    raise FileNotFoundError("\n".join(missing_model_help))

# Load the model from the detected local directory.
model = SentenceTransformer(MODEL_PATH)
print(f'[3/5] Modelo SentenceTransformer carregado de {MODEL_PATH}')

# =============================================================================
# GENERATE EMBEDDINGS
# =============================================================================
# Encoding options shared by the train and test passes.
encode_options = {'show_progress_bar': True, 'batch_size': 32}

print('Gerando embeddings do treino...')
train_reports = train['report'].tolist()
X_train = model.encode(train_reports, **encode_options)
# Labels come straight from the 'target' column of the training frame.
y_train = train['target'].values

print('Gerando embeddings do teste...')
test_reports = test['report'].tolist()
X_test = model.encode(test_reports, **encode_options)
print(f'[4/5] Embeddings: X_train {X_train.shape} | X_test {X_test.shape}')

# =============================================================================
# TRAIN LIGHTGBM
# =============================================================================
# Hyper-parameters for the classifier, kept together in one mapping.
lgbm_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'learning_rate': 0.05,
    'class_weight': 'balanced',
    'random_state': SEED,
    'verbose': -1,
}
clf = lgb.LGBMClassifier(**lgbm_params)

# Fit on the sentence embeddings and the training labels.
clf.fit(X_train, y_train)
print('[5/5] LightGBM treinado!')

# =============================================================================
# SUBMISSION
# =============================================================================
# Predict a label for every test embedding.
predictions = clf.predict(X_test)

# Two-column frame: the test IDs followed by the predicted target.
submission = pd.DataFrame({'ID': test['ID']}).assign(target=predictions)

submission.to_csv('submission.csv', index=False)

print('✅ CONCLUÍDO: submission.csv')
# Show how many rows were assigned to each predicted class, ordered by label.
predicted_class_counts = submission['target'].value_counts().sort_index()
print(predicted_class_counts)