# SPR 2026 - Sentence Transformers

**SBERT: embeddings densos de alta qualidade**

- ✅ paraphrase-multilingual-MiniLM-L12-v2
- ✅ Embeddings 384D pré-treinados
- ✅ Tempo esperado: ~5-10 min

---
**CONFIGURAÇÃO KAGGLE:**
1. Settings → Internet → **OFF**
2. Add Data → **Models** → `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`
3. **IMPORTANTE:** Execute "Run All" após commit

> O notebook auto-detecta o modelo em `/kaggle/input/`.
---

In [None]:
# =============================================================================
# SPR 2026 - SBERT: SENTENCE TRANSFORMERS + LIGHTGBM
# =============================================================================
# - paraphrase-multilingual-MiniLM-L12-v2 (offline)
# - Embeddings 384D
# - LightGBM classifier
# =============================================================================

import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

# Seed shared by NumPy's global RNG and LightGBM's random_state.
SEED = 42
# Competition dataset directory; train.csv and test.csv are read from here.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# =============================================================================
# AUTO-DETECTAR MODELO EM /kaggle/input
# =============================================================================
def find_sbert_model(base='/kaggle/input', max_depth=4):
    """Locate a SentenceTransformer model directory below *base*.

    A directory counts as a model when it directly contains a
    ``modules.json`` file. Directories are visited depth-first, in sorted
    name order, so the result is reproducible when several models are
    attached (``os.listdir`` alone returns entries in arbitrary order).

    Args:
        base: Directory to search. ``base`` itself is never reported as a
            match; only its descendants are inspected.
        max_depth: Number of directory levels below ``base`` to inspect
            (1 means only the immediate children). The default of 4
            covers the nested layout used by Kaggle Models.

    Returns:
        The path of the first matching directory, or ``None`` when
        ``base`` is not a directory or nothing matches within
        ``max_depth`` levels.
    """
    if not os.path.isdir(base):
        return None

    def _is_sbert_folder(path):
        """Return True if *path* is a directory holding modules.json."""
        return os.path.isdir(path) and os.path.isfile(
            os.path.join(path, 'modules.json')
        )

    def _search(folder, depth):
        """Pre-order search of *folder*, whose children sit at *depth*."""
        for name in sorted(os.listdir(folder)):
            candidate = os.path.join(folder, name)
            if not os.path.isdir(candidate):
                continue
            if _is_sbert_folder(candidate):
                return candidate
            # Descend only while the next level is still within the limit.
            if depth < max_depth:
                found = _search(candidate, depth + 1)
                if found is not None:
                    return found
        return None

    return _search(base, 1)

# Resolve the offline model directory once; None means nothing was found.
MODEL_PATH = find_sbert_model()

np.random.seed(SEED)
print('[1/5] Bibliotecas carregadas!')
print('DATA_DIR ->', DATA_DIR)

# Debug aid: print up to five entries of every item in /kaggle/input and
# flag the ones that contain modules.json, so a missing or misplaced model
# is easy to spot in the notebook output.
print('\n📁 Estrutura de /kaggle/input:')
base = '/kaggle/input'
if os.path.isdir(base):
    for d in sorted(os.listdir(base)):
        path = os.path.join(base, d)
        print(f'  {d}/')
        if os.path.isdir(path):
            # List the folder once instead of re-reading it for every use.
            entries = sorted(os.listdir(path))
            for sub in entries[:5]:
                subpath = os.path.join(path, sub)
                marker = '📂' if os.path.isdir(subpath) else '📄'
                has_modules = ' ✅ modules.json' if os.path.exists(os.path.join(subpath, 'modules.json')) else ''
                print(f'    {marker} {sub}{has_modules}')
            if len(entries) > 5:
                print(f'    ... (+{len(entries) - 5} mais)')

print()
if MODEL_PATH:
    print(f'✅ MODEL_PATH -> {MODEL_PATH}')
else:
    print('❌ Modelo SBERT não encontrado em /kaggle/input')

# =============================================================================
# CARREGAR DADOS
# =============================================================================
# Read both competition splits from DATA_DIR and report their shapes.
train_csv = f'{DATA_DIR}/train.csv'
test_csv = f'{DATA_DIR}/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)
print(f'[2/5] Train: {train.shape} | Test: {test.shape}')

# =============================================================================
# CARREGAR SENTENCE TRANSFORMER
# =============================================================================
# Fail fast with setup instructions when no model directory was detected,
# rather than letting SentenceTransformer fail on a None path.
if MODEL_PATH is None:
    raise FileNotFoundError(
        "Modelo SBERT não encontrado em /kaggle/input.\n\n"
        "COMO RESOLVER:\n"
        "  Add Data → Models → sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )

# Load the model from the local directory found above.
model = SentenceTransformer(MODEL_PATH)
print(f'[3/5] Modelo SentenceTransformer carregado de {MODEL_PATH}')

# =============================================================================
# GERAR EMBEDDINGS
# =============================================================================
# Encode the 'report' column of each split into sentence embeddings;
# both splits use the same encoder settings.
encode_options = {'show_progress_bar': True, 'batch_size': 32}

print('Gerando embeddings do treino...')
train_reports = train['report'].tolist()
X_train = model.encode(train_reports, **encode_options)
y_train = train['target'].values  # labels for the classifier

print('Gerando embeddings do teste...')
test_reports = test['report'].tolist()
X_test = model.encode(test_reports, **encode_options)
print(f'[4/5] Embeddings: X_train {X_train.shape} | X_test {X_test.shape}')

# =============================================================================
# TREINAR LIGHTGBM
# =============================================================================
# Hyperparameters for the LightGBM classifier trained on the embeddings.
lgbm_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'learning_rate': 0.05,
    'class_weight': 'balanced',  # reweight classes by inverse frequency
    'random_state': SEED,
    'verbose': -1,  # keep LightGBM's training log quiet
}
clf = lgb.LGBMClassifier(**lgbm_params)

clf.fit(X_train, y_train)
print('[5/5] LightGBM treinado!')

# =============================================================================
# SUBMISSÃO
# =============================================================================
# Predict a class label for every test embedding.
predictions = clf.predict(X_test)

# Pair each test ID with its predicted target.
submission = pd.DataFrame({
    'ID': test['ID'],
    'target': predictions
})

# index=False keeps the pandas row index out of the CSV.
submission.to_csv('submission.csv', index=False)

print('✅ CONCLUÍDO: submission.csv')
# Predicted class distribution, as a quick sanity check.
print(submission['target'].value_counts().sort_index())