# SPR 2026 - Sentence Transformers

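Embeddings de sentenças pré-treinados combinados com classificadores de ML (Logistic Regression, SVM e LightGBM), comparados por validação cruzada estratificada com F1-macro.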

**Modelo:** paraphrase-multilingual-MiniLM-L12-v2

**Formato:** Code Competition (Kaggle) / Google Colab

In [None]:
# ============================================================
# SETUP - Environment detection and data/output locations
# ============================================================
import os
import sys

# Colab is checked first: a loaded `google.colab` module takes precedence
# over the presence of a /kaggle/input directory, so the two flags are
# never both True.
IS_COLAB = 'google.colab' in sys.modules
IS_KAGGLE = (not IS_COLAB) and os.path.exists('/kaggle/input')

if IS_KAGGLE:
    _env_label = 'Kaggle'
elif IS_COLAB:
    _env_label = 'Colab'
else:
    _env_label = 'Local'
print(f"Ambiente: {_env_label}")

if IS_KAGGLE:
    # Kaggle: read from the competition input, write to the working dir
    DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
    OUTPUT_DIR = '/kaggle/working'
elif IS_COLAB:
    # Colab: data and outputs both live under a Google Drive folder
    from google.colab import drive
    drive.mount('/content/drive')

    DRIVE_BASE = '/content/drive/MyDrive/SPR_2026_outputs'
    DATA_DIR = f'{DRIVE_BASE}/data'
    OUTPUT_DIR = DRIVE_BASE

    # Fail early when the training data has not been placed on Drive yet
    _train_csv = f'{DATA_DIR}/train.csv'
    if not os.path.exists(_train_csv):
        print("⚠️ Dados não encontrados no Drive!")
        print("Execute primeiro o notebook 00_download_data.ipynb")
        raise FileNotFoundError(f"Arquivo não encontrado: {_train_csv}")
else:
    # Local run: paths are relative to the notebook's working directory
    DATA_DIR = '../data'
    OUTPUT_DIR = '../submissions'
    os.makedirs(OUTPUT_DIR, exist_ok=True)

for _label, _path in (("DATA_DIR", DATA_DIR), ("OUTPUT_DIR", OUTPUT_DIR)):
    print(f"{_label}: {_path}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

# Experiment configuration shared by the cells below.
MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'  # sentence-transformers model to load
N_FOLDS = 5  # number of stratified cross-validation splits
SEED = 42  # random_state passed to the CV splitter and the classifiers

# Seed NumPy's global random generator as well.
np.random.seed(SEED)

## 1. Carregar Dados

In [None]:
# Training data is mandatory; pd.read_csv raises if the file is missing.
train_path = os.path.join(DATA_DIR, 'train.csv')
train = pd.read_csv(train_path)
print(f"Train: {train.shape}")

# The test file is optional at this point: `test` stays None when it is absent.
test_path = os.path.join(DATA_DIR, 'test.csv')
test = pd.read_csv(test_path) if os.path.exists(test_path) else None
if test is None:
    print("test.csv não disponível - será carregado no runtime Kaggle")
else:
    print(f"Test: {test.shape}")

## 2. Gerar Embeddings

In [None]:
# Load the pre-trained sentence encoder named by MODEL_NAME.
model = SentenceTransformer(MODEL_NAME)
print(f"Model loaded: {MODEL_NAME}")

# Embed every training report; the labels come from the `target` column.
print("Gerando embeddings do treino...")
train_reports = train['report'].tolist()
X = model.encode(train_reports, batch_size=32, show_progress_bar=True)
y = train['target'].values

print(f"X shape: {X.shape}")

In [None]:
# Embed the test reports only when a test set was loaded earlier;
# otherwise X_test is left undefined by this cell.
if test is not None:
    print("Gerando embeddings do teste...")
    test_reports = test['report'].tolist()
    X_test = model.encode(test_reports, batch_size=32, show_progress_bar=True)
    print(f"X_test shape: {X_test.shape}")

## 3. Treinar Classificadores

In [None]:
# Candidate classifiers, all using balanced class weights and the shared seed.
models = {
    'LogisticRegression': LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=SEED,
    ),
    'SVM': SVC(
        kernel='rbf',
        class_weight='balanced',
        random_state=SEED,
    ),
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=200,
        max_depth=10,
        class_weight='balanced',
        random_state=SEED,
        verbose=-1,
    ),
}

# One shuffled, stratified splitter is shared by every candidate.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
results = {}

for model_label, estimator in models.items():
    print(f"Training {model_label}...")
    fold_scores = cross_val_score(
        estimator, X, y, cv=skf, scoring='f1_macro', n_jobs=-1
    )
    results[model_label] = fold_scores
    mean_f1 = fold_scores.mean()
    spread = fold_scores.std() * 2  # reported as +/- two standard deviations
    print(f"  F1-Macro: {mean_f1:.4f} (+/- {spread:.4f})")

# Pick the candidate with the highest mean macro-F1 across folds.
best_name = max(results.items(), key=lambda item: item[1].mean())[0]
print(f"\nMelhor: {best_name}")

## 4. Gerar Submissão

In [None]:
# ============================================================
# Submission generation
# ============================================================
# Fit the selected classifier on the full training set. cross_val_score
# works on clones, so the estimator stored in `models` is still unfitted.
best_clf = models[best_name]
best_clf.fit(X, y)

# Read test.csv here only when the data-loading cell could not find it;
# pd.read_csv raises FileNotFoundError if it is still missing.
if test is None:
    test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Reuse the test embeddings from the earlier cell instead of encoding the
# whole test set a second time. Re-encode when they are missing (that cell
# was skipped or `test` was None back then) or when their row count does not
# match the loaded test set.
if 'X_test' not in globals() or len(X_test) != len(test):
    X_test = model.encode(test['report'].tolist(), show_progress_bar=True, batch_size=32)
predictions = best_clf.predict(X_test)

# Build the submission frame: one predicted target per test ID
submission = pd.DataFrame({'ID': test['ID'], 'target': predictions})

# Save submission.csv. The setup cell already points OUTPUT_DIR at
# /kaggle/working on Kaggle, so a single code path serves every environment.
submission_path = os.path.join(OUTPUT_DIR, 'submission.csv')
submission.to_csv(submission_path, index=False)
print(f"Submissão salva: {submission_path}")

print("\nDistribuição das predições:")
print(submission['target'].value_counts().sort_index())

In [None]:
# On Colab, pass the saved submission file to files.download (if it exists)
if IS_COLAB:
    from google.colab import files

    submission_path = os.path.join(OUTPUT_DIR, 'submission.csv')
    submission_exists = os.path.exists(submission_path)
    if submission_exists:
        files.download(submission_path)