# SPR 2026 - TF-IDF Pretrain (Domain Adaptation)

## ✅ Funciona OFFLINE (com dataset adicional)

**Este notebook usa dataset médico externo para domain adaptation:**
- Treina TF-IDF em corpus médico expandido
- Melhora representação de termos médicos raros
- Combina vocabulário de múltiplas fontes

---
### Configuração Kaggle:
1. **Add Input** → **Datasets** → buscar `medicaltranscriptions`
   - Dataset: `tboyle10/medicaltranscriptions`
2. Settings → **Internet OFF**
3. Settings → Accelerator → **None** (não precisa GPU)
4. Run All

---
### Estratégia:
1. Carregar Medical Transcriptions (5000+ textos médicos)
2. Combinar com os textos de treino e de teste do SPR 2026
3. Treinar TF-IDF no corpus combinado (vocabulário expandido)
4. Treinar os classificadores apenas nos dados de treino SPR 2026

---

In [None]:
# ===== SPR 2026 - TF-IDF PRETRAIN (DOMAIN ADAPTATION) =====

import os
import re
import numpy as np
import pandas as pd
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.calibration import CalibratedClassifierCV

# Startup banner: rule, notebook title, rule.
print("=" * 60, "SPR 2026 - TF-IDF Pretrain (Domain Adaptation)", "=" * 60, sep="\n")

# ==== SETTINGS ====
# Single seed reused for NumPy's global RNG and every random_state below.
SEED = 42
np.random.seed(SEED)

# Root folder of the competition files (train.csv / test.csv).
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Candidate locations of the Medical Transcriptions CSV; the dataset folder
# name varies, so each known variant is listed in priority order.
PRETRAIN_PATHS = [
    f'/kaggle/input/{dataset_dir}/mtsamples.csv'
    for dataset_dir in (
        'medicaltranscriptions',
        'medical-transcriptions',
        'tboyle10-medicaltranscriptions',
    )
]

In [None]:
# ==== LOAD COMPETITION DATA ====
print("\n[1/7] Carregando dados da competição...")

# Read train.csv first, then test.csv, from the competition folder.
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

print(f"    Train SPR: {train.shape}", f"    Test SPR: {test.shape}", sep="\n")

# Per-class sample counts, ordered by label value.
print("\nDistribuição das classes:")
class_counts = train['label'].value_counts().sort_index()
for class_label, n_samples in class_counts.items():
    print(f"    Classe {class_label}: {n_samples:5d}")

In [None]:
# ==== LOAD PRE-TRAINING DATASET ====
print("\n[2/7] Carregando dataset de pré-treino...")

# Pass 1: the known mount points, in priority order; take the first that exists.
pretrain_path = next(
    (candidate for candidate in PRETRAIN_PATHS if os.path.exists(candidate)),
    None,
)

# Pass 2: nothing at the known locations, so walk /kaggle/input and take the
# first .csv whose name mentions "mtsample" or "transcription".
if pretrain_path is None:
    print("    Buscando em /kaggle/input...")
    for folder, _subdirs, filenames in os.walk('/kaggle/input'):
        matches = [
            name for name in filenames
            if ('mtsample' in name.lower() or 'transcription' in name.lower())
            and name.endswith('.csv')
        ]
        if matches:
            pretrain_path = os.path.join(folder, matches[0])
            break

# Load whichever file was located; pretrain_df stays None when none was.
pretrain_df = None
if pretrain_path is not None:
    pretrain_df = pd.read_csv(pretrain_path)
    print(f"    Encontrado: {pretrain_path}")

# USE_PRETRAIN tells the following cells whether the external corpus is available.
USE_PRETRAIN = pretrain_df is not None
if USE_PRETRAIN:
    print(f"\n    Pretrain dataset shape: {pretrain_df.shape}")
    print(f"    Colunas: {list(pretrain_df.columns)}")
else:
    # Explain how to attach the dataset, then carry on with competition data only.
    print(
        "\n⚠️ Dataset de pré-treino não encontrado!",
        "\nPara usar este notebook:",
        "1. Vá em 'Add Data'",
        "2. Busque por 'medicaltranscriptions'",
        "3. Adicione o dataset 'tboyle10/medicaltranscriptions'",
        "\nContinuando sem pré-treino (usando apenas dados da competição)...",
        sep="\n",
    )

In [None]:
# ==== BUILD COMBINED CORPUS ====
print("\n[3/7] Preparando corpus combinado...")

def preprocess(text):
    """Normalize one raw text value into a lowercase, single-spaced string.

    Values that ``pd.isna`` reports as missing become ``""``. Anything else is
    converted with ``str``, lowercased, stripped of digit runs of five or more
    characters (long identifiers), and has every whitespace run collapsed to a
    single space with leading/trailing whitespace removed.
    """
    if not pd.isna(text):
        lowered = str(text).lower()
        without_long_ids = re.sub(r'\d{5,}', '', lowered)
        return re.sub(r'\s+', ' ', without_long_ids).strip()
    return ""

# Competition texts, normalized with the same routine as the external corpus.
spr_texts = train['text'].apply(preprocess).tolist()
test_texts = test['text'].apply(preprocess).tolist()

# Stays empty unless usable external texts are found below.
pretrain_texts = []

if USE_PRETRAIN:
    # Prefer the known free-text columns, in priority order. Category-style
    # columns are deliberately not listed: their short values would all be
    # discarded by the minimum-length filter further down.
    preferred_cols = ('transcription', 'text', 'description')
    text_col = next(
        (col for col in preferred_cols if col in pretrain_df.columns), None
    )

    if text_col is None:
        # Fallback: pick the object column with the longest average text,
        # instead of blindly taking the first one (which also raised
        # IndexError when the frame had no object columns at all).
        object_cols = pretrain_df.select_dtypes(include='object')
        mean_lengths = {
            col: object_cols[col].dropna().astype(str).str.len().mean()
            for col in object_cols.columns
        }
        # Columns with no non-null values produce NaN and cannot be ranked.
        mean_lengths = {
            col: length for col, length in mean_lengths.items() if pd.notna(length)
        }
        if mean_lengths:
            text_col = max(mean_lengths, key=mean_lengths.get)

    if text_col is not None:
        print(f"    Usando coluna: '{text_col}'")
        # Keep only documents with more than 50 characters after normalization.
        pretrain_texts = [
            doc
            for doc in pretrain_df[text_col].dropna().apply(preprocess).tolist()
            if len(doc) > 50
        ]

    if not pretrain_texts:
        # Nothing usable came out of the external dataset: downgrade the flag
        # so the corpus and the final summary reflect what was actually used.
        print("    Nenhum texto de pré-treino utilizável; continuando sem pré-treino.")
        USE_PRETRAIN = False

if USE_PRETRAIN:
    # TF-IDF corpus: competition train + external corpus + competition test.
    all_texts = spr_texts + pretrain_texts + test_texts

    print(f"\n    Textos SPR (train): {len(spr_texts)}")
    print(f"    Textos pretrain: {len(pretrain_texts)}")
    print(f"    Textos SPR (test): {len(test_texts)}")
    print(f"    Total corpus: {len(all_texts)}")
else:
    # No external corpus: use competition texts only.
    all_texts = spr_texts + test_texts
    print(f"    Corpus (sem pretrain): {len(all_texts)}")

In [None]:
# ==== TF-IDF WITH EXPANDED VOCABULARY ====
print("\n[4/7] Treinando TF-IDF com vocabulário expandido...")

# Fit the vectorizer on the whole corpus (competition + external texts) so the
# vocabulary and IDF weights are learned from every available document.
# NOTE(review): max_features keeps the most frequent terms across the fitted
# corpus, so terms that occur only in the external texts can use up part of
# the 20000-term budget — confirm this is the intended trade-off.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),       # unigrams through trigrams
    min_df=2,
    max_df=0.95,
    max_features=20000,
    sublinear_tf=True,
    strip_accents='unicode',
).fit(all_texts)

print(f"    Vocabulário: {len(tfidf.vocabulary_)} termos")

# Targets come straight from the competition training frame.
y_train = train['label'].values

# Only the competition texts are vectorized; the external corpus is used for
# fitting alone.
X_train_tfidf, X_test_tfidf = (
    tfidf.transform(docs) for docs in (spr_texts, test_texts)
)

print(f"    X_train shape: {X_train_tfidf.shape}")
print(f"    X_test shape: {X_test_tfidf.shape}")

In [None]:
# ==== VOCABULARY ANALYSIS ====
print("\n--- Análise do vocabulário ---")

# Domain terms whose presence in the fitted vocabulary we want to check.
medical_terms = [
    'nodulo', 'nódulo', 'calcificação', 'calcificacao', 'mama', 
    'benigno', 'maligno', 'birads', 'bi-rads', 'categoria',
    'assimetria', 'espiculado', 'massa', 'lesão', 'linfonodo',
    'mamografia', 'ultrassom', 'biópsia', 'axila', 'parênquima',
]

# Vocabulary keys are stored AFTER the vectorizer's own preprocessing
# (lowercasing, accent stripping) and tokenization, with n-grams joined by a
# single space. Looking up the raw probe would therefore never match accented
# or hyphenated terms, so each probe is pushed through the same pipeline first.
normalize_term = tfidf.build_preprocessor()
tokenize_term = tfidf.build_tokenizer()
vocab_keys = [
    (term, ' '.join(tokenize_term(normalize_term(term))))
    for term in medical_terms
]

found_terms = [term for term, key in vocab_keys if key in tfidf.vocabulary_]
missing_terms = [term for term, key in vocab_keys if key not in tfidf.vocabulary_]

print(f"\n    Termos médicos encontrados: {len(found_terms)}/{len(medical_terms)}")
if missing_terms:
    print(f"    Faltando: {missing_terms[:10]}...")

# Feature names and their IDF weights, aligned by column index.
feature_names = tfidf.get_feature_names_out()
idf_scores = tfidf.idf_

# Lowest IDF = terms present in the most documents of the fitted corpus.
common_idx = np.argsort(idf_scores)[:20]
print(f"\n    Top 20 termos mais comuns:")
print(f"    {[feature_names[i] for i in common_idx]}")

In [None]:
# ==== TRAIN TOP 3 MODELS ====
print("\n[5/7] Treinando Top 3 modelos...")

# 1. LinearSVC (best single model: 0.77885)
print("\n--- LinearSVC ---")
svc = LinearSVC(
    C=0.5,
    loss='squared_hinge',
    max_iter=3000,
    class_weight='balanced',
    dual='auto',
    random_state=SEED
)
# LinearSVC has no predict_proba; the calibration wrapper adds one, which the
# ensemble step needs.
svc_calibrated = CalibratedClassifierCV(svc, cv=3)
svc_calibrated.fit(X_train_tfidf, y_train)
print("    Treinado!")

# 2. SGDClassifier (second best: 0.75019)
print("\n--- SGDClassifier ---")
sgd = SGDClassifier(
    loss='modified_huber',  # one of the SGD losses that supports predict_proba
    alpha=1e-4,
    penalty='l2',
    max_iter=1000,
    class_weight='balanced',
    random_state=SEED,
    n_jobs=-1
)
sgd.fit(X_train_tfidf, y_train)
print("    Treinado!")

# 3. LogisticRegression (third: 0.72935)
print("\n--- LogisticRegression ---")
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases (the warning was hidden by the global warning filter),
# and the lbfgs solver already fits a multinomial model when the target has
# more than two classes. `n_jobs` is dropped as well, since it only
# parallelizes one-vs-rest fits.
# NOTE(review): assumes `label` has more than two classes — confirm.
lr = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    random_state=SEED,
)
lr.fit(X_train_tfidf, y_train)
print("    Treinado!")

In [None]:
# ==== CROSS-VALIDATION ====
print("\n[6/7] Cross-Validation (5-fold)...")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Build the CV estimators from the hyperparameters of the models that are
# actually fitted and submitted. Re-declaring them by hand had let the two
# drift apart (LinearSVC and LogisticRegression ran with their default
# max_iter here), so the reported scores described different models.
# get_params() returns constructor arguments only, so these copies are unfitted.
svc_cv = CalibratedClassifierCV(LinearSVC(**svc.get_params()), cv=3)
sgd_cv = SGDClassifier(**sgd.get_params())
lr_cv = LogisticRegression(**lr.get_params())

# Note: the TF-IDF features were fitted on the full corpus before splitting,
# so each held-out fold already contributed to the vocabulary and IDF weights.
svc_scores = cross_val_score(svc_cv, X_train_tfidf, y_train, cv=cv, scoring='f1_macro')
sgd_scores = cross_val_score(sgd_cv, X_train_tfidf, y_train, cv=cv, scoring='f1_macro')
lr_scores = cross_val_score(lr_cv, X_train_tfidf, y_train, cv=cv, scoring='f1_macro')

# Macro-F1 mean and standard deviation across the five folds.
print(f"\n    LinearSVC:     {svc_scores.mean():.5f} ± {svc_scores.std():.5f}")
print(f"    SGDClassifier: {sgd_scores.mean():.5f} ± {sgd_scores.std():.5f}")
print(f"    LogisticReg:   {lr_scores.mean():.5f} ± {lr_scores.std():.5f}")

In [None]:
# ==== ENSEMBLE PREDICTIONS ====
print("\n[7/7] Gerando predições ensemble...")

# Hard predictions from each model.
pred_svc = svc_calibrated.predict(X_test_tfidf)
pred_sgd = sgd.predict(X_test_tfidf)
pred_lr = lr.predict(X_test_tfidf)

# Class probabilities from each model.
proba_svc = svc_calibrated.predict_proba(X_test_tfidf)
proba_sgd = sgd.predict_proba(X_test_tfidf)
proba_lr = lr.predict_proba(X_test_tfidf)

# predict_proba columns follow each estimator's `classes_`. Averaging the
# matrices is only meaningful if all three models use the same class order.
class_labels = svc_calibrated.classes_
if not (
    np.array_equal(class_labels, sgd.classes_)
    and np.array_equal(class_labels, lr.classes_)
):
    raise ValueError("Os modelos do ensemble não compartilham a mesma ordem de classes.")

# Weighted soft voting; weights are the per-model scores noted in the training
# cell, normalized to sum to 1.
weights = np.array([0.77885, 0.75019, 0.72935])
weights = weights / weights.sum()

proba_weighted = (
    proba_svc * weights[0] + 
    proba_sgd * weights[1] + 
    proba_lr * weights[2]
)
# argmax yields a COLUMN INDEX; translate it back to the real class label so
# the result stays correct when labels are not exactly 0..K-1.
pred_weighted = class_labels[np.argmax(proba_weighted, axis=1)]

print(f"\n    LinearSVC dist:  {dict(Counter(pred_svc))}")
print(f"    Weighted dist:   {dict(Counter(pred_weighted))}")

In [None]:
# ==== SUBMISSIONS ====
print("\n--- Gerando submissões ---")

# 1. Calibrated LinearSVC predictions.
submission_svc = pd.DataFrame(dict(id=test['id'], label=pred_svc))
submission_svc.to_csv(
    '/kaggle/working/submission_linearsvc_pretrain.csv', index=False
)
print("    submission_linearsvc_pretrain.csv")

# 2. Weighted soft-voting ensemble predictions.
submission_weighted = pd.DataFrame(dict(id=test['id'], label=pred_weighted))
submission_weighted.to_csv(
    '/kaggle/working/submission_weighted_pretrain.csv', index=False
)
print("    submission_weighted_pretrain.csv")

# MAIN submission file: the LinearSVC predictions (best single model).
submission_svc.to_csv('/kaggle/working/submission.csv', index=False)

# Closing banner stating whether the external corpus was used.
if USE_PRETRAIN:
    pretrain_status = "COM"
else:
    pretrain_status = "SEM"
print("\n" + "="*60)
print(f"CONCLUÍDO - submission.csv ({pretrain_status} pretrain)")
print("="*60)

# Short recap of the domain-adaptation setup.
print("\nDomain Adaptation:")
if USE_PRETRAIN:
    recap_lines = [
        f"  - Corpus expandido: {len(all_texts)} textos",
        f"  - Vocabulário: {len(tfidf.vocabulary_)} termos",
        "  - Melhor representação de termos médicos raros",
    ]
else:
    recap_lines = [
        "  - Usando apenas dados da competição",
        "  - Adicione 'medicaltranscriptions' para domain adaptation",
    ]
print("\n".join(recap_lines))