# Validação: LLMs Instruction-Tuned (Phi, Mistral, BioGPT)

**Comparativo LLMs com Instruction Tuning**

## 📊 Modelos
- **Phi-3.5** (3.8B) - Microsoft, eficiente
- **Mistral** (7B) - Mistral AI, forte em tarefas
- **BioGPT** - Microsoft, treinado em PubMed

## 🎯 Objetivo
Comparar LLMs instruction-tuned para classificação BI-RADS.

---

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed the NumPy and PyTorch RNGs so sampling and splits are repeatable.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# On Kaggle the competition data is mounted under /kaggle/input; otherwise the
# notebook reads from a sibling "data" directory.
DATA_DIR = (
    '/kaggle/input/competitions/spr-2026-mammography-report-classification'
    if os.path.exists('/kaggle/input')
    else '../data'
)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Device: {device}')

In [None]:
# ===== DATA =====
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')

# Small per-class sample for a quick validation run: at most MAX_PER_CLASS
# reports for each target value. The groups are iterated explicitly instead of
# going through groupby(...).apply(...): recent pandas releases deprecate (and
# later stop) handing the grouping column to the applied function, which would
# drop 'target' from the sampled frame and break the split below.
MAX_PER_CLASS = 20
train_sample = pd.concat(
    [
        group.sample(min(MAX_PER_CLASS, len(group)), random_state=SEED)
        for _, group in train_df.groupby('target')
    ]
).reset_index(drop=True)

# Stratified 70/30 split of the sample.
# NOTE(review): stratification presumably needs at least two reports per
# class; confirm the class counts in train.csv satisfy that.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_sample['report'].tolist(),
    train_sample['target'].tolist(),
    test_size=0.3,
    stratify=train_sample['target'],
    random_state=SEED
)

print(f'Train: {len(train_texts)}, Val: {len(val_texts)}')

In [None]:
# ===== UNIVERSAL PROMPT =====
# System instruction shared by the chat-style models: one line per BI-RADS
# category, followed by the required answer format.
SYSTEM_PROMPT = "\n".join([
    "You are a radiologist classifying mammography reports into BI-RADS categories.",
    "",
    "Categories:",
    "0 = Incomplete (needs more imaging)",
    "1 = Negative (normal)",
    "2 = Benign",
    "3 = Probably Benign (<2% malignancy)",
    "4 = Suspicious (2-95% malignancy, biopsy needed)",
    "5 = Highly Suspicious (>95% malignancy)",
    "6 = Known Malignancy (biopsy-proven)",
    "",
    "Respond with ONLY the category number (0-6).",
])

# User turn: the report text followed by a cue for the category number.
USER_TEMPLATE = "Report: {report}\n\nBI-RADS:"

In [None]:
# ===== HELPER FUNCTIONS =====
def find_model_in_kaggle(keyword, base='/kaggle/input'):
    """Locate a model directory under the Kaggle input folder.

    A model directory is any directory that contains a ``config.json`` file.
    Only top-level entries of ``base`` whose name contains ``keyword``
    (case-insensitive) are searched.

    Args:
        keyword: Substring to look for in the top-level entry names.
        base: Directory to search. Defaults to the Kaggle input mount.

    Returns:
        Path of the first model directory found, or ``None`` when ``base``
        does not exist or nothing matches.
    """
    if not os.path.isdir(base):
        return None

    needle = keyword.lower()
    # Sorted so the result does not depend on the filesystem's listing order.
    for item in sorted(os.listdir(base)):
        if needle not in item.lower():
            continue
        path = os.path.join(base, item)
        if not os.path.isdir(path):
            # A matching plain file cannot hold a model; listing it would raise.
            continue

        # Same precedence as before: an immediate sub-directory first, then
        # the matching directory itself.
        for sub in sorted(os.listdir(path)):
            subpath = os.path.join(path, sub)
            if os.path.isdir(subpath) and os.path.exists(os.path.join(subpath, 'config.json')):
                return subpath
        if os.path.exists(os.path.join(path, 'config.json')):
            return path

        # Otherwise look deeper, for datasets that nest the model files more
        # than one level down.
        for root, dirs, files in os.walk(path):
            dirs.sort()
            if 'config.json' in files:
                return root
    return None

def load_model(model_path, local_files_only=None):
    """Load a causal language model and its tokenizer.

    Args:
        model_path: Local model directory or a Hugging Face Hub model id.
        local_files_only: Passed to both ``from_pretrained`` calls. When left
            as ``None`` it is enabled only if ``model_path`` is an existing
            directory, so a Hub id used as fallback is not forced to resolve
            from local files alone.

    Returns:
        ``(model, tokenizer)``. The tokenizer always has a pad token: the EOS
        token is reused when none is defined.
    """
    if local_files_only is None:
        local_files_only = os.path.isdir(model_path)

    tokenizer = AutoTokenizer.from_pretrained(
        model_path, local_files_only=local_files_only
    )
    # SECURITY NOTE: trust_remote_code=True lets the checkpoint run its own
    # Python code while loading. Only point this at sources you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_path, local_files_only=local_files_only,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True
    )
    if tokenizer.pad_token is None:
        # generate() is given pad_token_id later, so make sure one exists.
        tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

def _build_prompt(tokenizer, report, system_prompt):
    """Return ``(prompt_text, add_special_tokens)`` for one report."""
    user_content = USER_TEMPLATE.format(report=report)
    plain_prompt = f"{system_prompt}\n\n{user_content}"

    # Recent tokenizers all expose an apply_chat_template *method*, so hasattr
    # cannot tell chat models from base models; a configured template can.
    if not getattr(tokenizer, 'chat_template', None):
        return plain_prompt, True

    try:
        text = tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            tokenize=False, add_generation_prompt=True,
        )
    except Exception:
        # Some chat templates reject a "system" turn. Retry with the
        # instructions folded into the user turn; a second failure propagates.
        text = tokenizer.apply_chat_template(
            [{"role": "user", "content": plain_prompt}],
            tokenize=False, add_generation_prompt=True,
        )
    # The rendered template already carries its own special tokens, so the
    # tokenizer must not add them a second time.
    return text, False


def classify_with_model(model, tokenizer, report, system_prompt=SYSTEM_PROMPT):
    """Classify one mammography report into a BI-RADS category.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        report: Report text inserted into ``USER_TEMPLATE``.
        system_prompt: Instruction text placed before the report.

    Returns:
        The first digit from 0 to 6 found in the generated text, as an int.
        Falls back to 2 when the model produces no such digit.
    """
    text, add_special_tokens = _build_prompt(tokenizer, report, system_prompt)

    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=4096,
        add_special_tokens=add_special_tokens,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_len = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=10, do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    return next((int(ch) for ch in response if ch in '0123456'), 2)

In [None]:
# ===== PHI-3.5 TEST =====
print("\n" + "="*50)
print("Phi-3.5 (3.8B)")
print("="*50)

# Prefer a copy mounted under /kaggle/input; otherwise use the Hub model id.
phi_path = find_model_in_kaggle('phi') or 'microsoft/Phi-3.5-mini-instruct'
print(f'Model: {phi_path}')

phi_model = None
try:
    phi_model, phi_tokenizer = load_model(phi_path)

    phi_preds = [
        classify_with_model(phi_model, phi_tokenizer, text)
        for text in tqdm(val_texts, desc='Phi-3.5')
    ]

    phi_f1 = f1_score(val_labels, phi_preds, average='macro')
    print(f'F1-Macro: {phi_f1:.5f}')
    print(classification_report(val_labels, phi_preds))
except Exception as e:
    # Best-effort benchmark: report the failure and record a zero score.
    print(f'Erro: {e}')
    phi_f1 = 0
finally:
    # Drop the model reference even when inference failed, so its memory can
    # be reclaimed before the next model is loaded.
    del phi_model
    torch.cuda.empty_cache()

In [None]:
# ===== MISTRAL TEST =====
print("\n" + "="*50)
print("Mistral (7B)")
print("="*50)

# Prefer a copy mounted under /kaggle/input; otherwise use the Hub model id.
mistral_path = find_model_in_kaggle('mistral') or 'mistralai/Mistral-7B-Instruct-v0.3'
print(f'Model: {mistral_path}')

mistral_model = None
try:
    mistral_model, mistral_tokenizer = load_model(mistral_path)

    mistral_preds = [
        classify_with_model(mistral_model, mistral_tokenizer, text)
        for text in tqdm(val_texts, desc='Mistral')
    ]

    mistral_f1 = f1_score(val_labels, mistral_preds, average='macro')
    print(f'F1-Macro: {mistral_f1:.5f}')
    print(classification_report(val_labels, mistral_preds))
except Exception as e:
    # Best-effort benchmark: report the failure and record a zero score.
    print(f'Erro: {e}')
    mistral_f1 = 0
finally:
    # Drop the model reference even when inference failed, so its memory can
    # be reclaimed before the next model is loaded.
    del mistral_model
    torch.cuda.empty_cache()

In [None]:
# ===== BIOGPT TEST =====
print("\n" + "="*50)
print("BioGPT (PubMed)")
print("="*50)

# Prefer a copy mounted under /kaggle/input; otherwise use the Hub model id.
biogpt_path = find_model_in_kaggle('biogpt') or 'microsoft/BioGPT-Large'
print(f'Model: {biogpt_path}')

BIOGPT_MAX_NEW_TOKENS = 10

biogpt_model = None
try:
    biogpt_model, biogpt_tokenizer = load_model(biogpt_path)

    # BioGPT has no chat template, so use a plain completion-style prompt.
    BIOGPT_PROMPT = """Task: Classify mammography report into BI-RADS (0-6).
0=Incomplete, 1=Negative, 2=Benign, 3=Probably Benign, 4=Suspicious, 5=Highly Suspicious, 6=Known Malignancy
Report: {report}
BI-RADS:"""

    # Leave room for the generated tokens inside the model's position limit;
    # a prompt truncated to the full limit leaves no positions to generate.
    # Never exceeds the previous 1024-token cap.
    max_positions = getattr(biogpt_model.config, 'max_position_embeddings', 1024)
    max_prompt_tokens = min(1024, max_positions - BIOGPT_MAX_NEW_TOKENS)

    biogpt_preds = []
    for text in tqdm(val_texts, desc='BioGPT'):
        inputs = biogpt_tokenizer(
            BIOGPT_PROMPT.format(report=text),
            return_tensors="pt", truncation=True, max_length=max_prompt_tokens
        )
        inputs = {k: v.to(biogpt_model.device) for k, v in inputs.items()}
        prompt_len = inputs['input_ids'].shape[1]

        with torch.no_grad():
            outputs = biogpt_model.generate(
                **inputs, max_new_tokens=BIOGPT_MAX_NEW_TOKENS, do_sample=False,
                pad_token_id=biogpt_tokenizer.pad_token_id
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        response = biogpt_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        # First digit 0-6 in the answer; default to 2 when none is produced.
        pred = next((int(ch) for ch in response if ch in '0123456'), 2)
        biogpt_preds.append(pred)

    biogpt_f1 = f1_score(val_labels, biogpt_preds, average='macro')
    print(f'F1-Macro: {biogpt_f1:.5f}')
    print(classification_report(val_labels, biogpt_preds))
except Exception as e:
    # Best-effort benchmark: report the failure and record a zero score.
    print(f'Erro: {e}')
    biogpt_f1 = 0
finally:
    # Drop the model reference even when inference failed, so its memory can
    # be reclaimed afterwards.
    del biogpt_model
    torch.cuda.empty_cache()

In [None]:
# ===== SUMMARY =====
# The emoji and accented characters in the printed strings were mis-encoded
# (UTF-8 bytes decoded with a legacy code page); they are restored here.
print("\n" + "="*60)
print("📊 RESUMO - LLMs Instruction-Tuned")
print("="*60)

results = [
    ('Phi-3.5 (3.8B)', phi_f1),
    ('Mistral (7B)', mistral_f1),
    ('BioGPT', biogpt_f1),
]

print(f"{'Modelo':<20} {'F1-Macro':>10}")
print("-"*35)
# Best score first. A score of 0 is what the test cells record on failure,
# so those rows are left out of the table.
for name, f1 in sorted(results, key=lambda item: item[1], reverse=True):
    if f1 > 0:
        print(f"{name:<20} {f1:>10.5f}")

# Hard-coded reference scores for comparison.
print("\n📝 Referência (TF-IDF): 0.77885")
print("📝 Referência (BERTimbau v4): 0.82073")

In [None]:
# ===== INSIGHTS =====
# Template for the written conclusions; the [PREENCHER] placeholders are to be
# filled in by hand after the runs. Mis-encoded characters in the text
# (UTF-8 bytes decoded with a legacy code page) are restored here.
print("""
📝 INSIGHTS - LLMs Instruction-Tuned
=====================================

1. **Phi-3.5:**
   - Muito eficiente (3.8B params)
   - Bom para ensemble rápido
   - [RESULTADO: PREENCHER]

2. **Mistral:**
   - Maior (7B), melhor capacidade
   - Instruction-following forte
   - [RESULTADO: PREENCHER]

3. **BioGPT:**
   - Treinado em PubMed
   - Sem chat template nativo
   - [RESULTADO: PREENCHER]

4. **Comparação:**
   - Melhor modelo: [PREENCHER]
   - Trade-off tamanho vs performance

5. **Recomendação:**
   - Para ensemble: usar o mais leve com bom F1
   - Para solo: comparar com transformers fine-tuned
""")