# SPR 2026 - Qwen2.5 7B Instruct (Zero-Shot Classification)

**Este notebook usa o Qwen2.5 7B Instruct do Kaggle Models para classificação zero-shot.**

---
**CONFIGURAÇÃO KAGGLE:**

1. **Add Input** -> **Models** -> buscar `Qwen2.5`
   - Autor: **QwenLM**
   - Variation: **transformers** (IMPORTANTE: escolher a variação transformers/safetensors, NÃO GGUF)
2. **Settings** -> Internet -> **OFF**
3. **Settings** -> Accelerator -> **GPU T4 x2**

---
**IMPORTANTE:** Em competições com Internet OFF, use a variação **transformers/safetensors** do modelo.
Modelos GGUF requerem `llama-cpp-python`, que não está pré-instalado no Kaggle.

---
**Estratégia:** Classificação zero-shot com prompt em português + quantização 4-bit

---

In [None]:
# =============================================================================
# SPR 2026 - QWEN2.5 7B INSTRUCT ZERO-SHOT (OFFLINE)
# =============================================================================

import os
import numpy as np
import pandas as pd
import torch
import warnings
warnings.filterwarnings('ignore')

# Probe for optional GGUF runtimes that may already be present in the image.
# Nothing is installed here: the notebook is meant to run with internet OFF.
try:
    from llama_cpp import Llama
except ImportError:
    LLAMA_CPP_AVAILABLE = False
else:
    LLAMA_CPP_AVAILABLE = True

try:
    from ctransformers import AutoModelForCausalLM as CTAutoModel
except ImportError:
    CTRANSFORMERS_AVAILABLE = False
else:
    CTRANSFORMERS_AVAILABLE = True

# Start-up banner (three lines, emitted with a single call).
print(
    "=" * 60,
    "SPR 2026 - Qwen2.5 7B Instruct Zero-Shot (OFFLINE)",
    "=" * 60,
    sep="\n",
)

# ==== SETTINGS ====
SEED = 42
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
# Request 4-bit quantization when a HuggingFace-format model is loaded.
USE_4BIT = True
# ==== AUTO-DETECTAR MODELO ====
def find_model(base='/kaggle/input', max_depth=8):
    """Locate a GGUF file and/or a HuggingFace model directory under *base*.

    The tree is walked depth-first in sorted order so the result does not
    depend on the order in which the filesystem lists entries. The first
    ``*.gguf`` file and the first directory containing a ``config.json``
    are kept; later matches never overwrite them. Directories recognised
    as HuggingFace models are not descended into.

    Args:
        base: Root directory to search.
        max_depth: Deepest recursion level (relative to *base*) to visit.

    Returns:
        A ``(gguf_file, hf_path)`` tuple; each element is a path string or
        ``None`` when nothing of that kind was found.
    """
    gguf_file = None
    hf_path = None

    def search_dir(directory, depth=0):
        nonlocal gguf_file, hf_path
        if depth > max_depth:
            return
        try:
            items = sorted(os.listdir(directory))
        except OSError:
            # Missing or unreadable directory: best-effort search, skip it.
            return
        for item in items:
            if gguf_file is not None and hf_path is not None:
                # Both kinds already located; nothing left to look for.
                return
            path = os.path.join(directory, item)
            if item.endswith('.gguf') and os.path.isfile(path):
                if gguf_file is None:
                    gguf_file = path
                continue
            if os.path.isdir(path):
                if os.path.isfile(os.path.join(path, 'config.json')):
                    if hf_path is None:
                        hf_path = path
                    continue
                search_dir(path, depth + 1)

    search_dir(base)
    return gguf_file, hf_path

GGUF_PATH, HF_PATH = find_model()

# Compatibility gate: an input that only provides a GGUF file is unusable
# when neither GGUF runtime could be imported, so explain the remedy and abort.
if GGUF_PATH and not HF_PATH and not (LLAMA_CPP_AVAILABLE or CTRANSFORMERS_AVAILABLE):
    print(
        "\n" + "=" * 60,
        "ERRO: Modelo GGUF detectado mas libs nao disponiveis!",
        "=" * 60,
        sep="\n",
    )
    print(f"\nModelo encontrado: {GGUF_PATH}")
    print(
        "\nSOLUCAO: No Kaggle, troque a variacao do modelo:",
        "  1. Remova o modelo atual",
        "  2. Add Input -> Models -> QwenLM/Qwen2.5",
        "  3. Escolha variacao: 'transformers' ou 'safetensors'",
        "  4. NAO escolha variacao 'gguf'",
        sep="\n",
    )
    print(
        "\nLibs GGUF (nao pre-instaladas no Kaggle):",
        f"  - llama-cpp-python: {LLAMA_CPP_AVAILABLE}",
        f"  - ctransformers: {CTRANSFORMERS_AVAILABLE}",
        sep="\n",
    )
    raise RuntimeError("Modelo GGUF incompativel com ambiente offline. Use variacao transformers.")

if GGUF_PATH is None and HF_PATH is None:
    # No model of either kind was found: dump a shallow view of the input
    # directory so the user can see what is actually mounted, then abort.
    print("\nModelo nao encontrado. Estrutura de /kaggle/input:")

    def show_tree(path, prefix="", depth=0):
        """Print up to ten entries of *path* and recurse into sub-directories.

        Args:
            path: Directory to list.
            prefix: Indentation string printed before each entry name.
            depth: Current recursion level; nothing is printed past level 6.
        """
        if depth > 6:
            return
        try:
            # Sorted so the ten entries shown are the same on every run.
            items = sorted(os.listdir(path))[:10]
        except OSError:
            # Unreadable directory: this listing is diagnostic only, skip it.
            return
        for item in items:
            full = os.path.join(path, item)
            print(f"{prefix}{item}")
            if os.path.isdir(full):
                show_tree(full, prefix + "  ", depth + 1)

    show_tree('/kaggle/input')
    raise FileNotFoundError("Adicione o modelo QwenLM/Qwen2.5 (variacao transformers) ao notebook!")

USE_GPU = torch.cuda.is_available()
device = 'cuda' if USE_GPU else 'cpu'

# Choose the backend.
# GGUF mode is only selected when llama-cpp-python imported successfully,
# because the GGUF loading code in this notebook instantiates `Llama`;
# having only ctransformers available would otherwise end in a NameError.
if GGUF_PATH and LLAMA_CPP_AVAILABLE:
    MODEL_MODE = 'GGUF'
    MODEL_PATH = GGUF_PATH
elif HF_PATH:
    MODEL_MODE = 'HF'
    MODEL_PATH = HF_PATH
else:
    # A GGUF file exists but cannot be loaded, and there is no HF model to
    # fall back to: fail here instead of loading from a None path later.
    raise RuntimeError(
        "Modelo GGUF encontrado, mas llama-cpp-python nao esta disponivel "
        "e nenhum modelo transformers foi detectado. Use variacao transformers."
    )

print(f"Modo: {MODEL_MODE}")
print(f"Modelo: {MODEL_PATH}")
print(f"GPU disponivel: {USE_GPU}")
print(f"Quantizacao 4-bit: {USE_4BIT if MODEL_MODE == 'HF' else 'N/A (GGUF)'}")

In [None]:
# ==== LOAD DATA ====
print("\n[1/4] Carregando dados...")
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
print("    Train: {} | Test: {}".format(train.shape, test.shape))

# Pick column names from what the CSVs actually contain, with one
# fallback name for each role.
if 'text' in train.columns:
    TEXT_COL = 'text'
else:
    TEXT_COL = 'report'

if 'label' in train.columns:
    LABEL_COL = 'label'
else:
    LABEL_COL = 'target'

if 'id' in test.columns:
    ID_COL = 'id'
else:
    ID_COL = 'ID'

print("    Colunas: texto={}, label={}, id={}".format(TEXT_COL, LABEL_COL, ID_COL))

# Class distribution of the training labels, ordered by label value.
print("\nDistribuicao das classes no treino:")
print(train[LABEL_COL].value_counts().sort_index())

In [None]:
# ==== LOAD MODEL ====
print("\n[2/4] Carregando Qwen2.5 7B Instruct...")

if MODEL_MODE == 'GGUF':
    # GGUF file loaded through llama-cpp-python.
    n_gpu_layers = -1 if USE_GPU else 0  # -1 = offload every layer to the GPU

    model = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_gpu_layers=n_gpu_layers,
        verbose=False,
    )
    tokenizer = None  # the GGUF path passes raw prompt strings to the model
    print(f"    Modelo GGUF carregado!")
    print(f"    GPU layers: {n_gpu_layers}")
else:
    # HuggingFace-format checkpoint, loaded strictly from local files.
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True, local_files_only=True)

    if USE_4BIT and USE_GPU:
        # NF4 4-bit weights with float16 compute, intended to fit a T4 (16GB).
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            quantization_config=quantization_config,
            device_map='auto',
            trust_remote_code=True,
            local_files_only=True,
        )
        print("    Modelo HF carregado em 4-bit!")
        print(f"    VRAM estimada: ~10GB")
    else:
        # Unquantized load: float16 on GPU, float32 on CPU. The log line
        # below reports the dtype that was actually requested (it used to
        # say "float16" even on the CPU/float32 path).
        dtype_name = 'float16' if USE_GPU else 'float32'
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            torch_dtype=getattr(torch, dtype_name),
            device_map='auto' if USE_GPU else None,
            trust_remote_code=True,
            local_files_only=True,
        )
        if not USE_GPU:
            model = model.to(device)
        print(f"    Modelo HF carregado em {dtype_name}!")

    # Inference only: switch off dropout and other training-time behaviour.
    model.eval()

In [None]:
# ==== PROMPT TEMPLATE ====
# Zero-shot instruction prompt, written in Portuguese. It lists the BI-RADS
# categories 0-6 and asks for the category number only. The `{text}`
# placeholder is filled with the report by classify_text via str.format.
PROMPT_TEMPLATE = """Voce e um radiologista especialista em mamografia. 
Classifique o seguinte laudo de mamografia na categoria BI-RADS correta.

Categorias BI-RADS:
0 - Inconclusivo, necessita exames adicionais
1 - Negativo, sem achados
2 - Achados benignos
3 - Provavelmente benigno
4 - Suspeito de malignidade
5 - Altamente suspeito de malignidade
6 - Malignidade confirmada por biopsia

Laudo:
{text}

Responda APENAS com o numero da categoria BI-RADS (0, 1, 2, 3, 4, 5 ou 6):"""

def classify_text(text, max_length=1024):
    """Classify one mammography report into a BI-RADS category with the LLM.

    The report is truncated to 2000 characters, inserted into
    ``PROMPT_TEMPLATE`` and sent to whichever backend ``MODEL_MODE`` selects
    (llama-cpp for GGUF, transformers otherwise). Decoding is greedy on both
    backends so repeated runs give the same answer.

    Args:
        text: Report text. ``None``, NaN and other non-string values are
            accepted and converted instead of raising.
        max_length: Maximum number of prompt tokens kept by the HuggingFace
            tokenizer (unused in GGUF mode).

    Returns:
        The first digit in ``0``-``6`` found in the model's reply, as an
        ``int``; ``2`` when the reply contains no such digit.
    """
    # Missing CSV cells arrive from pandas as float NaN; slicing one would
    # raise TypeError, so treat missing values as an empty report.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    prompt = PROMPT_TEMPLATE.format(text=text[:2000])

    if MODEL_MODE == 'GGUF':
        # No stop sequences: the prompt ends with ':' and replies commonly
        # start with a space or newline, which the former stop list
        # ["\n", " "] turned into an empty answer (and a silent fallback).
        # max_tokens already bounds the reply. temperature=0.0 makes
        # decoding greedy, matching the HuggingFace branch.
        output = model(
            prompt,
            max_tokens=5,
            temperature=0.0,
            echo=False,
        )
        response = output['choices'][0]['text']
    else:
        # NOTE(review): truncation presumably drops tokens from the end of
        # the prompt, i.e. the final instruction, if a prompt ever exceeds
        # max_length - confirm the tokenizer's truncation side.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            # Greedy decoding; `temperature` is not passed because it has
            # no effect when do_sample=False.
            outputs = model.generate(
                **inputs,
                max_new_tokens=5,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens (skip the echoed prompt).
        prompt_len = inputs['input_ids'].shape[1]
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # First ASCII digit in the valid BI-RADS range wins. Membership in a
    # literal avoids int() failing on non-ASCII "digits" such as '²'.
    for char in response:
        if char in '0123456':
            return int(char)

    return 2  # Fallback when the reply contains no usable digit

# Smoke test: classify the first training report and show it next to its label.
print("\nTestando classificacao...")
sample_text = train.iloc[0][TEXT_COL]
sample_pred = classify_text(sample_text)
print("    Predicao: {} | Real: {}".format(sample_pred, train.iloc[0][LABEL_COL]))

In [None]:
# ==== CLASSIFY TEST SET ====
print("\n[3/4] Classificando dados de teste...")

from tqdm import tqdm

# Iterate the text column directly instead of DataFrame.iterrows(): no
# per-row Series is built, and the periodic cache flush below counts rows
# by position rather than relying on the index labels being integers.
predictions = []
for position, report in enumerate(tqdm(test[TEXT_COL], total=len(test), desc="Classificando")):
    predictions.append(classify_text(report))

    # Periodically release cached GPU memory.
    if USE_GPU and position % 100 == 0:
        torch.cuda.empty_cache()

print("    Classificacao concluida!")

In [None]:
# ==== SUBMISSION ====
print("\n[4/4] Gerando submissao...")

# Start from the test ids and attach the predictions as the label column.
submission = test[[ID_COL]].copy()
submission[LABEL_COL] = predictions
submission.to_csv('/kaggle/working/submission.csv', index=False)

print(
    "=" * 60,
    "CONCLUIDO - submission.csv criado!",
    "=" * 60,
    sep="\n",
)

# Distribution of predicted classes, ordered by label value.
print("\nDistribuicao das predicoes:")
print(submission[LABEL_COL].value_counts().sort_index())