# SPR 2026 - Qwen3 1.7B (Zero-Shot Classification)

**Este notebook usa o Qwen3-1.7B do Kaggle Models para classificação zero-shot.**

---
**CONFIGURAÇÃO KAGGLE:**

1. **Add Input** -> **Models** -> buscar `Qwen3`
   - Autor: **QwenLM**
   - Variation: **1.7B**
2. **Settings** -> Internet -> **OFF**
3. **Settings** -> Accelerator -> **GPU T4 x2**

---
**Estratégia:** Classificação zero-shot com prompt em português

---

In [None]:
# ===== SPR 2026 - QWEN3 1.7B ZERO-SHOT =====

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Startup banner: rule, title, rule -- emitted with a single print call.
print("\n".join(["=" * 60, "SPR 2026 - Qwen3 1.7B Zero-Shot", "=" * 60]))

# ==== SETTINGS ====
# Directory holding the competition CSV files.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
# Random seed constant (not referenced by the cells visible in this notebook).
SEED = 42

# ==== AUTO-DETECTAR PATH DO MODELO ====
def find_model_path(base='/kaggle/input'):
    """Locate a model directory (one containing ``config.json``) below ``base``.

    The tree is walked depth-first, visiting entries in sorted order so the
    result is reproducible when several directories qualify. ``base`` itself
    is never returned; only its descendants are candidates.

    Args:
        base: Root directory to search. Defaults to the Kaggle input mount.

    Returns:
        The path of the first directory holding a ``config.json`` file, or
        ``None`` when nothing is found within the depth limit or ``base``
        cannot be listed.
    """
    max_depth = 6

    def has_config(path):
        # A model directory is recognised by its config.json file.
        return os.path.isfile(os.path.join(path, 'config.json'))

    def search_dir(directory, depth=0):
        if depth > max_depth:
            return None
        try:
            # os.listdir returns entries in arbitrary order; sort for determinism.
            entries = sorted(os.listdir(directory))
        except OSError:
            # Missing or unreadable directory: treat it as empty (best effort).
            return None
        for entry in entries:
            path = os.path.join(directory, entry)
            if not os.path.isdir(path):
                continue
            if has_config(path):
                return path
            found = search_dir(path, depth + 1)
            if found:
                return found
        return None

    return search_dir(base)

# Resolve the model directory once; abort with a diagnostic listing if absent.
MODEL_PATH = find_model_path()

if MODEL_PATH is None:
    print("\nModelo nao encontrado. Estrutura de /kaggle/input:")

    def show_tree(path, prefix="", depth=0):
        """Print at most 10 entries per directory, recursing while depth <= 4."""
        if depth > 4:
            return
        try:
            items = os.listdir(path)[:10]
        except OSError:
            # Best-effort diagnostic: skip directories that cannot be listed.
            return
        for item in items:
            full = os.path.join(path, item)
            print(f"{prefix}{item}")
            if os.path.isdir(full):
                show_tree(full, prefix + "  ", depth + 1)

    show_tree('/kaggle/input')
    raise FileNotFoundError("Adicione o modelo QwenLM/Qwen3 (1.7B) ao notebook!")

# Pick the compute device from CUDA availability.
USE_GPU = torch.cuda.is_available()
if USE_GPU:
    device = 'cuda'
else:
    device = 'cpu'

# Report the resolved configuration, one item per line.
print(
    f"Modelo: {MODEL_PATH}",
    f"GPU disponivel: {USE_GPU}",
    f"Device: {device}",
    sep="\n",
)

In [None]:
# ==== LOAD DATA ====
print("\n[1/4] Carregando dados...")
# Both splits are CSV files stored side by side under DATA_DIR.
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)
print(f"    Train: {train.shape} | Test: {test.shape}")

# Class distribution of the training labels, ordered by label value.
print(
    "\nDistribuicao das classes no treino:",
    train['label'].value_counts().sort_index(),
    sep="\n",
)

In [None]:
# ==== LOAD MODEL ====
print("\n[2/4] Carregando Qwen3 1.7B...")

# Tokenizer and weights are read strictly from the local model directory.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    local_files_only=True,
)

if USE_GPU:
    # GPU: half precision, layer placement delegated to device_map='auto'.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float16,
        device_map='auto',
        trust_remote_code=True,
        local_files_only=True,
    )
else:
    # CPU: full precision, no device map, then an explicit move to `device`.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.float32,
        device_map=None,
        trust_remote_code=True,
        local_files_only=True,
    )
    model = model.to(device)

# Switch to evaluation mode for inference.
model.eval()
print("    Modelo carregado!")

In [None]:
# ==== PROMPT TEMPLATE ====
# Portuguese zero-shot instruction: sets a radiologist persona, lists the seven
# BI-RADS categories (0-6) and asks for the category number only.
# The `{text}` placeholder is filled through `str.format`, so any literal brace
# added to this template would have to be doubled (`{{` / `}}`).
PROMPT_TEMPLATE = """Voce e um radiologista especialista em mamografia. 
Classifique o seguinte laudo de mamografia na categoria BI-RADS correta.

Categorias BI-RADS:
0 - Inconclusivo, necessita exames adicionais
1 - Negativo, sem achados
2 - Achados benignos
3 - Provavelmente benigno
4 - Suspeito de malignidade
5 - Altamente suspeito de malignidade
6 - Malignidade confirmada por biopsia

Laudo:
{text}

Responda APENAS com o numero da categoria BI-RADS (0, 1, 2, 3, 4, 5 ou 6):"""

def classify_text(text, max_length=1024):
    """Classify one mammography report into a BI-RADS category with the LLM.

    Args:
        text: Report text. Missing values (``None`` or NaN) are treated as an
            empty report; other non-string values are converted with ``str``.
        max_length: Maximum number of prompt tokens fed to the model.

    Returns:
        The first ASCII digit in ``0``-``6`` found in the generated
        continuation, as an ``int``; ``2`` when no such digit is generated.
    """
    # pandas stores missing cells as float NaN (NaN != NaN); slicing it would fail.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    report = str(text)[:2000]  # character cap for very long reports

    prompt = PROMPT_TEMPLATE.format(text=report)
    inputs = tokenizer(prompt, return_tensors="pt")

    excess = inputs['input_ids'].shape[1] - max_length
    if excess > 0:
        # Truncating the whole prompt (by default from the right) would drop
        # the final answer instruction, so shorten the report and rebuild.
        report_ids = tokenizer(report, add_special_tokens=False)['input_ids']
        keep = max(len(report_ids) - excess, 0)
        report = tokenizer.decode(report_ids[:keep], skip_special_tokens=True)
        prompt = PROMPT_TEMPLATE.format(text=report)
        # Safety net: re-tokenising the shortened report is not exactly additive.
        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=max_length
        )

    # Follow the model's own device instead of a global, so the inputs land
    # where the model expects them when it was loaded with device_map='auto'.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            do_sample=False,  # greedy decoding; temperature does not apply
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_len = inputs['input_ids'].shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Take the first valid category digit. The explicit ASCII set avoids
    # str.isdigit(), which accepts characters such as '²' that int() rejects.
    for char in response:
        if char in "0123456":
            return int(char)

    # Fallback: most common class
    return 2  # BI-RADS 2

# Smoke test: classify the first training row and show it next to its label.
print("\nTestando classificacao...")
first_row = train.iloc[0]
sample_text = first_row['text']
sample_pred = classify_text(sample_text)
print(f"    Predicao: {sample_pred} | Real: {first_row['label']}")

In [None]:
# ==== CLASSIFY TEST SET ====
print("\n[3/4] Classificando dados de teste...")

predictions = []
# Walk the text column directly; idx is the DataFrame index label of each row.
for idx, report in tqdm(test['text'].items(), total=len(test), desc="Classificando"):
    predictions.append(classify_text(report))

    # Periodically release cached GPU memory.
    if idx % 100 == 0:
        if USE_GPU:
            torch.cuda.empty_cache()

print("    Classificacao concluida!")

In [None]:
# ==== SUBMISSION ====
print("\n[4/4] Gerando submissao...")

# Start from the id column and attach the predicted labels positionally.
submission = test[['id']].assign(label=predictions)
submission.to_csv('/kaggle/working/submission.csv', index=False)

# Completion banner followed by the distribution of predicted classes.
print("=" * 60, "CONCLUIDO - submission.csv criado!", "=" * 60, sep="\n")
print(
    "\nDistribuicao das predicoes:",
    submission['label'].value_counts().sort_index(),
    sep="\n",
)