# SPR 2026 - BioGPT Large (Microsoft)

**Modelo:** microsoft/BioGPT-Large

**Características:**
- Modelo generativo especializado em terminologia biomédica
- Desenvolvido pela Microsoft Research
- Treinado em literatura biomédica (PubMed)
- ~1.5B parâmetros

**Abordagem:** BI-RADS Instruction

---
## CONFIGURAÇÃO KAGGLE:
1. **Add Input** → **Models** → `biogpt-large`
2. **Add Input** → **Competition** → `spr-2026-mammography-report-classification`
3. **Settings** → Internet → **OFF**, GPU → **T4 x2**
---

In [None]:
# ===== BIOGPT LARGE =====

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Run banner: the title framed by two 60-character rules.
print("=" * 60, "SPR 2026 - BioGPT Large (Microsoft)", "=" * 60, sep="\n")

# Notebook-wide settings.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
MAX_NEW_TOKENS = 10
SEED = 42

# Seed both NumPy's and PyTorch's random number generators.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def find_model_path(base='/kaggle/input', max_depth=10):
    """Return the first directory under *base* that contains a ``config.json``.

    The tree is walked depth-first. Entries are visited in sorted order so the
    result is deterministic when several candidate directories exist
    (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        base: Directory whose subdirectories are searched. *base* itself is
            not considered a candidate.
        max_depth: Maximum recursion depth below *base*.

    Returns:
        The path of the first matching directory, or ``None`` when no match
        is found or *base* cannot be listed.
    """
    def search_dir(directory, depth=0):
        if depth > max_depth:
            return None
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            # Missing or unreadable directory: treat it as "nothing here".
            return None
        for item in entries:
            path = os.path.join(directory, item)
            if not os.path.isdir(path):
                continue
            if os.path.exists(os.path.join(path, 'config.json')):
                return path
            result = search_dir(path, depth + 1)
            if result:
                return result
        return None

    return search_dir(base)

# Resolve the model directory once. find_model_path() returns None when no
# directory with a config.json is found, in which case "Model: None" is printed.
MODEL_PATH = find_model_path()
print(f"Device: {device} | Model: {MODEL_PATH}")

In [None]:
# ===== BI-RADS INSTRUCTION PROMPT (OTIMIZADO PARA BIOGPT) =====
# BioGPT é treinado em inglês biomédico, prompt direto sem chat template

PROMPT_TEMPLATE = """Task: Classify this mammography report according to BI-RADS categories.

BI-RADS Categories:
0 = Incomplete (needs additional imaging)
1 = Negative (normal)
2 = Benign (0% malignancy)
3 = Probably Benign (<2% malignancy)
4 = Suspicious (biopsy needed)
5 = Highly Suggestive of Malignancy (>95%)
6 = Known Malignancy (biopsy-proven)

Mammography Report:
{report}

BI-RADS Category (0-6):"""

In [None]:
# ===== LOAD MODEL =====
print("Carregando BioGPT...")

# Tokenizer is read from the local model directory only.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    local_files_only=True,
)
# BioGPT may ship without a pad token; reuse EOS so a pad id is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Weights are loaded in float16, with device placement left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.float16,
    local_files_only=True,
)

print(f"Modelo: {model.config.architectures}")

In [None]:
# ===== DATA =====
# Read train.csv and then test.csv from the competition directory.
train_df, test_df = (
    pd.read_csv(f'{DATA_DIR}/{split_name}.csv') for split_name in ('train', 'test')
)
print(f"Train: {len(train_df)}, Test: {len(test_df)}")

In [None]:
# ===== CLASSIFICAÇÃO =====
def classify_report(report, model, tokenizer, max_length=1024, fallback=2):
    """Predict the BI-RADS category (0-6) for one mammography report.

    The report is inserted into ``PROMPT_TEMPLATE``, the model greedily
    continues the prompt for up to ``MAX_NEW_TOKENS`` tokens, and the first
    character of the continuation that is one of ``0``-``6`` is returned.

    Args:
        report: Report text to classify.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum number of prompt tokens passed to the model.
        fallback: Category returned when the continuation contains no
            character in ``0``-``6``.

    Returns:
        The predicted category as an ``int``.
    """
    # BioGPT is not a chat model, so the prompt is used as plain text.
    prompt = PROMPT_TEMPLATE.format(report=report)
    inputs = tokenizer(prompt, return_tensors="pt")

    overflow = inputs['input_ids'].shape[1] - max_length
    if overflow > 0:
        # Right-truncating the whole prompt would cut off the trailing
        # "BI-RADS Category (0-6):" cue, so the model would never see the
        # question. Drop the excess tokens from the end of the report instead.
        report_ids = tokenizer(str(report), add_special_tokens=False)['input_ids']
        kept_ids = report_ids[:max(len(report_ids) - overflow, 0)]
        shortened = tokenizer.decode(kept_ids, skip_special_tokens=True)
        prompt = PROMPT_TEMPLATE.format(report=shortened)
        # Decoding and re-encoding may not round-trip token-for-token, so
        # truncation stays enabled as a hard cap on the prompt length.
        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=max_length
        )

    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_len = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_beams=1
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # First character that is a valid category wins; otherwise use the fallback.
    return next((int(ch) for ch in response if ch in '0123456'), fallback)

# Smoke test: classify the first training row and print the prediction
# next to its true label.
sample = train_df.iloc[0]
print(f"Real: {sample['target']}, Pred: {classify_report(sample['report'], model, tokenizer)}")

In [None]:
# ===== VALIDAÇÃO =====
from sklearn.metrics import f1_score, classification_report

# Stratified validation subset: up to 20 reports per target class, sampled
# with a fixed seed. Groups are sampled one by one and concatenated rather
# than going through groupby(...).apply(...), because apply operating on the
# grouping column is deprecated in recent pandas and 'target' must remain a
# column of the result.
val_sample = pd.concat(
    [
        group.sample(min(20, len(group)), random_state=SEED)
        for _, group in train_df.groupby('target')
    ]
)
# Only the report text is needed, so iterate the column instead of full rows.
val_preds = [
    classify_report(report_text, model, tokenizer)
    for report_text in tqdm(val_sample['report'], total=len(val_sample))
]
val_labels = val_sample['target'].values

print(f"\nF1-Macro: {f1_score(val_labels, val_preds, average='macro'):.5f}")
print(classification_report(val_labels, val_preds))

In [None]:
# ===== SUBMISSION =====
# Classify every test report, in file order.
test_preds = [
    classify_report(report_text, model, tokenizer)
    for report_text in tqdm(test_df['report'], total=len(test_df))
]

# Two-column submission: the test IDs plus the predicted category.
submission = test_df[['ID']].assign(target=test_preds)
submission.to_csv('submission.csv', index=False)
print(f"Submission salva!")
# Predicted class distribution, ordered by category.
print(submission['target'].value_counts().sort_index())