In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Download e exploração inicial dos dados

In [4]:
!git clone https://github.com/pubmedqa/pubmedqa.git

import json

file_path = 'pubmedqa/data/ori_pqal.json'

with open(file_path, 'r') as f:
    data = json.load(f)

sample_key = list(data.keys())[0]
print(f"\nCampos disponíveis: {list(data[sample_key].keys())}\n")

print("=" * 60)
print("Exploração de dados - PubMedQA")
print("=" * 60)

for i, key in enumerate(list(data.keys())[:3]):
    item = data[key]

    print(f"\nExemplo {i+1} | ID: {key}")
    print("-" * 60)
    print(f"Question: {item.get('QUESTION', 'N/A')}")

    context = " ".join(item.get('CONTEXTS', []))
    print(f"Context: {context[:300]}...")

    print(f"Labels: {item.get('LABELS', 'N/A')}")
    print(f"Decision: {item.get('final_decision', 'N/A')}")
    print(f"Answer: {item.get('LONG_ANSWER', 'N/A')[:200]}...")
    print(f"Meshes: {item.get('MESHES', 'N/A')}")
    print(f"Year: {item.get('YEAR', 'N/A')}")
    print(f"Reasoning required pred: {item.get('reasoning_required_pred', 'N/A')}")
    print(f"Reasoning free pred: {item.get('reasoning_free_pred', 'N/A')}")

print(f"\n\nTotal de registros: {len(data)}")

Cloning into 'pubmedqa'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 40 (delta 0), reused 2 (delta 0), pack-reused 37 (from 1)[K
Receiving objects: 100% (40/40), 704.89 KiB | 22.03 MiB/s, done.
Resolving deltas: 100% (12/12), done.

Campos disponíveis: ['QUESTION', 'CONTEXTS', 'LABELS', 'MESHES', 'YEAR', 'reasoning_required_pred', 'reasoning_free_pred', 'final_decision', 'LONG_ANSWER']

Exploração de dados - PubMedQA

Exemplo 1 | ID: 21645374
------------------------------------------------------------
Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?
Context: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing 

# Pré-processamento

In [22]:
import json
import re
import unicodedata
import os

def preprocess_text(text):
    """Normaliza e limpa texto"""
    if not isinstance(text, str):
        return ""

    # Normalização unicode
    text = unicodedata.normalize('NFKC', text)

    # Normalização de espaços
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def preprocess_dataset(input_path, output_path):
    """Pré-processa o dataset original completo"""
    print(f"Carregando {input_path}...")
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = {}

    for key, item in data.items():

        # QUESTION
        question = preprocess_text(item.get('QUESTION', ''))

        # CONTEXTS (string ou lista)
        context_raw = " ".join(item.get('CONTEXTS', []))
        context = preprocess_text(context_raw)

        decision = preprocess_text(item.get('final_decision', 'N/A').upper())
        long_answer = preprocess_text(item.get('LONG_ANSWER', ''))
        answer = f"Decisão: {decision}\nJustificativa:\n{long_answer}"

        processed_data[key] = {
            "QUESTION": question,
            "CONTEXTS": context,
            "FINAL_ANSWER": answer,
            "YEAR": item.get('YEAR', 'N/A')
        }

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        json.dump(processed_data, f_out, indent=4, ensure_ascii=False)

    print(f"✓ Processados {len(processed_data)} registros")
    return len(processed_data)

# Processar dataset original completo
total = preprocess_dataset(
    'pubmedqa/data/ori_pqal.json',
    'data_processed/ori_pqal_preprocessed.json'
)

print(f"\nPré-processamento concluído: {total} registros")
print("Arquivo salvo em: data_processed/ori_pqal_preprocessed.json")

Carregando pubmedqa/data/ori_pqal.json...
✓ Processados 1000 registros

Pré-processamento concluído: 1000 registros
Arquivo salvo em: data_processed/ori_pqal_preprocessed.json


# Anonimização

In [23]:
import json
import re
import os

def anonymize_text(text):
    """Remove dados sensíveis (LGPD/HIPAA compliance)"""
    if not isinstance(text, str):
        return ""

    text = re.sub(r'(Dr\.|Dra\.|Doctor|Prof\.|MD)\s+[A-Z][a-z]+(\s+[A-Z][a-z]+)?', '[NOME]', text)
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '[EMAIL]', text)
    locations = r'(Israel|Denmark|Chile|Texas|France|United Kingdom|UK|USA|Pakistan|Karachi|Jordan|Japan|Australia|North Carolina|Washington)'
    text = re.sub(locations, '[LOCAL]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d{6,}\b', '[ID]', text)
    text = re.sub(r'\b(19|20)\d{2}\b', '[ANO]', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '[URL]', text)

    return text

def anonymize_dataset(input_path, output_path):
    """Anonimiza o dataset pré-processado completo"""
    print(f"Carregando {input_path}...")
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    anonymized = {}

    for key, item in data.items():
        new_id = f"HOSP_REG_{key[:4]}"

        anonymized[new_id] = {
            "QUESTION": anonymize_text(item.get('QUESTION', '')),
            "CONTEXTS": anonymize_text(item.get('CONTEXTS', '')),
            "FINAL_ANSWER": anonymize_text(item.get('FINAL_ANSWER', '')),
            "YEAR": item.get('YEAR', 'N/A')
        }

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        json.dump(anonymized, f_out, indent=4, ensure_ascii=False)

    print(f"✓ Anonimizados {len(anonymized)} registros")
    return len(anonymized)

# Anonimizar dataset pré-processado
total = anonymize_dataset(
    'data_processed/ori_pqal_preprocessed.json',
    'data_anonymized/ori_pqal_anonymized.json'
)

print(f"\nAnonimização concluída: {total} registros")
print("Arquivo salvo em: data_anonymized/ori_pqal_anonymized.json")

# Exemplo de dado anonimizado
with open('data_anonymized/ori_pqal_anonymized.json', 'r', encoding='utf-8') as f:
    sample = json.load(f)
    first_key = list(sample.keys())[0]
    print(f"\n{'='*60}")
    print("Exemplo de dado anonimizado:")
    print(f"{'='*60}")
    print(f"ID: {first_key}")
    print(f"Question: {sample[first_key]['QUESTION'][:100]}...")

Carregando data_processed/ori_pqal_preprocessed.json...
✓ Anonimizados 765 registros

Anonimização concluída: 765 registros
Arquivo salvo em: data_anonymized/ori_pqal_anonymized.json

Exemplo de dado anonimizado:
ID: HOSP_REG_2164
Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?...


## Análise de Qualidade

In [24]:
import json
import os
from collections import Counter

def analyze_quality(data):
    issues = {
        'question_vazia': [],
        'context_vazio': [],
        'answer_vazia': [],
        'answer_muito_curta': [],
        'context_muito_curto': []
    }

    for key, item in data.items():
        if not item.get('QUESTION', '').strip():
            issues['question_vazia'].append(key)

        if not item.get('CONTEXTS', '').strip():
            issues['context_vazio'].append(key)

        if not item.get('FINAL_ANSWER', '').strip():
            issues['answer_vazia'].append(key)

        if len(item.get('FINAL_ANSWER', '')) < 50:
            issues['answer_muito_curta'].append(key)

        if len(item.get('CONTEXTS', '')) < 100:
            issues['context_muito_curto'].append(key)

    return issues

def extract_decision(answer):
    answer_upper = answer.upper()
    if 'YES' in answer_upper or 'SIM' in answer_upper:
        return 'YES'
    elif 'NO' in answer_upper or 'NÃO' in answer_upper or 'NAO' in answer_upper:
        return 'NO'
    elif 'MAYBE' in answer_upper or 'TALVEZ' in answer_upper:
        return 'MAYBE'
    return 'UNKNOWN'

def analyze_distribution(data):
    distribution = Counter()

    for key, item in data.items():
        decision = extract_decision(item.get('FINAL_ANSWER',''))
        distribution[decision] += 1

    return distribution

print("Analisando qualidade dos dados...")

# Analisar test set
with open('/content/data_anonymized/ori_pqal_anonymized.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

issues = analyze_quality(test_data)

print("\nProblemas encontrados:")
for issue_type, ids in issues.items():
    if ids:
        print(f"  {issue_type}: {len(ids)} registros")
    else:
        print(f"  {issue_type}: 0")

print("\nDistribuição:")
dist_test = analyze_distribution(test_data)
for cls, count in dist_test.items():
    pct = (count / len(test_data)) * 100
    print(f"  {cls}: {count} ({pct:.1f}%)")

print(f"\n{'='*60}")
print(f"Total de registros: {len(test_data)}")
print(f"{'='*60}")

Analisando qualidade dos dados...

Problemas encontrados:
  question_vazia: 0
  context_vazio: 0
  answer_vazia: 0
  answer_muito_curta: 0
  context_muito_curto: 0

Distribuição:
  YES: 440 (57.5%)
  NO: 298 (39.0%)
  MAYBE: 27 (3.5%)

Total de registros: 765


## Fine Tunning

Instando as dependências

In [8]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install transformers datasets

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-_f97i7fl/unsloth_b88a79cec70345f3a241bd6f4566b22c
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-_f97i7fl/unsloth_b88a79cec70345f3a241bd6f4566b22c
  Resolved https://github.com/unslothai/unsloth.git to commit 8ea5338154859ed25b50366cb1264ed4d933eae3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.12.7 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.12.7-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.gi

Configuração das variáveis do modelo

In [31]:
os.environ["WANDB_MODE"] = "disabled"

from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import json
import os
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

max_seq_length = 2048
dtype = None
load_in_4bit = True
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]


Conversão do dataset para treinamento


In [50]:
import json
from datasets import Dataset
import os

DATA_PATH = "data_anonymized/ori_pqal_anonymized.json"
OUTPUT_DATA_PATH = "data_final/final_pqal.json"
SYSTEM_PROMPT = """
Você é um assistente médico-científico.

Responda exclusivamente com base no contexto fornecido.

Formato obrigatório da resposta:
Decisão: YES | NO | MAYBE
Justificativa: <explicação objetiva>

Use:
- YES quando o contexto apoiar claramente a afirmação.
- NO quando o contexto claramente a contradizer.
- Evite usar MAYBE, use apenas quando as evidências forem insuficientes, inconclusivas ou conflitantes.

Não use conhecimento externo.
"""

with open(DATA_PATH, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

data = []
for _, item in raw_data.items():
    data.append({
        "messages": [
            {
                "role": "system",
                "content": (
                    f"{SYSTEM_PROMPT}"
                )
            },
            {
                "role": "user",
                "content": (
                    f"Pergunta:\n{item['QUESTION']}\n\n"
                    f"Contexto científico:\n{item['CONTEXTS']}"
                )
            },
            {
                "role": "assistant",
                "content": item["FINAL_ANSWER"]
            }
        ]
    })



formatted_data = Dataset.from_list(data)

print("Novo formato do dataset:")
print(json.dumps(formatted_data[0], indent=2, ensure_ascii=False))

os.makedirs(os.path.dirname(OUTPUT_DATA_PATH), exist_ok=True)
with open(OUTPUT_DATA_PATH, 'w', encoding='utf-8') as output_file:
  json.dump(formatted_data.to_list(), output_file, indent=4)

print("\n")
print("="*60)
print(f"Dataset final salvo em: {OUTPUT_DATA_PATH}")
print("="*60)


Novo formato do dataset:
{
  "messages": [
    {
      "content": "\nVocê é um assistente médico-científico.\n\nResponda exclusivamente com base no contexto fornecido.\n\nFormato obrigatório da resposta:\nDecisão: YES | NO | MAYBE\nJustificativa: <explicação objetiva>\n\nUse:\n- YES quando o contexto apoiar claramente a afirmação.\n- NO quando o contexto claramente a contradizer.\n- Evite usar MAYBE, use apenas quando as evidências forem insuficientes, inconclusivas ou conflitantes.\n\nNão use conhecimento externo.\n",
      "role": "system"
    },
    {
      "content": "Pergunta:\nDo mitochondria play a role in remodelling lace plant leaves during programmed cell death?\n\nContexto científico:\nProgrammed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in t

## Carregando o modelo "unsloth/llama-3-8b-bnb-4bit"

In [52]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)


==((====))==  Unsloth 2025.12.9: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [53]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",

    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.12.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [56]:
from datasets import load_dataset

# Define o EOS_TOKEN que já foi carregado com o tokenizer
EOS_TOKEN = tokenizer.eos_token

# Explicitly set the chat template for Llama-3 to prevent ValueError
# This template is based on the official Llama 3 chat format
tokenizer.chat_template = (
    "{% for message in messages %}"
        "{% if message['role'] == 'system' %}"
            "<|start_header_id|>system<|end_header_id|>\n{{ message['content'] | trim }}<|eot_id|>"
        "{% elif message['role'] == 'user' %}"
            "<|start_header_id|>user<|end_header_id|>\n{{ message['content'] | trim }}<|eot_id|>"
        "{% elif message['role'] == 'assistant' %}"
            "<|start_header_id|>assistant<|end_header_id|>\n{{ message['content'] | trim }}<|eot_id|>"
        "{% endif %}"
    "{% endfor %}"
)


# Função de formatação adaptada para Llama 3 chat
def formatting_prompts_func(examples):
    # Aplica o chat template do tokenizer diretamente aos 'messages'
    # O formatted_data de `301QWFxfkUx7` já está no formato de lista de dicionários com 'role' e 'content'
    texts = [tokenizer.apply_chat_template(messages_list, tokenize=False, add_generation_prompt=False) + EOS_TOKEN for messages_list in examples["messages"]]
    return { "text" : texts, }

# Caminho para o dataset pré-processado e anonimizado
OUTPUT_PATH_DATASET = "data_final/final_pqal.json"

# Carrega o dataset
dataset = load_dataset("json", data_files=OUTPUT_PATH_DATASET, split = "train")

# Aplica a função de formatação ao dataset
dataset = dataset.map(formatting_prompts_func, batched = True,)

print("Primeiro exemplo do dataset formatado para o Fine-tuning:")
print(dataset[0]["text"])


Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Primeiro exemplo do dataset formatado para o Fine-tuning:
<|start_header_id|>system<|end_header_id|>
Você é um assistente médico-científico.

Responda exclusivamente com base no contexto fornecido.

Formato obrigatório da resposta:
Decisão: YES | NO | MAYBE
Justificativa: <explicação objetiva>

Use:
- YES quando o contexto apoiar claramente a afirmação.
- NO quando o contexto claramente a contradizer.
- Evite usar MAYBE, use apenas quando as evidências forem insuficientes, inconclusivas ou conflitantes.

Não use conhecimento externo.<|eot_id|><|start_header_id|>user<|end_header_id|>
Pergunta:
Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?

Contexto científico:
Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occu

In [None]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer_stats = trainer.train()

Map (num_proc=2):   0%|          | 0/765 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 765 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:

 3


wandb: You chose "Don't visualize my results"


wandb: Detected [openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.2848
2,2.3849
3,2.1425
4,2.3416
5,2.3795
6,2.2064
7,2.2825
8,2.0394
9,2.1765
10,2.0825
