In [2]:
pip install langchain langgraph bitsandbytes peft trl

^C
Note: you may need to restart the kernel to use updated packages.





[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


# Download e exploração inicial dos dados

In [12]:
!git clone https://github.com/pubmedqa/pubmedqa.git

import json

file_path = 'pubmedqa/data/ori_pqal.json'

with open(file_path, 'r') as f:
    data = json.load(f)

sample_key = list(data.keys())[0]
print(f"\nCampos disponíveis: {list(data[sample_key].keys())}\n")

print("=" * 60)
print("Exploração de dados - PubMedQA")
print("=" * 60)

for i, key in enumerate(list(data.keys())[:3]):
    item = data[key]
    
    print(f"\nExemplo {i+1} | ID: {key}")
    print("-" * 60)
    print(f"Question: {item.get('QUESTION', 'N/A')}")
    
    context = " ".join(item.get('CONTEXTS', []))
    print(f"Context: {context[:300]}...")
    
    print(f"Labels: {item.get('LABELS', 'N/A')}")
    print(f"Decision: {item.get('final_decision', 'N/A')}")
    print(f"Answer: {item.get('LONG_ANSWER', 'N/A')[:200]}...")
    print(f"Meshes: {item.get('MESHES', 'N/A')}")
    print(f"Year: {item.get('YEAR', 'N/A')}")
    print(f"Reasoning required pred: {item.get('reasoning_required_pred', 'N/A')}")
    print(f"Reasoning free pred: {item.get('reasoning_free_pred', 'N/A')}")

print(f"\n\nTotal de registros: {len(data)}")


Campos disponíveis: ['QUESTION', 'CONTEXTS', 'LABELS', 'MESHES', 'YEAR', 'reasoning_required_pred', 'reasoning_free_pred', 'final_decision', 'LONG_ANSWER']

Exploração de dados - PubMedQA

Exemplo 1 | ID: 21645374
------------------------------------------------------------
Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?
Context: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cel...
Labels: ['BACKGROUND', 'RESULTS']
Decision: yes
Answer: Results depicted mitochondrial dynamics in vivo as PCD progresses within the lace plant, and highlight the correlation of this organelle with other organelles during developmental PCD. To the best of ...
Meshes: ['Alismataceae', 'Apoptosis', 'Cell Differ

fatal: destination path 'pubmedqa' already exists and is not an empty directory.


# Divisão dos dados

In [13]:
!cd pubmedqa/preprocess && python split_dataset.py pqal

# Pré-processamento

In [14]:
import json
import re
import unicodedata
import os

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[^\w\s.,?!():%\-\+]', '', text)
    
    return text

def process_file(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    processed_data = {}
    
    for key, item in data.items():
        question = preprocess_text(item.get('QUESTION', ''))
        
        context_raw = " ".join(item.get('CONTEXTS', []))
        context = preprocess_text(context_raw)
        
        decision = item.get('final_decision', 'N/A').upper()
        long_answer = item.get('LONG_ANSWER', '')
        answer = preprocess_text(f"Decisão: {decision}. Justificativa: {long_answer}")
        
        processed_data[key] = {
            "QUESTION": question,
            "CONTEXTS": context,
            "FINAL_ANSWER": answer,
            "YEAR": item.get('YEAR', 'N/A')
        }
    
    with open(output_path, 'w', encoding='utf-8') as f_out:
        json.dump(processed_data, f_out, indent=4, ensure_ascii=False)
    
    return len(processed_data)

os.makedirs('data_processed', exist_ok=True)

print("Processando dados...")

total = process_file(
    'pubmedqa/data/test_set.json',
    'data_processed/test_set_preprocessed.json'
)
print(f"Test set: {total} registros")

for i in range(10):
    os.makedirs(f'data_processed/pqal_fold{i}', exist_ok=True)
    total = process_file(
        f'pubmedqa/data/pqal_fold{i}/dev_set.json',
        f'data_processed/pqal_fold{i}/dev_set_preprocessed.json'
    )
    print(f"Fold {i}: {total} registros")

print("\nProcessamento concluído")
print("Arquivos salvos em: data_processed/")

Processando dados...
Test set: 500 registros
Fold 0: 50 registros
Fold 1: 50 registros
Fold 2: 50 registros
Fold 3: 50 registros
Fold 4: 50 registros
Fold 5: 50 registros
Fold 6: 50 registros
Fold 7: 50 registros
Fold 8: 50 registros
Fold 9: 50 registros

Processamento concluído
Arquivos salvos em: data_processed/


# Anonimização

In [15]:
import json
import re
import os

def anonymize_text(text):
    if not isinstance(text, str):
        return ""
    
    text = re.sub(r'(Dr\.|Dra\.|Doctor|Prof\.|MD)\s+[A-Z][a-z]+(\s+[A-Z][a-z]+)?', '[NOME_PROFISSIONAL]', text)
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '[CONTATO_EMAIL]', text)
    
    locations = r'(Israel|Denmark|Chile|Texas|France|United Kingdom|UK|USA|Pakistan|Karachi|Jordan|Japan|Australia|North Carolina|Washington)'
    text = re.sub(locations, '[LOCALIZACAO_RESTRITA]', text, flags=re.IGNORECASE)
    
    text = re.sub(r'\b\d{6,}\b', '[ID_RESTRITO]', text)
    text = re.sub(r'\b(19|20)\d{2}\b', '[ANO]', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '[URL_RESTRITA]', text)
    
    return text

def anonymize_file(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    anonymized = {}
    
    for key, item in data.items():
        new_id = f"HOSP_REG_{key[:4]}"
        
        anonymized[new_id] = {
            "QUESTION": anonymize_text(item.get('QUESTION', '')),
            "CONTEXTS": anonymize_text(item.get('CONTEXTS', '')),
            "FINAL_ANSWER": anonymize_text(item.get('FINAL_ANSWER', '')),
            "ORIGINAL_ID": "[OCULTADO_POR_SEGURANCA]"
        }
    
    with open(output_path, 'w', encoding='utf-8') as f_out:
        json.dump(anonymized, f_out, indent=4, ensure_ascii=False)
    
    return len(anonymized)

os.makedirs('data_anonymized', exist_ok=True)

print("Anonimizando dados...")

total = anonymize_file(
    'data_processed/test_set_preprocessed.json',
    'data_anonymized/test_set_anonymized.json'
)
print(f"Test set: {total} registros")

for i in range(10):
    os.makedirs(f'data_anonymized/pqal_fold{i}', exist_ok=True)
    total = anonymize_file(
        f'data_processed/pqal_fold{i}/dev_set_preprocessed.json',
        f'data_anonymized/pqal_fold{i}/dev_set_anonymized.json'
    )
    print(f"Fold {i}: {total} registros")

print("\nAnonimização concluída")
print("Arquivos salvos em: data_anonymized/")

with open('data_anonymized/test_set_anonymized.json', 'r', encoding='utf-8') as f:
    sample = json.load(f)
    first_key = list(sample.keys())[0]
    print(f"\nExemplo de dado anonimizado:")
    print(f"ID: {first_key}")
    print(f"Question: {sample[first_key]['QUESTION'][:100]}...")
    print(f"Original ID: {sample[first_key]['ORIGINAL_ID']}")

Anonimizando dados...
Test set: 435 registros
Fold 0: 50 registros
Fold 1: 50 registros
Fold 2: 49 registros
Fold 3: 49 registros
Fold 4: 50 registros
Fold 5: 50 registros
Fold 6: 48 registros
Fold 7: 50 registros
Fold 8: 50 registros
Fold 9: 50 registros

Anonimização concluída
Arquivos salvos em: data_anonymized/

Exemplo de dado anonimizado:
ID: HOSP_REG_1237
Question: Is anorectal endosonography valuable in dyschesia?...
Original ID: [OCULTADO_POR_SEGURANCA]


# Análise de qualidade

In [16]:
import json
import os
from collections import Counter

def analyze_quality(data):
    issues = {
        'question_vazia': [],
        'context_vazio': [],
        'answer_vazia': [],
        'answer_muito_curta': [],
        'context_muito_curto': []
    }
    
    for key, item in data.items():
        if not item.get('QUESTION', '').strip():
            issues['question_vazia'].append(key)
        
        if not item.get('CONTEXTS', '').strip():
            issues['context_vazio'].append(key)
        
        if not item.get('FINAL_ANSWER', '').strip():
            issues['answer_vazia'].append(key)
        
        if len(item.get('FINAL_ANSWER', '')) < 50:
            issues['answer_muito_curta'].append(key)
        
        if len(item.get('CONTEXTS', '')) < 100:
            issues['context_muito_curto'].append(key)
    
    return issues

def extract_decision(answer):
    answer_upper = answer.upper()
    if 'YES' in answer_upper or 'SIM' in answer_upper:
        return 'YES'
    elif 'NO' in answer_upper or 'NÃO' in answer_upper or 'NAO' in answer_upper:
        return 'NO'
    elif 'MAYBE' in answer_upper or 'TALVEZ' in answer_upper:
        return 'MAYBE'
    return 'UNKNOWN'

def analyze_distribution(data):
    distribution = Counter()
    
    for key, item in data.items():
        decision = extract_decision(item.get('FINAL_ANSWER', ''))
        distribution[decision] += 1
    
    return distribution

print("Analisando qualidade dos dados...")

with open('data_anonymized/test_set_anonymized.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

issues = analyze_quality(test_data)

print("\nProblemas encontrados:")
for issue_type, ids in issues.items():
    if ids:
        print(f"  {issue_type}: {len(ids)} registros")

print("\nDistribuição - test set:")
dist_test = analyze_distribution(test_data)
for cls, count in dist_test.items():
    pct = (count / len(test_data)) * 100
    print(f"  {cls}: {count} ({pct:.1f}%)")

print("\nDistribuição - folds:")
all_folds = {}
for i in range(10):
    with open(f'data_anonymized/pqal_fold{i}/dev_set_anonymized.json', 'r', encoding='utf-8') as f:
        fold_data = json.load(f)
        all_folds.update(fold_data)

dist_folds = analyze_distribution(all_folds)
for cls, count in dist_folds.items():
    pct = (count / len(all_folds)) * 100
    print(f"  {cls}: {count} ({pct:.1f}%)")

print(f"\nTotal de registros:")
print(f"  Test: {len(test_data)}")
print(f"  Folds: {len(all_folds)}")
print(f"  Total: {len(test_data) + len(all_folds)}")

Analisando qualidade dos dados...

Problemas encontrados:

Distribuição - test set:
  YES: 238 (54.7%)
  NO: 176 (40.5%)
  MAYBE: 21 (4.8%)

Distribuição - folds:
  YES: 244 (56.1%)
  NO: 176 (40.5%)
  MAYBE: 15 (3.4%)

Total de registros:
  Test: 435
  Folds: 435
  Total: 870


In [17]:
import json
import os

def validate_consistency(data, max_samples=5):
    print("Validação de consistência\n")
    
    inconsistencies = []
    
    for key, item in list(data.items())[:max_samples]:
        question = item.get('QUESTION', '')
        context = item.get('CONTEXTS', '')
        answer = item.get('FINAL_ANSWER', '')
        
        has_question_mark = '?' in question
        valid_context = len(context) > 50
        valid_answer = len(answer) > 30
        
        if not (has_question_mark and valid_context and valid_answer):
            inconsistencies.append({
                'id': key,
                'has_question_mark': has_question_mark,
                'valid_context': valid_context,
                'valid_answer': valid_answer
            })
        
        print(f"Registro: {key}")
        print(f"Question: {question[:100]}...")
        print(f"Context: {len(context)} caracteres")
        print(f"Answer: {answer[:150]}...")
        print("-" * 60)
    
    return inconsistencies

with open('data_anonymized/test_set_anonymized.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

inconsistencies = validate_consistency(test_data, max_samples=3)

if inconsistencies:
    print(f"\nInconsistências encontradas: {len(inconsistencies)}")
else:
    print(f"\nTodas as amostras validadas")

Validação de consistência

Registro: HOSP_REG_1237
Question: Is anorectal endosonography valuable in dyschesia?...
Context: 1231 caracteres
Answer: Decisão: YES. Justificativa: Linear anorectal endosonography demonstrated incomplete or even absent relaxation of the anal sphincter and the m. pubore...
------------------------------------------------------------
Registro: HOSP_REG_2616
Question: Is there a connection between sublingual varices and hypertension?...
Context: 1667 caracteres
Answer: Decisão: YES. Justificativa: An association was found between sublingual varices and hypertension. Examining the lateral borders of the tongue is easi...
------------------------------------------------------------
Registro: HOSP_REG_1910
Question: Are home sampling kits for sexually transmitted infections acceptable among men who have sex with me...
Context: 1224 caracteres
Answer: Decisão: MAYBE. Justificativa: The widespread acceptability of using HSKs for the diagnosis of STIs could have imp

In [18]:
import json
import os

def format_for_finetuning(data, output_path):
    formatted = []
    
    for key, item in data.items():
        question = item.get('QUESTION', '')
        context = item.get('CONTEXTS', '')
        answer = item.get('FINAL_ANSWER', '')
        
        example = {
            "id": key,
            "messages": [
                {
                    "role": "system",
                    "content": "Você é um assistente médico especializado. Responda às perguntas baseando-se nas evidências científicas fornecidas no contexto."
                },
                {
                    "role": "user",
                    "content": f"Contexto médico: {context}\n\nPergunta: {question}"
                },
                {
                    "role": "assistant",
                    "content": answer
                }
            ]
        }
        
        formatted.append(example)
    
    with open(output_path, 'w', encoding='utf-8') as f:
        for item in formatted:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    
    return len(formatted)

os.makedirs('data_curated', exist_ok=True)

print("Formatando para fine-tuning...")

with open('data_anonymized/test_set_anonymized.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

total = format_for_finetuning(test_data, 'data_curated/test_set_curated.jsonl')
print(f"Test set: {total} exemplos")

for i in range(10):
    os.makedirs(f'data_curated/pqal_fold{i}', exist_ok=True)
    
    with open(f'data_anonymized/pqal_fold{i}/dev_set_anonymized.json', 'r', encoding='utf-8') as f:
        fold_data = json.load(f)
    
    total = format_for_finetuning(fold_data, f'data_curated/pqal_fold{i}/dev_set_curated.jsonl')
    print(f"Fold {i}: {total} exemplos")

print("\nFormatação concluída")
print("Formato: JSONL conversacional")
print("Estrutura: system + user + assistant")
print("Arquivos salvos em: data_curated/")

with open('data_curated/test_set_curated.jsonl', 'r', encoding='utf-8') as f:
    example = json.loads(f.readline())
    print("\nExemplo:")
    print(json.dumps(example, indent=2, ensure_ascii=False)[:800] + "...")

Formatando para fine-tuning...
Test set: 435 exemplos
Fold 0: 50 exemplos
Fold 1: 50 exemplos
Fold 2: 49 exemplos
Fold 3: 49 exemplos
Fold 4: 50 exemplos
Fold 5: 50 exemplos
Fold 6: 48 exemplos
Fold 7: 50 exemplos
Fold 8: 50 exemplos
Fold 9: 50 exemplos

Formatação concluída
Formato: JSONL conversacional
Estrutura: system + user + assistant
Arquivos salvos em: data_curated/

Exemplo:
{
  "id": "HOSP_REG_1237",
  "messages": [
    {
      "role": "system",
      "content": "Você é um assistente médico especializado. Responda às perguntas baseando-se nas evidências científicas fornecidas no contexto."
    },
    {
      "role": "user",
      "content": "Contexto médico: Dyschesia can be provoked by inappropriate defecation movements. The aim of this prospective study was to demonstrate dysfunction of the anal sphincter andor the musculus (m.) puborectalis in patients with dyschesia using anorectal endosonography. Twenty consecutive patients with a medical history of dyschesia and a contr