In [1]:
# Imports
import os
import json
import csv
import random

In [2]:
# Resolve the input and output folders relative to the current working
# directory and make sure both exist (no error if they are already there).
dir = os.getcwd()
data_dir = os.path.join(dir, 'data')
output_dir = os.path.join(dir, 'output')
for folder in (data_dir, output_dir):
    os.makedirs(folder, exist_ok=True)

# Create IDs

In [None]:
# Load the raw MedLFQA records: one JSON object per line.
with open(os.path.join(data_dir, 'kqa_golden_test_MedLFQA.jsonl'), 'r', encoding='utf-8') as f:
    data = [json.loads(line.strip()) for line in f]

output_path = os.path.join(output_dir, 'kqa-questions.jsonl')

# Give every record a sequential 1-based 'id' and write all of them in one
# pass. The file is opened once in 'w' mode so that re-running this cell
# rewrites the file; appending inside the loop would add a second copy of
# every record (with repeated ids) on each re-run.
with open(output_path, 'w', encoding='utf-8') as file:
    for question_id, d in enumerate(data, start=1):
        d['id'] = question_id
        json.dump(d, file)
        file.write('\n')

In [None]:
# Read the question file back in and show how many records it holds.
with open(output_path, 'r') as handle:
    data = list(map(json.loads, map(str.strip, handle)))

len(data)

# Sample 5 few-shot examples and 100 questions for annotation

In [3]:
# Load the question/answer pairs that the samples below are drawn from.
# UTF-8 is requested explicitly, matching how this same file is read in the
# pilot section of the notebook, instead of relying on the platform default.
with open(os.path.join(data_dir, 'kqa_qa_pairs.jsonl'), 'r', encoding='utf-8') as f:
    data = [json.loads(line.strip()) for line in f]

In [None]:
# Set random seed
random_state = 23

# Draw five examples at random
random.seed(random_state)
indexes_5 = random.sample(range(len(data)), 5)
sample_5 = [data[i] for i in indexes_5]
print(sample_5[3])

# Build the remaining pool by excluding the five sampled positions.
# Popping the indexes one by one from `data` shifts every later position
# after each pop, so it can remove the wrong records (or run past the end of
# the list). Filtering by position removes exactly the sampled records and
# leaves `data` untouched, so this cell can be re-run safely.
# NOTE(review): if the earlier pop-based version removed different records,
# a previously written sample_100.jsonl will not match this output — confirm.
sampled_positions = set(indexes_5)
remaining_data = [d for i, d in enumerate(data) if i not in sampled_positions]
print(len(remaining_data))

# Sample 100 records at random from the remaining pool
random.seed(random_state)
sample_100 = random.sample(remaining_data, 100)

In [5]:
# Write the five sampled examples as JSONL. The file is opened once in 'w'
# mode so re-running the cell replaces the file; append mode would leave a
# duplicated set of examples behind on every re-run.
with open(os.path.join(data_dir, 'sample_5.jsonl'), 'w', encoding='utf-8') as file:
    for d in sample_5:
        json.dump(d, file)
        file.write('\n')

In [6]:
# Write the 100 sampled records as JSONL. The file is opened once in 'w'
# mode so re-running the cell replaces the file; append mode would leave
# duplicated records behind on every re-run.
with open(os.path.join(data_dir, 'sample_100.jsonl'), 'w', encoding='utf-8') as file:
    for d in sample_100:
        json.dump(d, file)
        file.write('\n')

# Create prompt

In [None]:
# Load the stored examples back from disk and show the first one.
with open(os.path.join(data_dir, 'sample_5.jsonl'), 'r') as f:
    sample_5 = [json.loads(raw.strip()) for raw in f]
print(sample_5[0])

In [6]:
# Build the five-shot prompt from the first five entries of `sample_5`
# (each entry must provide 'Question' and 'Free_form_answer').
# NOTE(review): the wording contains "five example of" and ends with a bare
# "Answer:" line with no question slot. The text is left untouched here
# because any change alters what is sent to the model — confirm intent.
prompt = f'''You are a trained physician. Answer this question from a fellow clinician by providing correct, relevant, and safe information. Make sure to keep your answer under 270 words and do not hedge. Follow these steps:

Read the question.
Respond with medical information that is correct, relevant, and safe given the question asked.
Formulate your answer in less than 270 words and do not hedge!

Here are five example of correct, relevant, and safe answers to clinical questions:

Example 1:
Question: {sample_5[0]['Question']}
Answer: {sample_5[0]['Free_form_answer']}

Example 2:
Question: {sample_5[1]['Question']}
Answer: {sample_5[1]['Free_form_answer']}

Example 3:
Question: {sample_5[2]['Question']}
Answer: {sample_5[2]['Free_form_answer']}

Example 4:
Question: {sample_5[3]['Question']}
Answer: {sample_5[3]['Free_form_answer']}

Example 5:
Question: {sample_5[4]['Question']}
Answer: {sample_5[4]['Free_form_answer']}

Answer:
'''

# The prompt embeds free-text answers that may contain non-ASCII characters,
# so write it as UTF-8 explicitly rather than with the platform default
# encoding (which can fail or produce a differently encoded file).
with open(os.path.join(data_dir, 'five_shot_prompt.txt'), "w", encoding='utf-8') as file:
    file.write(prompt)

# From JSONL to CSV

In [10]:
# Convert the GPT-4 answer file from JSONL to a three-column CSV.
with open(os.path.join(output_dir, "kqa_answers_gpt4_five.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    data = [json.loads(line) for line in jsonl_file]

# Normalise keys to lower case so that e.g. 'Question' is found as 'question'.
lower_data = [{k.lower(): v for k, v in record.items()} for record in data]

selected_fields = ['id', 'question', 'answer']

# Select the columns from the lower-cased records. Reading them from the
# original `data` would look up 'question' on records keyed by 'Question'
# and leave that column empty in every row.
filtered_data = [{field: record.get(field, None) for field in selected_fields} for record in lower_data]

with open(os.path.join(output_dir, "kqa_answers_gpt4_five.csv"), 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=selected_fields)

    # Write header
    writer.writeheader()

    # Write rows
    writer.writerows(filtered_data)

# Create separate answer files

In [3]:
# Split the GPT-4 result file into one answer file per answer type.
with open(os.path.join(output_dir, "kqa_answers_gpt4_five.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    data = [json.loads(line) for line in jsonl_file]

# Which field of each record holds the answer text for each answer type.
answer_fields = {'gpt4': 'answer', 'physician': 'Free_form_answer'}

for answer_type, answer_field in answer_fields.items():

    # Open each output file once in 'w' mode: re-running the cell then
    # rewrites the file, whereas appending inside the loop would add a
    # duplicate of every answer on each re-run.
    with open(os.path.join(output_dir, f'{answer_type}_answers.jsonl'), 'w', encoding='utf-8') as file:

        # answer_n restarts at 0 for every answer type.
        for answer_n, d in enumerate(data):

            question_number = d['id']

            new_d = {'question_id': f'question_{question_number}',
                     'question': d['Question'],
                     'answer_id': f'{answer_type}_{answer_n}',
                     'answer': d[answer_field],
                     'answer_type': answer_type}

            json.dump(new_d, file)
            file.write('\n')

In [4]:
# Convert the Llama result file into the shared answer-file layout.
with open(os.path.join(output_dir, "kqa_answers_llama_five.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    data = [json.loads(line) for line in jsonl_file]

# Open the output once in 'w' mode: re-running the cell then rewrites the
# file, whereas appending inside the loop would add a duplicate of every
# answer on each re-run.
with open(os.path.join(output_dir, 'llama_answers.jsonl'), 'w', encoding='utf-8') as file:

    for answer_n, d in enumerate(data):

        question_number = d['id']

        new_d = {'question_id': f'question_{question_number}',
                 'question': d['Question'],
                 'answer_id': f'llama_{answer_n}',
                 'answer': d['answer'],
                 'answer_type': 'llama'}

        json.dump(new_d, file)
        file.write('\n')

# From JSON to annotator files

In [None]:
# Randomly hand out items to annotators until each annotator has 50.
# NOTE(review): `annotators` and `physicians` are not defined in the visible
# cells. From their use here, `annotators` maps an annotator to a list of
# assigned ids, and `physicians` is a sequence of dicts with an 'id' and an
# 'annotators' list — confirm where they are created.
random.seed(17)


def _can_assign(candidate, annotator, assignments):
    """Return True if `candidate` may still be assigned to `annotator`.

    A candidate qualifies when the annotator does not already have it and it
    currently has fewer than 4 annotators.
    """
    return (candidate['id'] not in assignments
            and annotator not in candidate['annotators']
            and len(candidate['annotators']) < 4)


for annotator, assignments in annotators.items():

    while len(assignments) < 50:
        # The draw is kept exactly as it was (sample three, take the first)
        # so the seeded sequence of draws is unchanged.
        sample = random.sample(physicians, 3)[0]
        if _can_assign(sample, annotator, assignments):
            assignments.append(sample['id'])
            sample['annotators'].append(annotator)
        elif not any(_can_assign(p, annotator, assignments) for p in physicians):
            # No candidate can satisfy the constraints any more; without this
            # check the loop would keep drawing forever.
            raise RuntimeError(
                f'No eligible items left for annotator {annotator!r}: '
                f'{len(assignments)} of 50 assigned.'
            )
            

## Create pilot files

In [3]:
# Full set of question/answer pairs, one JSON object per line.
og = []
with open(os.path.join(data_dir, "kqa_qa_pairs.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        og.append(json.loads(line))

In [4]:
# Records stored in sample_5.jsonl.
fiveshot = []
with open(os.path.join(data_dir, "sample_5.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        fiveshot.append(json.loads(line))

In [5]:
# Records stored in sample_100.jsonl.
annotations = []
with open(os.path.join(data_dir, "sample_100.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        annotations.append(json.loads(line))

In [6]:
# Ids already taken by the annotation sample and the five-shot examples;
# the cell output is the number of distinct ids among them.
used_ids = [record['id'] for record in annotations]
used_ids.extend(record['id'] for record in fiveshot)
len(set(used_ids))

105

In [7]:
# Candidate pool: every record whose id is not already in use. The ids are
# put in a set first so each membership test is a hash lookup instead of a
# scan over the whole list.
used_id_lookup = set(used_ids)
pool = [d for d in og if d['id'] not in used_id_lookup]

# Set random seed
random_state = 23

# Draw 15 pilot examples at random
random.seed(random_state)
pilot_sample = random.sample(pool, 15)
print(pilot_sample[3])

{'Question': 'What is a neurological issue', 'Free_form_answer': "A neurological issue is a problem that affects the nervous system, which includes the brain, spinal cord, and peripheral nerves. The nervous system is in charge of many of our human abilities, such as our ability to speak, move, and think. Neurological issues are conditions where those abilities are impaired. Among the common neurological issues are stroke, epilepsy, Multiple sclerosis, Alzheimer's disease, and Peripheral neuropathy. ", 'Must_have': ['A neurological issue is a problem that affects the nervous system.', ' Neurological issues are conditions where abilities such as speaking, moving, thinking are impaired.'], 'Nice_to_have': ['Epilepsy is a neurological issue.', ' Stroke is a neurological issue.', " Alzheimer's disease is a neurological issue.", ' Peripheral neuropathy is a neurological issue.', ' The nervous system is in charge of human abilities such as the ability to speak, move, think.', ' The nervous sy

In [8]:
# Sanity check: the used ids together with the pilot ids; the cell output
# is the number of distinct ids among them.
check = [*used_ids, *(record['id'] for record in pilot_sample)]
len(set(check))

120

In [9]:
# Write the pilot sample as JSONL. The file is opened once in 'w' mode so
# re-running the cell replaces the file; append mode would leave duplicated
# records behind on every re-run.
with open(os.path.join(data_dir, 'pilot_sample.jsonl'), 'w', encoding='utf-8') as file:
    for d in pilot_sample:
        json.dump(d, file)
        file.write('\n')