In [1]:
# Imports
import os
import json
import csv
import random
import spacy



In [2]:
# Working folders live next to wherever the notebook is executed from.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# unchanged here in case other cells refer to it.
dir = os.getcwd()
data_dir, output_dir = (os.path.join(dir, folder) for folder in ('data', 'output'))

# Create both folders up front; existing folders are left as they are.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

In [3]:
# Fixed seed so the shuffled annotator order is the same on every run.
random.seed(17)

# Six labels, "Annotator1" through "Annotator6", shuffled in place.
annotators = ["Annotator" + str(number) for number in range(1, 7)]
random.shuffle(annotators)
annotators

['Annotator1',
 'Annotator6',
 'Annotator2',
 'Annotator3',
 'Annotator4',
 'Annotator5']

In [4]:
# annotator 1 package (matches indexes_dict_0 below):

# gpt4 [0:50] coarse
# llama [50:100] coarse
# physician [0:50] coarse
# gpt4 [50:100] fine
# llama [0:50] fine
# physician [50:100] fine

In [5]:
# Two slice layouts, keyed by annotation type and then by answer source.
# Each value is a [start, stop] pair. The layouts mirror each other: the
# slices that are 'coarse' in one are 'fine' in the other.
indexes_dict_0 = dict(
    coarse=dict(gpt4=[0, 50], llama=[50, 100], physician=[0, 50]),
    fine=dict(gpt4=[50, 100], llama=[0, 50], physician=[50, 100]),
)

indexes_dict_1 = dict(
    coarse=dict(gpt4=[50, 100], llama=[0, 50], physician=[50, 100]),
    fine=dict(gpt4=[0, 50], llama=[50, 100], physician=[0, 50]),
)
    

In [6]:
def bold_sentences(text, nlp=None):
    """Return one highlighted copy of ``text`` per sentence.

    The text is split into sentences by a spaCy pipeline. For each sentence,
    a variant of the whole text is built in which only that sentence is
    wrapped in ``:orange[...]`` markup; the sentences are then re-joined with
    a single space.

    Args:
        text: The text to segment into sentences.
        nlp: Optional callable that takes ``text`` and returns an object with
            a ``sents`` iterable whose items expose ``.text`` (a loaded spaCy
            pipeline satisfies this). When omitted, ``en_core_web_sm`` is
            loaded on the first call and reused afterwards.

    Returns:
        A list of ``(sentence_index, highlighted_text)`` tuples in sentence
        order. The list is empty when no sentences are found.
    """
    if nlp is None:
        # Loading the model is expensive, so it is cached on the function
        # object and loaded once instead of on every call.
        nlp = getattr(bold_sentences, "_nlp", None)
        if nlp is None:
            nlp = bold_sentences._nlp = spacy.load("en_core_web_sm")

    # Process the text to segment it into sentences.
    sentences = [sentence.text for sentence in nlp(text).sents]

    fine_sentences = []
    for bold_index, sentence in enumerate(sentences):
        # Copy the sentence list and swap in the highlighted sentence.
        highlighted = sentences.copy()
        highlighted[bold_index] = f':orange[{sentence}]'
        fine_sentences.append((bold_index, ' '.join(highlighted)))

    return fine_sentences

# Small four-sentence sample used to demonstrate bold_sentences
text = (
    "This is the first sentence. Here's the second one! "
    "Is this the third sentence? Yes, it is."
)

# Print each (index, highlighted text) pair on its own line
for i in bold_sentences(text):
    print(i)


(0, ":orange[This is the first sentence.] Here's the second one! Is this the third sentence? Yes, it is.")
(1, "This is the first sentence. :orange[Here's the second one!] Is this the third sentence? Yes, it is.")
(2, "This is the first sentence. Here's the second one! :orange[Is this the third sentence?] Yes, it is.")
(3, "This is the first sentence. Here's the second one! Is this the third sentence? :orange[Yes, it is.]")


In [7]:
# Build the annotation packages: for every annotator, append the selected
# QA records to `<annotator>_coarse.jsonl` and `<annotator>_fine.jsonl` in
# the output folder.
#
# - 'coarse' records are the QA pair itself, tagged with the annotation type
#   and rated='No'.
# - 'fine' records are one copy of the QA pair per sentence of its answer,
#   with that sentence highlighted and a `sentence_id` of
#   `<answer_id>_<sentence index>`.
flag = 0
for annotator in annotators:

    print(annotator)

    # Annotators at even positions in the list use layout 0, odd positions
    # use layout 1.
    if (flag % 2) == 0:
        indexes_dictionary = indexes_dict_0
    else:
        indexes_dictionary = indexes_dict_1

    for annotation_type, answer_types in indexes_dictionary.items():

        print(f'Adding {annotation_type} annotations...')

        # Collect every record for this annotator/annotation type first so
        # the output file is opened once rather than once per record.
        records = []

        for answer_type, indexes in answer_types.items():

            print(annotation_type, answer_type, indexes)
            with open(os.path.join(data_dir, f"{answer_type}_answers.jsonl"), 'r', encoding='utf-8') as jsonl_file:
                data = [json.loads(line) for line in jsonl_file]

            selection = data[indexes[0]:indexes[1]]
            print(len(selection))

            for qa_pair in selection:

                if annotation_type == 'coarse':
                    qa_pair['annotation_type'] = annotation_type
                    qa_pair['rated'] = 'No'
                    records.append(qa_pair)

                elif annotation_type == 'fine':
                    sentences = bold_sentences(qa_pair['answer'])
                    for sentence_index, highlighted_answer in sentences:
                        # Copy the record: each sentence needs its own dict,
                        # and the loaded QA pair must keep its original answer.
                        new_d = dict(qa_pair)
                        new_d['annotation_type'] = annotation_type
                        new_d['rated'] = 'No'
                        new_d['sentence_id'] = new_d['answer_id'] + f'_{sentence_index}'
                        new_d['answer'] = highlighted_answer
                        records.append(new_d)

        if records:
            output_path = os.path.join(output_dir, f'{annotator.lower()}_{annotation_type}.jsonl')
            # NOTE(review): append mode is kept from the original workflow, so
            # re-running this cell adds the same records again. Clear the
            # output folder before regenerating, or confirm that truncating
            # ('w') is safe for files that may already hold ratings.
            with open(output_path, 'a', encoding='utf-8') as file:
                for record in records:
                    json.dump(record, file)
                    file.write('\n')

    flag += 1

Annotator1
Adding coarse annotations...
coarse gpt4 [0, 50]
50
coarse llama [50, 100]
50
coarse physician [0, 50]
50
Adding fine annotations...
fine gpt4 [50, 100]
50
fine llama [0, 50]
50
fine physician [50, 100]
50
Annotator6
Adding coarse annotations...
coarse gpt4 [50, 100]
50
coarse llama [0, 50]
50
coarse physician [50, 100]
50
Adding fine annotations...
fine gpt4 [0, 50]
50
fine llama [50, 100]
50
fine physician [0, 50]
50
Annotator2
Adding coarse annotations...
coarse gpt4 [0, 50]
50
coarse llama [50, 100]
50
coarse physician [0, 50]
50
Adding fine annotations...
fine gpt4 [50, 100]
50
fine llama [0, 50]
50
fine physician [50, 100]
50
Annotator3
Adding coarse annotations...
coarse gpt4 [50, 100]
50
coarse llama [0, 50]
50
coarse physician [50, 100]
50
Adding fine annotations...
fine gpt4 [0, 50]
50
fine llama [50, 100]
50
fine physician [0, 50]
50
Annotator4
Adding coarse annotations...
coarse gpt4 [0, 50]
50
coarse llama [50, 100]
50
coarse physician [0, 50]
50
Adding fine ann

In [8]:
# Inspect the last 'fine' record built by the generation loop; `new_d` is the
# variable left over from that loop's final iteration.
new_d

{'question_id': 'question_49',
 'question': 'Does low thyroid contribute to weight gain?',
 'answer_id': 'physician_49',
 'answer': 'When thyroid function is low, it can lead to a reduction in metabolic rate and modest weight gain. However, this weight gain is typically only around 5-10 pounds and is not commonly associated with severe obesity. Treating and balancing thyroid function can help with weight loss. :orange[If you suspect you have low thyroid function, it is important to consult your healthcare provider for assessment and treatment.]',
 'answer_type': 'physician',
 'annotation_type': 'fine',
 'rated': 'No',
 'sentence_id': 'physician_49_3'}

In [19]:
# Spot check: list the answer ids in annotator3's coarse package that belong
# to question number 145.
coarse_path = os.path.join(output_dir, "annotator3_coarse.jsonl")
with open(coarse_path, 'r', encoding='utf-8') as jsonl_file:
    data = list(map(json.loads, jsonl_file))

for d in data:
    # question_id has the form "<prefix>_<number>"; compare on the number.
    question_number = int(d['question_id'].split('_')[1])
    if question_number == 145:
        print(d['answer_id'])

gpt4_93
physician_93
