In [1]:
# Imports
import os
import time
import json
import csv
import random
import spacy



In [2]:
# Resolve the input/output folders relative to the current working directory
# and make sure both exist before anything is read or written.
# NOTE: `dir` shadows the built-in of the same name; the name is kept as-is.
dir = os.getcwd()
data_dir, output_dir = (os.path.join(dir, folder_name) for folder_name in ('data', 'output'))
for folder in (data_dir, output_dir):
    os.makedirs(folder, exist_ok=True)

In [3]:
# Seed the global RNG so the annotator order (and every later shuffle) is reproducible.
random.seed(17)

# Six annotator labels, "Annotator1" .. "Annotator6", shuffled in place.
annotators = [f"Annotator{number}" for number in range(1, 7)]
random.shuffle(annotators)
annotators

['Annotator1',
 'Annotator6',
 'Annotator2',
 'Annotator3',
 'Annotator4',
 'Annotator5']

In [4]:
# Two complementary assignments of answer-index ranges to annotation types.
# Each value is a [start, stop] pair; the generation loop passes it to
# range(start, stop), so `stop` itself is excluded.
indexes_dict_0 = dict(coarse=[0, 50], fine=[50, 100])
indexes_dict_1 = dict(coarse=[50, 100], fine=[0, 50])

In [5]:
# Cache of loaded spaCy pipelines keyed by model name, so the slow model load
# happens once per kernel session instead of once per bold_sentences() call.
_SPACY_PIPELINES = {}


def _get_sentence_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def bold_sentences(text, nlp=None):
    """Produce one highlighted variant of ``text`` per sentence.

    The text is split into sentences; for every sentence a copy of the full
    text is built in which that single sentence is wrapped in the
    ``:orange[...]`` markup and all sentences are re-joined with one space.

    Args:
        text: The text to segment.
        nlp: Optional callable that takes the text and returns an object whose
            ``sents`` attribute iterates over spans exposing ``.text`` (e.g. an
            already-loaded spaCy pipeline). When omitted, the cached
            ``en_core_web_sm`` pipeline is used.

    Returns:
        A list of ``(sentence_index, highlighted_text)`` tuples, one per
        sentence, in sentence order. Empty when no sentences are found.
    """
    if nlp is None:
        nlp = _get_sentence_pipeline()

    # Segment the text into sentences
    doc = nlp(text)
    sentences = [sentence.text for sentence in doc.sents]

    fine_sentences = []
    for bold_index, sentence in enumerate(sentences):
        # Work on a copy so only the current sentence carries the markup.
        highlighted = sentences.copy()
        highlighted[bold_index] = f':orange[{sentence}]'
        fine_sentences.append((bold_index, ' '.join(highlighted)))

    return fine_sentences

# Small demo input made of four short sentences.
text = "This is the first sentence. Here's the second one! Is this the third sentence? Yes, it is."

# Print every highlighted variant of the demo text, one (index, text) tuple per line.
for highlighted_variant in bold_sentences(text):
    print(highlighted_variant)


(0, ":orange[This is the first sentence.] Here's the second one! Is this the third sentence? Yes, it is.")
(1, "This is the first sentence. :orange[Here's the second one!] Is this the third sentence? Yes, it is.")
(2, "This is the first sentence. Here's the second one! :orange[Is this the third sentence?] Yes, it is.")
(3, "This is the first sentence. Here's the second one! Is this the third sentence? :orange[Yes, it is.]")


In [6]:
# Read gpt4_answers.jsonl: one JSON object per line.
with open(os.path.join(data_dir, "gpt4_answers.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
    gpt4_answers = [json.loads(line) for line in jsonl_file if line.strip()]

In [7]:
# Read llama_answers.jsonl: one JSON object per line.
with open(os.path.join(data_dir, "llama_answers.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
    llama_answers = [json.loads(line) for line in jsonl_file if line.strip()]

In [8]:
# Read physician_answers.jsonl: one JSON object per line.
with open(os.path.join(data_dir, "physician_answers.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
    physician_answers = [json.loads(line) for line in jsonl_file if line.strip()]

In [9]:
# Build one JSONL task file per (annotator, annotation type).
#
# For every answer index in the annotator's range, the three answers
# (gpt4 / llama / physician) are shuffled and written out:
#   * 'coarse': one record per answer;
#   * 'fine':   one record per sentence of the answer, with that sentence
#               highlighted and a 'sentence_id' of the form '<answer_id>_<index>'.
for annotator_position, annotator in enumerate(annotators):

    print(annotator)

    # The first three annotators (in shuffled order) use indexes_dict_0,
    # the remaining ones use the mirrored indexes_dict_1.
    indexes_dictionary = indexes_dict_0 if annotator_position < 3 else indexes_dict_1

    print(indexes_dictionary)

    for annotation_type, indexes in indexes_dictionary.items():

        print(f'Adding {annotation_type} annotations, indexes: {indexes}')

        output_path = os.path.join(output_dir, f'{annotator.lower()}_{annotation_type}.jsonl')

        # Open once per file in write mode: each file is produced entirely by
        # this inner loop, so truncating it keeps the cell idempotent
        # (append mode duplicated every record when the cell was re-run).
        with open(output_path, 'w', encoding='utf-8') as file:

            for n in range(indexes[0], indexes[1]):

                qa_pairs = [gpt4_answers[n], llama_answers[n], physician_answers[n]]
                # Always shuffle, even if the entry is skipped below, so the
                # sequence of RNG draws does not depend on the data.
                random.shuffle(qa_pairs)

                question_ids = [qa_pair['question_id'] for qa_pair in qa_pairs]
                if not (question_ids[0] == question_ids[1] == question_ids[2]):
                    # The three sources are misaligned at this index; skip it,
                    # but say so instead of dropping it silently.
                    print(f'WARNING: skipping index {n}, question_id mismatch: {question_ids}')
                    continue

                for qa_pair in qa_pairs:

                    if annotation_type == 'coarse':
                        # Build a new dict rather than mutating the shared source record.
                        coarse_record = {**qa_pair, 'annotation_type': annotation_type, 'rated': 'No'}
                        json.dump(coarse_record, file)
                        file.write('\n')

                    elif annotation_type == 'fine':
                        for sentence_index, highlighted_answer in bold_sentences(qa_pair['answer']):
                            new_d = qa_pair.copy()
                            new_d['annotation_type'] = annotation_type
                            new_d['rated'] = 'No'
                            new_d['sentence_id'] = new_d['answer_id'] + f'_{sentence_index}'
                            new_d['answer'] = highlighted_answer
                            json.dump(new_d, file)
                            file.write('\n')

Annotator1
{'coarse': [0, 50], 'fine': [50, 100]}
Adding coarse annotations, indexes: [0, 50]
Adding fine annotations, indexes: [50, 100]
Annotator6
{'coarse': [0, 50], 'fine': [50, 100]}
Adding coarse annotations, indexes: [0, 50]
Adding fine annotations, indexes: [50, 100]
Annotator2
{'coarse': [0, 50], 'fine': [50, 100]}
Adding coarse annotations, indexes: [0, 50]
Adding fine annotations, indexes: [50, 100]
Annotator3
{'coarse': [50, 100], 'fine': [0, 50]}
Adding coarse annotations, indexes: [50, 100]
Adding fine annotations, indexes: [0, 50]
Annotator4
{'coarse': [50, 100], 'fine': [0, 50]}
Adding coarse annotations, indexes: [50, 100]
Adding fine annotations, indexes: [0, 50]
Annotator5
{'coarse': [50, 100], 'fine': [0, 50]}
Adding coarse annotations, indexes: [50, 100]
Adding fine annotations, indexes: [0, 50]


In [None]:
# Debug inspection: display the last 'fine' record built by the generation loop.
# NOTE(review): relies on `new_d` leaking out of that loop, so this cell fails
# on a fresh kernel unless the loop cell has already run.
new_d

In [None]:
# Spot check: reload one generated file and list the answer ids stored for
# the question whose id has 145 as its second '_'-separated field.
coarse_path = os.path.join(output_dir, "annotator3_coarse.jsonl")
with open(coarse_path, 'r', encoding='utf-8') as jsonl_file:
    data = list(map(json.loads, jsonl_file))

for record in data:
    question_number = int(record['question_id'].split('_')[1])
    if question_number == 145:
        print(record['answer_id'])