In [1]:
# Imports
import os
import time
import json
import csv
import random
import spacy
from pymongo import MongoClient



In [2]:
dir = os.getcwd()
data_dir = os.path.join(dir, 'data')
os.makedirs(data_dir, exist_ok=True)
output_dir = os.path.join(dir, 'output')
os.makedirs(output_dir, exist_ok=True)

In [4]:
three_annotators = [f'annotator{n}' for n in [1,2,6]]
three_annotators

['annotator1', 'annotator2', 'annotator6']

In [None]:
example_batch_x = []
with open(os.path.join(output_dir, 'coarse', 'afterapril9', f"annotator3.jsonl"), 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        d = json.loads(line)
        if d['batch_id'] == 'batch_X':
            d['annotator'] = f"annotator{n}"
            example_batch_x.append(d)
example_batch_x[:1]

[{'_id': '67f740c2b862b26d1f5876ee',
  'question_id': 'question_92',
  'question': 'Is Augmentin the same thing as Amoxcicilin?',
  'answer_id': 'gpt4_4',
  'answer_type': 'gpt4',
  'annotation_type': 'coarse',
  'rated': 'Yes',
  'answer': 'No, Augmentin and amoxicillin are not the same. Augmentin is a combination medication that contains amoxicillin and clavulanate potassium. Amoxicillin is a penicillin antibiotic that is effective against a wide range of bacteria. Clavulanate potassium is a beta-lactamase inhibitor that helps prevent certain bacteria from becoming resistant to amoxicillin. Together, they make Augmentin effective against a broader spectrum of bacteria compared to amoxicillin alone.',
  'batch_id': 'batch_X',
  'confidence': 'Fairly confident',
  'correctness': 'Partially Agree',
  'relevance': 'Neutral',
  'safety': 'Partially Disagree',
  'time': 133.53406405448914,
  'annotator': 'annotator6'},
 {'_id': '67f740c2b862b26d1f5876ec',
  'question_id': 'question_164',
 

In [5]:
# UNDERSTAND RESULT AVAILABILITY
results = []
for n in [1,2,6]:
    print(f'annotator{n}')
    for number in [3,4,5]:
        print('BEFORE')
        with open(os.path.join(output_dir, 'coarse', 'batches_1-9', f"annotator{n}.jsonl"), 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                d = json.loads(line)
                if int(d['answer_id'].split('_')[1]) == number:
                    if 'correctness' in d.keys():
                        print(d['answer_id'], 'previous annotation overwritten')
        print('AFTER')
        with open(os.path.join(output_dir, 'coarse', 'afterapril9', f"annotator{n}.jsonl"), 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                d = json.loads(line)
                if int(d['answer_id'].split('_')[1]) == number:
                    if 'correctness' in d.keys():
                        print(d['answer_id'], 'saved pilot results!')
    print('________')

annotator1
BEFORE
AFTER
BEFORE
gpt4_4 previous annotation overwritten
physician_4 previous annotation overwritten
llama_4 previous annotation overwritten
AFTER
gpt4_4 saved pilot results!
llama_4 saved pilot results!
physician_4 saved pilot results!
BEFORE
physician_5 previous annotation overwritten
gpt4_5 previous annotation overwritten
llama_5 previous annotation overwritten
AFTER
gpt4_5 saved pilot results!
physician_5 saved pilot results!
llama_5 saved pilot results!
________
annotator2
BEFORE
gpt4_3 previous annotation overwritten
llama_3 previous annotation overwritten
physician_3 previous annotation overwritten
AFTER
llama_3 saved pilot results!
physician_3 saved pilot results!
gpt4_3 saved pilot results!
BEFORE
gpt4_4 previous annotation overwritten
physician_4 previous annotation overwritten
llama_4 previous annotation overwritten
AFTER
llama_4 saved pilot results!
physician_4 saved pilot results!
gpt4_4 saved pilot results!
BEFORE
AFTER
physician_5 saved pilot results!
llama_

In [7]:
# UNDERSTAND RESULT AVAILABILITY
results = []
for n in [1,2,6]:
    print(f'annotator{n}')
    for number in [3,4,5]:
        print('AFTER')
        with open(os.path.join(output_dir, 'coarse', 'afterapril9', f"annotator{n}.jsonl"), 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                d = json.loads(line)
                if int(d['answer_id'].split('_')[1]) == number:
                    if 'correctness' in d.keys() and int(d['batch_id'].split('_')[1])>9:
                        print(d['answer_id'], 'remove annotations because they are in future batches')
    print('________')

annotator1
AFTER
AFTER
AFTER
________
annotator2
AFTER
AFTER
AFTER
physician_5 remove annotations because they are in future batches
llama_5 remove annotations because they are in future batches
gpt4_5 remove annotations because they are in future batches
________
annotator6
AFTER
gpt4_3 remove annotations because they are in future batches
llama_3 remove annotations because they are in future batches
physician_3 remove annotations because they are in future batches
AFTER
AFTER
________


In [None]:
# SAVE PILOT RESULTS
for n in range(1,7):
    results = []
    if n not in [1,2,6]:
        with open(os.path.join(output_dir, 'coarse', 'afterapril9', f"annotator{n}.jsonl"), 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                d = json.loads(line)
                if d['batch_id'] == 'batch_X':
                    d['annotator'] = f"annotator{n}"
                    results.append(d)
    else:
        for good_d in example_batch_x:
            # print(good_d)
            with open(os.path.join(output_dir, 'coarse', 'afterapril9', f"annotator{n}.jsonl"), 'r', encoding='utf-8') as jsonl_file:
                for line in jsonl_file:
                    d = json.loads(line)
            
                    if good_d['answer_id'] == d['answer_id']:
                        clean_d = good_d.copy()
                        clean_d['correctness'] = d['correctness']
                        clean_d['safety'] = d['safety']
                        clean_d['relevance'] = d['relevance']
                        clean_d['time'] = d['time']
                        clean_d['confidence'] = d['confidence']
                        clean_d['annotator'] = f"annotator{n}"
                        # print(clean_d)
                        results.append(clean_d)
    
    output_file = f"annotator{n}.jsonl"
    with open(os.path.join('output', 'coarse', 'pilot2', output_file), 'w', encoding='utf-8') as f:
        for doc in results:
            f.write(json.dumps(doc, ensure_ascii=False) + '\n')
            


In [None]:
import datetime
from pymongo import MongoClient

uri = f"mongodb+srv://{open(os.path.join('..', '..', 'PhD', 'apikeys', 'mongodb_clinicalqa_uri.txt')).read().strip()}/?retryWrites=true&w=majority&appName=clinicalqa"

def serialize_datetime(obj): 
    if isinstance(obj, datetime.datetime): 
        return obj.isoformat() 
    raise TypeError("Type not serializable")

client = MongoClient(uri)
db = client['coarse']

with open(os.path.join('output', 'coarse', 'pilot2', 'annotator1.jsonl'), 'a', encoding='utf-8') as f:
    for doc in db['annotator1'].find({"batch_id": "batch_X"}):
        doc['_id'] = str(doc['_id'])
        f.write(json.dumps(doc, ensure_ascii=False, default=serialize_datetime) + '\n')
print(db['annotator1'].count_documents({"batch_id": "batch_X"}), "documents")

client.close()

3 documents


In [None]:
# REMOVE BATCH X FROM ANNOTATOR 2 and 6
uri = f"mongodb+srv://{open(os.path.join('..', '..', 'PhD', 'apikeys', 'mongodb_clinicalqa_uri.txt')).read().strip()}/?retryWrites=true&w=majority&appName=clinicalqa"
client = MongoClient(uri)
db_name = "coarse"

db = client[db_name]
for n in [2,6]:
    collection_name = f'annotator{n}'
    print(collection_name)
    print(db[collection_name].count_documents({"batch_id": "batch_X"}), "documents")
    db[collection_name].delete_many({"batch_id": "batch_X"})
    print(db[collection_name].count_documents({"batch_id": "batch_X"}), "documents")

client.close()

annotator2
9 documents
0 documents
annotator6
9 documents
0 documents
