In [10]:
# Imports
import yaml
import sys

import pandas as pd
import numpy as np
import rubrix as rb

from yaml import Loader

sys.path.insert(0, '../')
from ids import rubrix_api_key
from tqdm.auto import tqdm
from collections import defaultdict


# Allow up to 100 characters per cell when DataFrames are displayed
pd.set_option('display.max_colwidth', 100)
# Connect the rubrix client to the local server using the key imported from ids
rb.init("http://localhost:6900/", api_key=rubrix_api_key)




In [3]:
# Load users
# The context manager closes the file handle as soon as parsing is done.
# NOTE(review): yaml.load with the full Loader can construct arbitrary Python
# objects; if .users.yaml only holds plain mappings/lists, yaml.safe_load is safer.
with open('../.users.yaml', 'r') as users_file:
    users = yaml.load(users_file, Loader)

# Only users that carry a 'credits' entry are treated as active annotators
u2cred = {u['username']: u['credits'] for u in users if 'credits' in u}
active_users = set(u2cred)

# Specify how many classes should be present in data
n_classes = 5

# Fix the index of each class
label_to_class_num = {'study_drug': 0, 'target_disease': 1, 'control_group': 2,  'population_size': 3, 'quantitative_effect_measure': 4}
class_num_to_label = {i:l for l, i in label_to_class_num.items()}

In [4]:
# Collect the main dataset and every QC dataset of each credited user
dfs = []
for user in tqdm(users):
    # Users without a 'credits' entry are skipped entirely
    if 'credits' in user:
        workspaces = user['workspaces']

        # The first workspace carrying the main-task prefix is the user's dataset
        dataset_name = [ws for ws in workspaces if ws.startswith('cancer_stage_1')][0]
        df = rb.load(name=dataset_name)

        # Load all QC workspaces before anything is appended
        qc_dfs = []
        for qc_name in workspaces:
            if qc_name.startswith('qc_stage_1'):
                qc_dfs.append(rb.load(name=qc_name))

        dfs += [df] + qc_dfs


# Stack all user-level frames into a single frame
all_abstracts = pd.concat(dfs)

  0%|          | 0/26 [00:00<?, ?it/s]

In [7]:
# Keep only annotated (validated) abstracts.
# .copy() detaches the filtered frame so the column assignments below write to
# an independent frame instead of triggering SettingWithCopyWarning.
all_abstracts = all_abstracts.query('status == "Validated"').copy()

# Add relevant text fields
all_abstracts['pmid'] = all_abstracts['metadata'].map(lambda x: x['pmid'])
all_abstracts['text'] = all_abstracts['inputs'].map(lambda x: x['text'])

# Get all used labels
all_annotated_labels = {label for labels in all_abstracts['annotation'] for label in labels}

# Get unique, reproducible label vector for each abstract:
# position i is 1 when the label with class number i was annotated, else 0
all_abstracts['labels'] = all_abstracts['annotation'].map(
    lambda annotated: np.array([1 if class_num_to_label[i] in annotated else 0
                                for i in range(n_classes)]))
# Tuples are hashable, which drop_duplicates/groupby require
all_abstracts['label_tuple'] = all_abstracts['labels'].map(lambda x: tuple(x))
df = all_abstracts[['pmid','annotation_agent','label_tuple','text']].drop_duplicates()


In [15]:
# Cutoff for proportion agreement to call correct
CUTOFF = .75
# Number of additional annotators drawn for every QC abstract
N_QC_ANNOTATORS = 2
# Number of maximal-disagreement abstracts that are sent to every annotator
N_TEST_PMIDS = 25
# Seed for all random draws in this cell so a re-run yields the same assignment
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)


def _pick_annotators(candidates):
    """Draw up to N_QC_ANNOTATORS distinct users, weighted by their credits.

    Parameters
    ----------
    candidates : list of str
        Usernames that have not yet annotated the abstract.

    Returns
    -------
    list of str
        Sorted usernames; fewer than N_QC_ANNOTATORS when there are not enough
        candidates, and empty when there are none.
    """
    if not candidates:
        return []
    weights = np.array([u2cred[name] for name in candidates], dtype=float)
    # replace=False guarantees that the same user is never drawn twice for one
    # abstract. NOTE(review): raises if fewer candidates than requested have a
    # non-zero credit weight -- assumes credits are positive.
    chosen = rng.choice(candidates,
                        size=min(N_QC_ANNOTATORS, len(candidates)),
                        replace=False,
                        p=weights / weights.sum())
    return sorted(chosen.tolist())


# Get data on who annotated each abstract, as well how many unique annotation patterns were used
pmid_label_counts = df.groupby(['pmid', 'label_tuple']).agg(annotation_agent=('annotation_agent', lambda x: set(x)),
                                                                text=('text','first')).reset_index()
pmid_label_counts['num_annotators'] = pmid_label_counts['annotation_agent'].map(len)

# Get counts of how many times each PMID was annotated and who annotated it
# Also get lists of who has NOT annotated each PMID
already_assigned = df.groupby('pmid').agg({'annotation_agent': lambda x: set(x)}).to_dict()['annotation_agent']
pmid_counts = {k: len(v) for k, v in already_assigned.items()}
possible_assignees = {pmid: active_users - prev for pmid, prev in already_assigned.items()}

# Abstract is "correct" if one label pattern was chosen by at least CUTOFF of
# its annotators and by more than one annotator.
pmid_label_counts['total'] = pmid_label_counts['pmid'].map(pmid_counts)
pmid_label_counts['proportion'] = pmid_label_counts['num_annotators']/pmid_label_counts['total']
correct_pmids = set(pmid_label_counts.query('(proportion >= @CUTOFF) & (num_annotators > 1)').pmid.tolist())
qc_pmids = set(pmid_label_counts.pmid) - correct_pmids

# Difficulty = number of distinct annotation patterns divided by number of total annotators
difficulty = pmid_label_counts.groupby('pmid').agg(distinct_annotations=('proportion','count'), total_annotators=('num_annotators','sum')).reset_index()
difficulty['score'] = difficulty['distinct_annotations']/difficulty['total_annotators']

# Pick up to N_TEST_PMIDS pmids with maximal disagreement (or only one annotation)
# and assign to everyone. The sample size is capped so that the cell does not
# fail when fewer candidates exist.
max_disagreement = difficulty.query('score == 1')
test_pmids = max_disagreement.sample(min(N_TEST_PMIDS, len(max_disagreement)),
                                     random_state=RANDOM_SEED).pmid.tolist()


# Randomly assign new annotators to QC abstracts
qc_df = all_abstracts.loc[all_abstracts['pmid'].isin(qc_pmids),
               ['pmid','prediction','multi_label','metadata','text']].groupby('pmid').agg('first').reset_index()
# Sorting removes the dependence on set iteration order, so the seeded draw is reproducible
qc_df['possible_assignees'] = qc_df['pmid'].map(lambda pmid: sorted(possible_assignees[pmid]))
qc_df['assigned'] = qc_df['possible_assignees'].map(_pick_annotators)

# Test abstracts go to every active user plus the three listed accounts
all_users = sorted(active_users) + ['haydnturner','dongyu_zhang','davidkartchner']
test_pmid_set = set(test_pmids)
qc_df['assigned'] = pd.Series(
    [all_users if pmid in test_pmid_set else assigned
     for pmid, assigned in zip(qc_df['pmid'], qc_df['assigned'])],
    index=qc_df.index, dtype=object)


In [16]:
# Number of PMIDs that met the agreement criterion
len(correct_pmids)

1242

In [12]:
# Keep a list of "correct" annotations
# The file is opened in append mode, so earlier entries are preserved.
correct_logfile = '/nethome/dkartchner3/annotation/rubrix/pmid_logs/correct.txt'
with open(correct_logfile, 'a') as f:
    # Sorted by string form so that the written order does not depend on set iteration order
    f.writelines(str(pmid) + '\n' for pmid in sorted(correct_pmids, key=str))



In [13]:
# Map each username to the list of abstract records that user has to annotate
qc_df = qc_df.rename({'text':'inputs'}, axis=1)
record_columns = ['prediction','multi_label','metadata','inputs']
abstract_records = qc_df[record_columns].to_dict(orient='records')
assigned_per_abstract = qc_df['assigned'].tolist()

assignments = defaultdict(list)
for position, abstract in enumerate(abstract_records):
    # Every user assigned to this abstract receives the same record dict
    for user in assigned_per_abstract[position]:
        assignments[user].append(abstract)

In [14]:
# Show how many abstracts each user was given
print(dict((username, len(records)) for username, records in assignments.items()))

{'kzheng': 115, 'mramirezmartin': 155, 'echang': 104, 'jvasquez': 199, 'sfraga': 221, 'asamadi': 171, 'prumyantseva': 140, 'lcheng': 150, 'apatel': 195, 'bswitzer': 112, 'ashoemaker': 138, 'mmendoza': 137, 'dfeng': 136, 'kayral': 126, 'acarvalho': 154, 'kamstutz': 112, 'mgallimore': 188, 'lware': 143, 'ayigitkanli': 139, 'haydnturner': 25, 'dongyu_zhang': 25, 'davidkartchner': 25}


In [15]:
# Upload abstracts to rubrix
# Each user's records are logged to the dataset 'qc_round_2' inside that user's
# 'qc_stage_1_<username>' workspace; the uploaded PMIDs are then appended to a log file.
qc_logfile = '/nethome/dkartchner3/annotation/rubrix/pmid_logs/qc_uploaded.txt'
for user, abstract_list in tqdm(assignments.items()):
    workspace = 'qc_stage_1_' + user
    # NOTE(review): the client is re-initialised for every user before the
    # workspace switch -- presumably to reset client state; confirm it is needed.
    rb.init("http://localhost:6900/", api_key=rubrix_api_key)
    rb.set_workspace(workspace)
    pmids = [abstract['metadata']['pmid'] for abstract in abstract_list]
    records = [rb.TextClassificationRecord(**abstract) for abstract in abstract_list]
    rb.log(
        records=records,
        name='qc_round_2',
        tags={
            "task": "multilabel-text-classification",
            "family": "text-classification",
            "dataset": "spring_2022_stage_1_qc_round_2",
        },
    )

    # Only written after a successful upload; str() keeps this working for
    # non-string PMIDs.
    with open(qc_logfile, 'a') as f:
        f.writelines(str(pmid) + '\n' for pmid in pmids)


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

115 records logged to http://localhost:6900/ws/qc_stage_1_kzheng/qc_round_2


  0%|          | 0/155 [00:00<?, ?it/s]

155 records logged to http://localhost:6900/ws/qc_stage_1_mramirezmartin/qc_round_2


  0%|          | 0/104 [00:00<?, ?it/s]

104 records logged to http://localhost:6900/ws/qc_stage_1_echang/qc_round_2


  0%|          | 0/199 [00:00<?, ?it/s]

199 records logged to http://localhost:6900/ws/qc_stage_1_jvasquez/qc_round_2


  0%|          | 0/221 [00:00<?, ?it/s]

221 records logged to http://localhost:6900/ws/qc_stage_1_sfraga/qc_round_2


  0%|          | 0/171 [00:00<?, ?it/s]

171 records logged to http://localhost:6900/ws/qc_stage_1_asamadi/qc_round_2


  0%|          | 0/140 [00:00<?, ?it/s]

140 records logged to http://localhost:6900/ws/qc_stage_1_prumyantseva/qc_round_2


  0%|          | 0/150 [00:00<?, ?it/s]

150 records logged to http://localhost:6900/ws/qc_stage_1_lcheng/qc_round_2


  0%|          | 0/195 [00:00<?, ?it/s]

195 records logged to http://localhost:6900/ws/qc_stage_1_apatel/qc_round_2


  0%|          | 0/112 [00:00<?, ?it/s]

112 records logged to http://localhost:6900/ws/qc_stage_1_bswitzer/qc_round_2


  0%|          | 0/138 [00:00<?, ?it/s]

138 records logged to http://localhost:6900/ws/qc_stage_1_ashoemaker/qc_round_2


  0%|          | 0/137 [00:00<?, ?it/s]

137 records logged to http://localhost:6900/ws/qc_stage_1_mmendoza/qc_round_2


  0%|          | 0/136 [00:00<?, ?it/s]

136 records logged to http://localhost:6900/ws/qc_stage_1_dfeng/qc_round_2


  0%|          | 0/126 [00:00<?, ?it/s]

126 records logged to http://localhost:6900/ws/qc_stage_1_kayral/qc_round_2


  0%|          | 0/154 [00:00<?, ?it/s]

154 records logged to http://localhost:6900/ws/qc_stage_1_acarvalho/qc_round_2


  0%|          | 0/112 [00:00<?, ?it/s]

112 records logged to http://localhost:6900/ws/qc_stage_1_kamstutz/qc_round_2


  0%|          | 0/188 [00:00<?, ?it/s]

188 records logged to http://localhost:6900/ws/qc_stage_1_mgallimore/qc_round_2


  0%|          | 0/143 [00:00<?, ?it/s]

143 records logged to http://localhost:6900/ws/qc_stage_1_lware/qc_round_2


  0%|          | 0/139 [00:00<?, ?it/s]

139 records logged to http://localhost:6900/ws/qc_stage_1_ayigitkanli/qc_round_2


  0%|          | 0/25 [00:00<?, ?it/s]

25 records logged to http://localhost:6900/ws/qc_stage_1_haydnturner/qc_round_2


  0%|          | 0/25 [00:00<?, ?it/s]

25 records logged to http://localhost:6900/ws/qc_stage_1_dongyu_zhang/qc_round_2


  0%|          | 0/25 [00:00<?, ?it/s]

25 records logged to http://localhost:6900/ws/qc_stage_1_davidkartchner/qc_round_2
