In [1]:
import yaml
import sys

import pandas as pd
import numpy as np
import rubrix as rb

from yaml import Loader

sys.path.insert(0, '../')
from ids import rubrix_api_key
from tqdm.auto import tqdm
from collections import defaultdict


pd.set_option('display.max_colwidth', 100)
rb.init("http://localhost:6900/", api_key=rubrix_api_key)


# Specify how many classes should be present in data
n_classes = 5


In [2]:
# Load users
users = yaml.load(open('../.users.yaml','r'), Loader)
active_users = set([u['username'] for u in users if 'credits' in u])
u2cred = {u['username']: u['credits'] for u in users if 'credits' in u}

In [3]:
dfs = []
for user in users:
    if 'credits' not in user:
        continue
    
    dataset_name = [x for x in user['workspaces'] if x.startswith('cancer_stage_1')][0]
    print(dataset_name)
    df = rb.load(name=dataset_name)
    dfs.append(df)
    print(df.status.value_counts())
    print('------------------------')


# Concatenate all user-level dfs together 
all_abstracts = pd.concat(dfs)

cancer_stage_1_acarvalho
Validated    252
Name: status, dtype: int64
------------------------
cancer_stage_1_apatel
Validated    617
Name: status, dtype: int64
------------------------
cancer_stage_1_asamadi
Validated    504
Name: status, dtype: int64
------------------------
cancer_stage_1_ashoemaker
Validated    252
Name: status, dtype: int64
------------------------
cancer_stage_1_ayigitkanli
Default      182
Validated     70
Name: status, dtype: int64
------------------------
cancer_stage_1_bswitzer
Validated    252
Name: status, dtype: int64
------------------------
cancer_stage_1_dfeng
Validated    252
Name: status, dtype: int64
------------------------
cancer_stage_1_echang
Validated    252
Name: status, dtype: int64
------------------------
cancer_stage_1_jvasquez
Validated    500
Name: status, dtype: int64
------------------------
cancer_stage_1_kamstutz
Validated    250
Discarded      2
Name: status, dtype: int64
------------------------
cancer_stage_1_kayral
Validated    252

In [4]:
# Add relevant text fields
all_abstracts['pmid'] = all_abstracts['metadata'].map(lambda x: x['pmid'])
all_abstracts['text'] = all_abstracts['inputs'].map(lambda x: x['text'])
all_abstracts.groupby('pmid').agg({'annotation_agent':'nunique', 'text':'nunique'}).sort_values(by='text').tail(50)

# Filter by annotated abstracts
df = all_abstracts.query('status == "Validated"')

# Get all used labels
all_annotated_labels = set([x for labels in df['annotation'] for x in labels])

# Get unique, reproducible label vector for each abstract
label_to_class_num = {l:i for i, l in enumerate(all_annotated_labels)}
class_num_to_label = {i:l for l, i in label_to_class_num.items()}
df['labels'] = df['annotation'].map(lambda x: np.array([1 if class_num_to_label[i] in x else 0 
                                                        for i in range(n_classes)]))
df['label_tuple'] = df['labels'].map(lambda x: tuple(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['labels'] = df['annotation'].map(lambda x: np.array([1 if class_num_to_label[i] in x else 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label_tuple'] = df['labels'].map(lambda x: tuple(x))


In [48]:
# Get data on who annotated each abstract, as well how many unique annotation patterns were used
pmid_annotation_counts = df.groupby('pmid').agg({'annotation_agent':lambda x: set([a for a in x]), 
                                                 'label_tuple':'nunique', 'text':'first'})
pmid_annotation_counts['num_annotators'] = pmid_annotation_counts['annotation_agent'].map(lambda x: len(x))

# Use annotation into to sort "correct" and "QC" abstracts
pmids_for_qc = set(pmid_annotation_counts.query('label_tuple > 1').index)
correct_pmids = set(pmid_annotation_counts.query('(label_tuple == 1) & (num_annotators > 1)').index)

In [35]:
# Keep a list of "correct" annotations
len(correct_pmids)
with open('/nethome/dkartchner3/annotation/rubrix/pmid_logs/correct.txt', 'a') as f:
    for pmid in correct_pmids:
        f.write(str(pmid))
        f.write('\n')

659

In [55]:
# Figure out who could be assigned to QC abstracts
already_assigned = pmid_annotation_counts['annotation_agent'].to_dict()
possible_assignees = {pmid: active_users - prev for pmid, prev in already_assigned.items()}

In [103]:
# Randomly assign 2 new annotators to QC abstracts
qc_df = df.loc[df.pmid.map(lambda x: x in pmids_for_qc), 
               ['pmid','prediction','multi_label','metadata','text']].groupby('pmid').agg('first').reset_index()
qc_df['possible_assignees'] = qc_df['pmid'].map(lambda x: list(possible_assignees[x]))
qc_df['assigned'] = qc_df['possible_assignees'].map(lambda x: sorted(np.random.choice(x, 
                                                                                      size=2, 
                                                                                      p=np.array([u2cred[i] for i in x], dtype=float)/np.array([u2cred[i] for i in x]).sum())))


In [104]:
# Make dictionary of assignments for users
assignments = defaultdict(list)
qc_df = qc_df.rename({'text':'inputs'}, axis=1)
for assigned_users, abstract in zip(qc_df['assigned'].tolist(), 
                   qc_df[['prediction','multi_label','metadata','inputs']].to_dict(orient='records')):
    for user in assigned_users:
        assignments[user].append(abstract)

In [105]:
print({key:len(val) for key, val in assignments.items()})

{'apatel': 113, 'lware': 106, 'sfraga': 165, 'asamadi': 175, 'jvasquez': 162, 'acarvalho': 96, 'mramirezmartin': 101, 'mmendoza': 101, 'ashoemaker': 85, 'echang': 76, 'kzheng': 71, 'kamstutz': 82, 'lcheng': 85, 'kayral': 85, 'mgallimore': 148, 'bswitzer': 89, 'dfeng': 89, 'prumyantseva': 107, 'ayigitkanli': 88}


In [110]:
# Upload abstracts to rubrix
qc_logfile = '/nethome/dkartchner3/annotation/rubrix/pmid_logs/qc_uploaded.txt'
for user, abstract_list in tqdm(assignments.items()):
    workspace = 'qc_stage_1_' + user
    rb.init("http://localhost:6900/", api_key=rubrix_api_key)
    rb.set_workspace(workspace)
    pmids = [abstract['metadata']['pmid'] for abstract in abstract_list]
    records = [rb.TextClassificationRecord(**abstract) for abstract in abstract_list]
    rb.log(
        records=records,
        name=workspace,
        tags={
            "task": "multilabel-text-classification",
            "family": "text-classification",
            "dataset": "spring_2022_stage_1_qc",
        },
    )
    
    with open(qc_logfile, 'a') as f:
        for r in pmids:
            f.write(r + '\n')


  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

113 records logged to http://localhost:6900/ws/qc_stage_1_apatel/qc_stage_1_apatel


  0%|          | 0/106 [00:00<?, ?it/s]

106 records logged to http://localhost:6900/ws/qc_stage_1_lware/qc_stage_1_lware


  0%|          | 0/165 [00:00<?, ?it/s]

165 records logged to http://localhost:6900/ws/qc_stage_1_sfraga/qc_stage_1_sfraga


  0%|          | 0/175 [00:00<?, ?it/s]

175 records logged to http://localhost:6900/ws/qc_stage_1_asamadi/qc_stage_1_asamadi


  0%|          | 0/162 [00:00<?, ?it/s]

162 records logged to http://localhost:6900/ws/qc_stage_1_jvasquez/qc_stage_1_jvasquez


  0%|          | 0/96 [00:00<?, ?it/s]

96 records logged to http://localhost:6900/ws/qc_stage_1_acarvalho/qc_stage_1_acarvalho


  0%|          | 0/101 [00:00<?, ?it/s]

101 records logged to http://localhost:6900/ws/qc_stage_1_mramirezmartin/qc_stage_1_mramirezmartin


  0%|          | 0/101 [00:00<?, ?it/s]

101 records logged to http://localhost:6900/ws/qc_stage_1_mmendoza/qc_stage_1_mmendoza


  0%|          | 0/85 [00:00<?, ?it/s]

85 records logged to http://localhost:6900/ws/qc_stage_1_ashoemaker/qc_stage_1_ashoemaker


  0%|          | 0/76 [00:00<?, ?it/s]

76 records logged to http://localhost:6900/ws/qc_stage_1_echang/qc_stage_1_echang


  0%|          | 0/71 [00:00<?, ?it/s]

71 records logged to http://localhost:6900/ws/qc_stage_1_kzheng/qc_stage_1_kzheng


  0%|          | 0/82 [00:00<?, ?it/s]

82 records logged to http://localhost:6900/ws/qc_stage_1_kamstutz/qc_stage_1_kamstutz


  0%|          | 0/85 [00:00<?, ?it/s]

85 records logged to http://localhost:6900/ws/qc_stage_1_lcheng/qc_stage_1_lcheng


  0%|          | 0/85 [00:00<?, ?it/s]

85 records logged to http://localhost:6900/ws/qc_stage_1_kayral/qc_stage_1_kayral


  0%|          | 0/148 [00:00<?, ?it/s]

148 records logged to http://localhost:6900/ws/qc_stage_1_mgallimore/qc_stage_1_mgallimore


  0%|          | 0/89 [00:00<?, ?it/s]

89 records logged to http://localhost:6900/ws/qc_stage_1_bswitzer/qc_stage_1_bswitzer


  0%|          | 0/89 [00:00<?, ?it/s]

89 records logged to http://localhost:6900/ws/qc_stage_1_dfeng/qc_stage_1_dfeng


  0%|          | 0/107 [00:00<?, ?it/s]

107 records logged to http://localhost:6900/ws/qc_stage_1_prumyantseva/qc_stage_1_prumyantseva


  0%|          | 0/88 [00:00<?, ?it/s]

88 records logged to http://localhost:6900/ws/qc_stage_1_ayigitkanli/qc_stage_1_ayigitkanli


In [61]:
label_counts = df[['pmid', 'label_tuple']].drop_duplicates().groupby('pmid').count()
example_pmids = set(label_counts.query('label_tuple > 3').index)

In [66]:
example_df[['inputs', 'prediction','metadata','multi_label']].to_dict(orient='records')

[{'inputs': {'text': 'Association between coronary heart disease and cancers of the breast, prostate, and colon.\nCoronary heart disease (CHD) and cancers of the breast, prostate, and colon are more common in industrialized countries than in the developing world, and to some degree, these conditions appear to share risk factors. To investigate whether there is an association between these cancers and a prior history of CHD, a hospital-based case-control study was conducted at Columbia-Presbyterian Medical Center in New York. The study was based on 252 breast cancer cases, 256 colorectal cancer cases, and 322 benign surgical controls, all of whom underwent biopsy or surgery between January 1989 and December 1992, and on 319 prostate cancer cases and 189 benign prostatic hypertrophy controls diagnosed between January 1984 and December 1986 (prior to widespread use of prostate-specific antigen screening). Medical records were reviewed on each, focusing on the preoperative anesthesia and s

In [78]:
example_df = df[df.pmid.map(lambda x: x in example_pmids)].groupby('pmid').agg("first")
example_df

Unnamed: 0_level_0,inputs,prediction,prediction_agent,annotation,annotation_agent,multi_label,explanation,id,metadata,status,event_timestamp,metrics,text,labels,label_tuple
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
14760377,{'text': 'Statin use and cancer risk in the General Practice Research Database. In a matched cas...,"[(population_size, 0.0), (quantitative_effect_measure, 0.0), (study_drug, 0.0), (control_group, ...",MITCH-LIN,"[target_disease, study_drug]",bswitzer,True,,9cab1c5d-d68c-4752-9ead-203ee36c3e2f,"{'substances': ['Hydroxymethylglutaryl-CoA Reductase Inhibitors'], 'substance_mesh_id': ['D01916...",Validated,,{},Statin use and cancer risk in the General Practice Research Database.\nIn a matched case-control...,"[0, 0, 1, 0, 1]","(0, 0, 1, 0, 1)"
15180615,{'text': 'Risk behaviours and benign prostatic hyperplasia. To identify risk factors for benign ...,"[(population_size, 0.0), (quantitative_effect_measure, 0.0), (study_drug, 0.0), (control_group, ...",MITCH-LIN,"[control_group, population_size, quantitative_effect_measure, target_disease]",apatel,True,,8800f8bc-37a7-4bbc-ac39-7609cfbc32d3,"{'substances': ['Anti-Inflammatory Agents, Non-Steroidal'], 'substance_mesh_id': ['D000894'], 'p...",Validated,,{},Risk behaviours and benign prostatic hyperplasia.\nTo identify risk factors for benign prostatic...,"[1, 1, 0, 1, 1]","(1, 1, 0, 1, 1)"
17235211,"{'text': 'Statin use and the risk of 10 cancers. Statins affect the proliferation, survival, and...","[(population_size, 0.0), (quantitative_effect_measure, 0.0), (study_drug, 0.0), (control_group, ...",MITCH-LIN,"[quantitative_effect_measure, study_drug, control_group]",apatel,True,,d91807bd-3660-4c8f-85d1-01bf6e1c51c6,"{'substances': ['Hydroxymethylglutaryl-CoA Reductase Inhibitors'], 'substance_mesh_id': ['D01916...",Validated,,{},"Statin use and the risk of 10 cancers.\nStatins affect the proliferation, survival, and migratio...","[1, 1, 1, 0, 0]","(1, 1, 1, 0, 0)"
17932357,{'text': 'Factors associated with human small aggressive non small cell lung cancer. Some non-sm...,"[(population_size, 0.0), (quantitative_effect_measure, 0.0), (study_drug, 0.0), (control_group, ...",MITCH-LIN,"[quantitative_effect_measure, target_disease]",apatel,True,,c200c661-4571-4406-992c-8f59294319da,"{'substances': ['Anti-Inflammatory Agents, Non-Steroidal', 'Ibuprofen'], 'substance_mesh_id': ['...",Validated,,{},Factors associated with human small aggressive non small cell lung cancer.\nSome non-small cell ...,"[0, 1, 0, 0, 1]","(0, 1, 0, 0, 1)"
24661226,{'text': 'Dose effect of thiazolidinedione on cancer risk in type 2 diabetes mellitus patients: ...,"[(population_size, 0.0), (quantitative_effect_measure, 0.0), (study_drug, 0.0), (control_group, ...",MITCH-LIN,"[quantitative_effect_measure, study_drug, control_group]",apatel,True,,4e3b5fe9-0ba7-4598-9cb8-e0b0f1dca4af,"{'substances': ['Hypoglycemic Agents', 'Thiazolidinediones'], 'substance_mesh_id': ['D007004', '...",Validated,,{},Dose effect of thiazolidinedione on cancer risk in type 2 diabetes mellitus patients: a six-year...,"[1, 1, 1, 0, 0]","(1, 1, 1, 0, 0)"
9796631,"{'text': 'Association between coronary heart disease and cancers of the breast, prostate, and co...","[(population_size, 0.0), (quantitative_effect_measure, 0.0), (study_drug, 0.0), (control_group, ...",MITCH-LIN,"[control_group, quantitative_effect_measure, target_disease]",apatel,True,,471ac7e0-9745-494d-a5f3-bde813dc74e3,"{'substances': ['Anti-Inflammatory Agents, Non-Steroidal', 'Aspirin'], 'substance_mesh_id': ['D0...",Validated,,{},"Association between coronary heart disease and cancers of the breast, prostate, and colon.\nCoro...","[1, 1, 0, 0, 1]","(1, 1, 0, 0, 1)"
9950242,{'text': 'Colorectal adenomatous and hyperplastic polyps: smoking and N-acetyltransferase 2 poly...,"[(population_size, 0.0), (quantitative_effect_measure, 0.0), (study_drug, 0.0), (control_group, ...",MITCH-LIN,"[quantitative_effect_measure, target_disease, control_group]",apatel,True,,ebf997bd-8028-4580-abf1-6b8f462843a8,"{'substances': ['Anti-Inflammatory Agents, Non-Steroidal', 'Carcinogens', 'Mutagens', 'Arylamine...",Validated,,{},Colorectal adenomatous and hyperplastic polyps: smoking and N-acetyltransferase 2 polymorphisms....,"[1, 1, 0, 0, 1]","(1, 1, 0, 0, 1)"


In [79]:
example_df = df[df.pmid.map(lambda x: x in example_pmids)].groupby('pmid').agg("first")
# example_df.groupby('pmid').agg("first")
example_df = example_df.drop(['inputs'], axis=1)
example_df['inputs'] = example_df['text']

            
example_records = [rb.TextClassificationRecord(**record) 
                   for record in 
                   example_df[['inputs', 'prediction','metadata','multi_label']].to_dict(orient='records')]

workspace = 'spring_2022_qc_stage_1_examples'
rb.set_workspace(workspace)

rb.log(records=example_records, name=workspace, tags={
            "task": "multilabel-text-classification",
            "family": "text-classification",
            "dataset": "qc_stage_1",})

  0%|          | 0/7 [00:00<?, ?it/s]

7 records logged to http://localhost:6900/ws/spring_2022_qc_stage_1_examples/spring_2022_qc_stage_1_examples


BulkResponse(dataset='spring_2022_qc_stage_1_examples', processed=7, failed=0)

In [None]:
rb.set_workspace('qc_exmaples')

In [43]:
all_abstracts['pmid'] = all_abstracts['metadata'].map(lambda x: x['pmid'])
all_abstracts['text'] = all_abstracts['inputs'].map(lambda x: x['text'])
all_abstracts.groupby('pmid').agg({'annotation_agent':'nunique', 'text':'nunique'})

Unnamed: 0_level_0,annotation_agent,text
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1
19306135,1,1
19336559,3,1
19336727,1,1
19337834,2,1
19338997,1,1
19340409,4,1
19342396,1,1
19352344,2,1
19331713,2,1
20075740,2,1


In [40]:
all_abstracts.pmid.value_counts()

12795046    12
16841255    10
32919225    10
17235211    10
10856067    10
            ..
27537577     2
2444306      2
17895894     2
17678735     2
19018260     2
Name: pmid, Length: 2558, dtype: int64

In [30]:
df.groupby('pmid').count()

Unnamed: 0_level_0,inputs,prediction,prediction_agent,annotation,annotation_agent,multi_label,explanation,id,metadata,status,event_timestamp,metrics,labels
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10029375,4,4,4,4,4,4,0,4,4,4,0,4,4
10069640,2,2,2,2,2,2,0,2,2,2,0,2,2
10090841,1,1,1,1,1,1,0,1,1,1,0,1,1
10096264,1,1,1,1,1,1,0,1,1,1,0,1,1
10096552,2,2,2,2,2,2,0,2,2,2,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9880123,2,2,2,2,2,2,0,2,2,2,0,2,2
9920356,1,1,1,1,1,1,0,1,1,1,0,1,1
9927099,3,3,3,3,3,3,0,3,3,3,0,3,3
9927354,1,1,1,1,1,1,0,1,1,1,0,1,1


In [6]:
df

NameError: name 'df' is not defined