# Evaluation of BERN2 Entity Linking

In [1]:
import ujson
import pandas as pd
import os
import sys
from bigbio.dataloader import BigBioConfigHelpers
from tqdm.auto import tqdm
from umls_utils import UmlsMappings
from collections import defaultdict
from itertools import combinations

pd.set_option('display.max_rows', 200)

sys.path.append('../')
from bigbio_utils import dataset_to_df

AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN")

conhelps = BigBioConfigHelpers()
tqdm.pandas()

%load_ext autoreload
%autoreload 2


## Get list of all needed PMIDs

In [2]:
all_pmids = set([])
for dataset in tqdm(['medmentions_full','medmentions_st21pv','bc5cdr','gnormplus','ncbi_disease','nlmchem']):
    data = conhelps.for_config_name(f"{dataset}_bigbio_kb").load_dataset()
    for split in data.keys():
        for doc in data[split]:
            pmid = doc["document_id"]
            all_pmids.add(pmid)


  0%|          | 0/6 [00:00<?, ?it/s]

Reusing dataset medmentions (/nethome/dkartchner3/.cache/huggingface/datasets/medmentions/medmentions_full_bigbio_kb/1.0.0/3fc6b8a3681d540ae6c7497c238636b543b90764247b5ff3642d243474000794)


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset medmentions/medmentions_st21pv_bigbio_kb to /nethome/dkartchner3/.cache/huggingface/datasets/medmentions/medmentions_st21pv_bigbio_kb/1.0.0/3fc6b8a3681d540ae6c7497c238636b543b90764247b5ff3642d243474000794...


Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/5.10M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/4 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset medmentions downloaded and prepared to /nethome/dkartchner3/.cache/huggingface/datasets/medmentions/medmentions_st21pv_bigbio_kb/1.0.0/3fc6b8a3681d540ae6c7497c238636b543b90764247b5ff3642d243474000794. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset bc5cdr (/nethome/dkartchner3/.cache/huggingface/datasets/bc5cdr/bc5cdr_bigbio_kb/1.0.0/f01f16ea9b65ead985bedadf7335195c32297c8f1b09417fc607b102a6757d6f)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset gnormplus (/nethome/dkartchner3/.cache/huggingface/datasets/gnormplus/gnormplus_bigbio_kb/1.0.0/a07d375cacec149beba70fa6c1ea3ac78628e6652dca5345f3478354e8c0ae96)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset ncbi_disease (/nethome/dkartchner3/.cache/huggingface/datasets/ncbi_disease/ncbi_disease_bigbio_kb/1.0.0/91a4f7ea79a8b89806de2cefa4fcca8cd4a7495e2cfeb9c28a8d68c51f7eac8b)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset nlmchem (/nethome/dkartchner3/.cache/huggingface/datasets/nlmchem/nlmchem_bigbio_kb/1.0.0/d91131823c66b7dd1162027991ea47c342e478209b37cf261c5f122d30409594)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Load PMIDs from NLMGene and PlantNorm

# NLM Gene
for subset in ['Train','Test']:
    with open(f'/Users/david//Downloads/Pmidlist.{subset}.txt') as f:
        pmids = set(f.read().strip().split('\n'))
        all_pmids.update(pmids)
        # print(list(pmids)[:5])


# PlantNorm
for subset in ['training','test','development']:
    with open(f'/Users/david/Downloads/DMCB_plant_{subset}_corpus.txt', 'r', encoding='utf-8', errors='ignore') as f:
        all_text = f.read()
        abstracts = all_text.strip().split('\n\n')
        abstract_lines = [x.split('\n') for x in abstracts]
        for abs in abstract_lines:
            pmid = abs[0].split('|')[0]
            # print(pmid)
            all_pmids.add(pmid)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/david//Downloads/Pmidlist.Train.txt'

In [None]:
with open('entity_linking_pmids.txt','w') as f:
    f.write('\n'.join([x for x in list(all_pmids) if x.strip() != '']))

## Load Mappings

In [14]:
hgnc_to_entrez = {
    k: v
    for k, v in pd.read_csv(
        "../data/proteins.tsv",
        delimiter="\t",
        names=["source", "target"],
    )
    .set_index("source")
    .to_dict()["target"].items()
    if k.startswith("hgnc")
}

entrez_to_hgnc = {
    val: key for key, val in hgnc_to_entrez.items() if key.startswith("hgnc")
}



In [None]:
# chebi_to_mesh = {k:v for k,v in pd.read_csv('/Users/david/Downloads/identifiers', delimiter="\t",
#         names=["source", "target"],
#     )
#     .set_index("source")
#     .to_dict()["target"].items() if k.startswith('')}

In [6]:
umls = UmlsMappings(umls_dir='/mitchell/entity-linking/2022AA/META/', debug=False, force_reprocess=False)

Loading cached UMLS data from /mitchell/entity-linking/2022AA/META/.cached_df.feather


In [7]:
umls_to_mesh = umls.get_mapping('MSH', other_prefix='MESH')
mesh_to_umls = umls.get_mapping('MSH', reverse=True, other_prefix='MESH')
print(len(umls_to_mesh))
print(len(mesh_to_umls))

# Write to file
with open('../data/umls2mesh.json', 'w') as f:
    f.write(ujson.dumps(umls_to_mesh))
    
with open('../data/mesh2umls.json', 'w') as f:
    f.write(ujson.dumps(mesh_to_umls))

455672
348733


In [15]:
umls_to_hgnc = {k:v.replace('HGNC','hgnc') for k, v in umls.get_mapping('HGNC', mapping_col='scui', ).items()}
hgnc_to_umls = {k.replace('HGNC','hgnc'):v  for k, v in umls.get_mapping('HGNC', mapping_col='scui', reverse=True).items()}
umls_to_entrez = {key: hgnc_to_entrez[val] for key, val in umls_to_hgnc.items() if val in hgnc_to_entrez}
entrez_to_umls = {v:k for k,v in umls_to_entrez.items()}
print(len(umls_to_hgnc))
print(len(hgnc_to_umls))

with open('../data/umls2entrez.json', 'w') as f:
    f.write(ujson.dumps(umls_to_entrez))
    
with open('../data/entrez2umls.json', 'w') as f:
    f.write(ujson.dumps(entrez_to_umls))


42509
42514


In [16]:
# Check lengths of mappings to make sure we go in the right direction and avoid dropping keys
print(len(set(umls_to_hgnc.keys())))
print(len(set(umls_to_hgnc.values())))
print(len(set(entrez_to_hgnc.keys())))
print(len(set(entrez_to_hgnc.values())))
print(len(set(hgnc_to_entrez.keys())))
print(len(set(hgnc_to_entrez.values())))

42509
42508
42305
42305
84109
42305


In [17]:
umls_to_ncbi = umls.get_mapping('NCBI', mapping_col='scui', other_prefix='NCBI')
ncbi_to_umls = umls.get_mapping('NCBI', reverse=True, mapping_col='scui', other_prefix='NCBI')
print(len(umls_to_ncbi))
print(len(ncbi_to_umls))

1983467
1983785


In [18]:
umls_to_omim = umls.get_mapping('OMIM', other_prefix='OMIM')
omim_to_umls = umls.get_mapping('OMIM', reverse=True, other_prefix='OMIM')
print(len(umls_to_omim))
print(len(omim_to_umls))

99542
101089


In [12]:
len(umls_to_omim)

99542

In [20]:
mapping_dicts = [umls_to_omim, umls_to_mesh, umls_to_ncbi, umls_to_entrez,]
# names = ['omim','mesh','ncbitaxon','entrez']
# print(sum([len(x)  for x in mapping_dicts]))
# for (a,b) in combinations(range(len(mapping_dicts)), 2):
#     print(names[a], names[b])
#     d1, d2 = mapping_dicts[a], mapping_dicts[b]
#     print("Overlap:", len(set(d1.keys()).intersection(set(d2.keys()))))
#     print('\n')


# omnimap = defaultdict(str, {**umls_to_omim, **umls_to_mesh, **umls_to_ncbi, **umls_to_entrez,})
# print(len(omnimap))

omnimap_with_duplicates = defaultdict(set)
for d in mapping_dicts:
    for key, val in d.items():
        omnimap_with_duplicates[key].add(val)


umls_to_mesh_omim = defaultdict(set)
for d in [umls_to_mesh, umls_to_omim]:
    for key, val in d.items():
        umls_to_mesh_omim[key].add(val)

omnimap_with_duplicates = {key:list(val) for key, val in omnimap_with_duplicates}
umls_to_mesh_omim = {key:list(val) for key, val in umls_to_mesh_omim}

with open('../data/umls2entrez_mesh_omim_ncbi.json', 'w') as f:
    f.write(ujson.dumps(omnimap_with_duplicates))

with open('../data/umls2mesh_omim.json', 'w') as f:
    f.write(ujson.dumps(umls_to_mesh_omim))

2486286


## Take special care to resolve synonym sets with Entrez
There is no mapping of proteins to corresponding genes.  This code is meant to be a partial remedy to that problem

In [None]:
umls_to_entrez = ujson.load(open('../data/umls2entrez.json', 'r'))
df = umls.umls[umls.umls.lang == 'ENG']
df['type'] = df.tui.map(lambda x: [umls.type2abbrev[y] for y in x])
df['entrez'] = df['cui'].progress_map(lambda x: umls_to_entrez[x] if x in umls_to_entrez else '')
df['has_entrez'] = df.entrez != ''
df['type_str'] = df['type'].progress_map(lambda x: ','.join(x))
subset = df.loc[df.entrez != '', ['cui','alias','type_str', 'entrez']].drop_duplicates()
agg = subset.groupby('alias').agg({'entrez':['nunique', lambda x: x]})

# Get blacklist of Entrez identifiers taht are not unique to a single gene CUI
blacklist_df = agg[agg[('entrez','nunique')] > 1]
blacklist = set([])
for x in blacklist_df[('entrez','<lambda_0>')]:
    for y in x:
        blacklist.add(y)
        
# Filter blacklisted entities
df.loc[df.entrez.isin(blacklist), 'has_entrez'] = False

# Get synonym sets that map to a single alias
grouped = df.groupby('alias')
entrez_agg = grouped.agg({'has_entrez': 'max',})
keep_aliases = set(entrez_agg[entrez_agg['has_entrez']].index.unique().tolist())

# Subset of unique ncbigene/entrez identifiers
subset = df[df.alias.isin(keep_aliases)]
subset_grouped = subset.groupby('alias')
gene_synset_agg = subset_grouped.agg({
                               'cui':lambda x: list(set(x)),
                               'group': lambda group_list: set([y for x in group_list for y in x]),
                               'type':lambda type_list: set([y for x in type_list for y in x]),
                               'entrez': lambda x: list(set([y for y in x if y != '']))[0],
                    })

synset_mapping = gene_synset_agg.groupby('entrez').agg({'cui':lambda cui_lists: list(set([y for x in cui_lists for y in x]))})

# Additional filtering of promiscuous CUIs
used_cuis = set([])
duplicate_cuis = set([])
for cui_list in synset_mapping['cui']:
    for cui in cui_list:
        if cui in used_cuis:
            duplicate_cuis.add(cui)
        else:
            used_cuis.add(cui)
            
            
# Use synonym sets to update entrez_to_umls mapping
for entrez_curie, cui_list in synset_mapping.to_dict()['cui'].items():
    for cui in cui_list:
        if cui not in duplicate_cuis:
            umls_to_entrez[cui] = entrez_curie
            
            
# Write to file
with open('../data/umls2entrez_with_synsets.json', 'w') as f:
    f.write(ujson.dumps(umls_to_entrez))

## Load BERN2 Data

In [2]:
with open('/Users/david/Downloads/abstracts-entity-linking-pmids.json', 'r') as f:
    lines = f.read().strip().split('\n')
    abstracts = [ujson.loads(line) for line in lines]

In [3]:
abstracts[0]

{'_id': '2004',
 'pmid': '2004',
 'annotations': [{'id': ['OMIM:115000', 'MESH:D001145'],
   'span': {'begin': 33, 'end': 52},
   'obj': 'disease',
   'is_neural_normalized': False,
   'mention': 'cardiac arrhythmias',
   'prob': 0.999992311000824},
  {'id': ['MESH:D010640'],
   'span': {'begin': 212, 'end': 226},
   'obj': 'drug',
   'is_neural_normalized': False,
   'mention': 'phenothiazines',
   'prob': 0.9991432428359985},
  {'id': ['MESH:D013881'],
   'span': {'begin': 310, 'end': 318},
   'obj': 'drug',
   'is_neural_normalized': False,
   'mention': 'Mellaril',
   'prob': 0.9977579712867737},
  {'id': ['MESH:D013881'],
   'span': {'begin': 320, 'end': 332},
   'obj': 'drug',
   'is_neural_normalized': False,
   'mention': 'thioridazine',
   'prob': 0.9996978044509888},
  {'id': ['MESH:D017180'],
   'span': {'begin': 379, 'end': 402},
   'obj': 'disease',
   'is_neural_normalized': False,
   'mention': 'ventricular tachycardia',
   'prob': 0.9999977946281433},
  {'id': ['MESH:D0

In [4]:
all_entities = []
all_prefixes = set([])
for doc in tqdm(abstracts):
    pmid = doc['pmid']
    if 'annotations' not in doc.keys():
        print(doc)
        continue
    for an in doc['annotations']:
        for cui_list in an['id']:
            if type(cui_list) == list:
                cui_list = cui_list[0]
            if cui_list == 'CUI-less':
                continue
            for cui in cui_list.split(','):
                prefix = cui.split(':')[0].split('_')[0]
                if prefix[0].isdigit():
                    continue
                prob = 0.0
                if 'prob' not in an.keys():
                    if 'logit' in an.keys():
                        prob = an['logit']
                    else:
                       print(an) 
                else:
                    prob = an['prob']
                all_entities.append([pmid, cui, an['mention'],  an['obj'], an['span']['begin'], an['span']['end'], prob, prefix])
                all_prefixes.add(prefix)

  0%|          | 0/8099 [00:00<?, ?it/s]

{'id': ['MESH:C498038'], 'span': {'begin': 509, 'end': 518}, 'obj': 'drug', 'is_neural_normalized': True, 'mention': 'Val509Ala'}
{'id': ['MESH:C515730'], 'span': {'begin': 533, 'end': 542}, 'obj': 'drug', 'is_neural_normalized': True, 'mention': 'Cys672Tyr'}
{'error_code': 1, 'error_message': 'Something went wrong. Try again.', 'pmid': '28054712', '_id': '28054712'}
{'id': ['CHEBI:74801'], 'span': {'begin': 1022, 'end': 1029}, 'obj': 'drug', 'is_neural_normalized': True, 'mention': 'Ser5Ala'}


In [5]:
bern2_annotations = pd.DataFrame(all_entities, columns=['pmid','cui','text', 'type','start','end', 'prob', 'prefix'])

bern2_filtered = bern2_annotations[bern2_annotations.prefix.isin(['EntrezGene','MESH','NCBI','OMIM'])]
bern2_filtered.shape

(132719, 8)

In [6]:
bern2_filtered[['pmid','start','end']].drop_duplicates()

Unnamed: 0,pmid,start,end
0,2004,33,52
2,2004,212,226
3,2004,310,318
4,2004,320,332
5,2004,379,402
...,...,...,...
143715,30991764,1349,1361
143716,30991764,1376,1380
143717,30991764,1392,1401
143718,30991764,1500,1508


In [7]:
bern2_filtered['cui'] = bern2_filtered.cui.map(lambda x: x.replace('txid','').replace('EntrezGene','ncbigene'))
bern2_filtered[bern2_filtered.prefix == 'EntrezGene']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bern2_filtered['cui'] = bern2_filtered.cui.map(lambda x: x.replace('txid','').replace('EntrezGene','ncbigene'))


Unnamed: 0,pmid,cui,text,type,start,end,prob,prefix
38,23402,ncbigene:2539,G6PD,gene,814,818,0.950792,EntrezGene
42,23402,ncbigene:2539,G6PD,gene,1228,1232,0.779835,EntrezGene
75,28952,ncbigene:5972,renin,gene,1158,1163,0.991751,EntrezGene
78,28952,ncbigene:5972,renin,gene,1212,1217,0.987656,EntrezGene
82,28952,ncbigene:5972,renin,gene,1413,1418,0.994729,EntrezGene
...,...,...,...,...,...,...,...,...
143709,30991764,ncbigene:3569,IL-6,gene,1171,1175,0.933437,EntrezGene
143710,30991764,ncbigene:7124,TNF-,gene,1181,1185,0.963428,EntrezGene
143712,30991764,ncbigene:834,caspase-1,gene,1235,1244,0.985916,EntrezGene
143716,30991764,ncbigene:85480,TSLP,gene,1376,1380,0.984251,EntrezGene


In [8]:
all_prefixes

{'CHEBI', 'CL', 'CVCL', 'EntrezGene', 'MESH', 'NCBI', 'OMIM'}

## Evaluate BERN2 on MedMentions_Full

### Load Medmentions and convert to dataframe

In [154]:
mm_df = dataset_to_df(conhelps.for_config_name(f"medmentions_full_bigbio_kb").load_dataset())
mm_df['cui'] = mm_df['db_id'].map(omnimap)
mm_filtered = mm_df[mm_df.cui != '']
mm_filtered.shape

Reusing dataset med_mentions_dataset (/Users/david/.cache/huggingface/datasets/med_mentions_dataset/medmentions_full_bigbio_kb/1.0.0/b5c8691186d4701f9b18eddbe36d178ccf7e55761dcc6140c57f4410754511ac)


  0%|          | 0/3 [00:00<?, ?it/s]

(198282, 10)

In [155]:
mm_filtered.head(20)

Unnamed: 0,pmid,mention_id,text,type,db_name,db_id,split,start,end,cui
0,25763772,1,DCTN4,T116,UMLS,C4308010,train,0,5,MESH:C000606363
1,25763772,2,DCTN4,T123,UMLS,C4308010,train,0,5,MESH:C000606363
2,25763772,3,chronic Pseudomonas aeruginosa infection,T047,UMLS,C0854135,train,23,63,MESH:D011552
3,25763772,4,cystic fibrosis,T047,UMLS,C0010674,train,67,82,MESH:D003550
4,25763772,5,Pseudomonas aeruginosa (Pa) infection,T047,UMLS,C0854135,train,83,120,MESH:D011552
5,25763772,6,cystic fibrosis,T047,UMLS,C0010674,train,124,139,MESH:D003550
6,25763772,7,CF,T047,UMLS,C0010674,train,141,143,MESH:D003550
7,25763772,8,patients,T101,UMLS,C0030705,train,145,153,MESH:D010361
9,25763772,10,pulmonary disease,T047,UMLS,C0024115,train,189,206,MESH:D008171
10,25763772,11,shorter survival,T169,UMLS,C0220921,train,211,227,MESH:Q000401


In [156]:
mm_dict = defaultdict(str, mm_filtered.set_index(['pmid','start','end'])['cui'].to_dict())

### Compute overlap between BERN2 and MedMentions

In [11]:
# Helper Functions
# def iou(span1, span2):
def iou(b1, e1, b2, e2):
    '''
    Calculate intersection-over-union of two spans
    '''
    # b1 = span1[0]
    # e1 = span1[1]
    # b2 = span2[0]
    # e2 = span2[1]
    overlap = (min(e1, e2) - max(b1, b2))
    if overlap <= 0:
        return 0
    else:
        union = (max(e1, e2) - min(b1, b2)) 
        return overlap/union



In [5]:
def evaluate_bern2_bigbio(
    bern2_df,
    bigbio_df,
    bigbio_suffix,
    entity_types=["gene", "disease", "chemical", "drug", "species"],
    subsets=['train','test','dev','validation','training','valid'],
):
    bigbio_df = bigbio_df[bigbio_df['split'].isin(subsets)]

    # Merge datasets to find overlap
    merged = pd.merge(
        bern2_df[["pmid", "cui", "type", "start", "end", "prob"]],
        bigbio_df,
        how="outer",
        on="pmid",
        suffixes=["_bern2", f"_{bigbio_suffix}"],
    ).dropna()

    merged["iou"] = merged[
        ["start_bern2", "end_bern2", f"start_{bigbio_suffix}", f"end_{bigbio_suffix}"]
    ].progress_apply(lambda x: iou(*x), axis=1)

    iou_cutoff = 0.5
    confidence_cutoff = 0.99

    # Compute matches between BERN2 CUI and MedMentions CUI
    merged["match"] = merged["cui_bern2"] == merged[f"cui_{bigbio_suffix}"]
    print(
        "Initial candidate spans:",
        merged.query("iou > 0")[["pmid", "start_bern2", "end_bern2"]]
        .drop_duplicates()
        .shape[0],
    )
    filtered = merged.query("iou >= @iou_cutoff & prob > @confidence_cutoff")
    idx = (
        filtered.groupby(["pmid", f"start_{bigbio_suffix}", f"end_{bigbio_suffix}"])[
            "iou"
        ].transform(max)
        == filtered["iou"]
    )
    filtered = filtered[idx]

    # filtered['is_ncbigene']
    display(filtered.head())

    matches = filtered.groupby(
        ["pmid", f"start_{bigbio_suffix}", f"end_{bigbio_suffix}", ]
    ).agg(
        {
            "type_bern2": "first",
            f"type_{bigbio_suffix}": lambda x: x,
            "match": "max",
            'start_bern2':'first',
            'end_bern2':'first',
            'iou':'first', 
            'text':'first',
            'cui_bern2':lambda x: x,
            f'cui_{bigbio_suffix}': lambda x: x,

            # "is_ncbigene": "max",
        }
    )
    print("Filtered candidate spans:", matches.shape[0])
    print("Linking accuracy on matched spans (lower bound):", matches.mean()["match"])
    print(matches.groupby("type_bern2")['match'].mean())
    print("Filter to mentions of genes specifically")
    # print(matches[matches.is_ncbigene].groupby('type_bern2')['match'].mean())
    # print((matches.type_bern2 == 'gene').sum())
    # print(matches.is_ncbigene.sum())
    return matches



In [181]:

evaluate_bern2_bigbio(bern2_filtered, mm_filtered, "mm")


  0%|          | 0/3230696 [00:00<?, ?it/s]

Initial candidate spans: 29870


Unnamed: 0,pmid,cui_bern2,type_bern2,start_bern2,end_bern2,prob,mention_id,text,type_mm,db_name,db_id,split,start_mm,end_mm,cui_mm,iou,match
64023,25763772,ncbigene:51164,gene,0.0,5.0,0.991746,1,DCTN4,T116,UMLS,C4308010,train,0.0,5.0,MESH:C000606363,1.0,False
64024,25763772,ncbigene:51164,gene,0.0,5.0,0.991746,2,DCTN4,T123,UMLS,C4308010,train,0.0,5.0,MESH:C000606363,1.0,False
64302,25763772,OMIM:219700,disease,67.0,82.0,0.999996,4,cystic fibrosis,T047,UMLS,C0010674,train,67.0,82.0,MESH:D003550,1.0,False
64394,25763772,MESH:D003550,disease,67.0,82.0,0.999996,4,cystic fibrosis,T047,UMLS,C0010674,train,67.0,82.0,MESH:D003550,1.0,True
65044,25763772,MESH:D008171,disease,211.0,228.0,0.99704,11,shorter survival,T169,UMLS,C0220921,train,211.0,227.0,MESH:Q000401,0.941176,False


Filtered candidate spans: 9386
Linking accuracy on matched spans (lower bound): 0.5397400383549968
               match
type_bern2          
disease     0.577894
drug        0.664594
gene        0.167602
species     0.252918
Filter to mentions of genes specifically


  print("Linking accuracy on matched spans (lower bound):", matches.mean()["match"])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,type_bern2,type_mm,match,text
pmid,start_mm,end_mm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25763772,0.0,5.0,gene,"[T116, T123]",False,DCTN4
25763772,67.0,82.0,disease,"[T047, T047]",True,cystic fibrosis
25763772,211.0,227.0,disease,T169,False,shorter survival
25763772,1251.0,1259.0,species,T101,False,patients
26867927,331.0,339.0,species,T101,False,patients
...,...,...,...,...,...,...
28549178,1390.0,1407.0,disease,"[T047, T047]",False,dystrophic muscle
28550154,16.0,20.0,drug,"[T121, T130, T197]",True,H2O2
28550165,1267.0,1272.0,gene,"[T116, T123]",False,CLIP2
28550165,1341.0,1347.0,gene,"[T116, T123]",False,CLASP2


## Evaluate BERN2 on GNormPlus

In [185]:
gnorm_df = dataset_to_df(conhelps.for_config_name(f"gnormplus_bigbio_kb").load_dataset())
print(gnorm_df[['pmid','start','end']].drop_duplicates().shape[0])
gnorm_df['cui'] = 'ncbigene:' + gnorm_df.db_id.astype(str)
output = evaluate_bern2_bigbio(bern2_filtered, gnorm_df, "gnorm",)

Reusing dataset gnormplus_dataset (/Users/david/.cache/huggingface/datasets/gnormplus_dataset/gnormplus_bigbio_kb/1.0.0/1bb16f1b4abf9394b9180cac70edc575cc5e7d32c697f8b9f69ba2f643d2fc95)


  0%|          | 0/2 [00:00<?, ?it/s]

6252


  0%|          | 0/122409 [00:00<?, ?it/s]

Initial candidate spans: 4463


Unnamed: 0,pmid,cui_bern2,type_bern2,start_bern2,end_bern2,prob,mention_id,text,type_gnorm,db_name,db_id,split,start_gnorm,end_gnorm,cui_gnorm,iou,match
1624,1281549,ncbigene:2263,gene,209,244,0.997139,1353,keratinocyte growth factor receptor,Gene,NCBI,2263,train,209.0,244.0,ncbigene:2263,1.0,True
1631,1281549,ncbigene:2263,gene,250,285,0.990651,1355,fibroblast growth factor receptor 2,Gene,NCBI,2263,train,250.0,285.0,ncbigene:2263,1.0,True
1678,1281549,ncbigene:1845,gene,1063,1066,0.995078,1362,VHR,Gene,NCBI,1845,train,1063.0,1066.0,ncbigene:1845,1.0,True
2155,1317062,ncbigene:196,gene,116,127,0.996888,1368,Ah receptor,Gene,NCBI,196,train,116.0,127.0,ncbigene:196,1.0,True
2168,1317062,ncbigene:196,gene,133,153,0.99658,1369,Ah (dioxin) receptor,Gene,NCBI,196,train,133.0,153.0,ncbigene:196,1.0,True


Filtered candidate spans: 2204
Linking accuracy on matched spans (lower bound): 0.911524500907441
               match  start_bern2   end_bern2       iou
type_bern2                                             
disease     0.000000   334.390244  352.585366  0.882529
drug        0.000000   753.000000  763.800000  0.692408
gene        0.935289   718.449255  726.489292  0.993177
Filter to mentions of genes specifically


  print("Linking accuracy on matched spans (lower bound):", matches.mean()["match"])


## Evaluate BERN2 on BC5CDR

In [15]:
bc5cdr_df = dataset_to_df(conhelps.for_config_name(f"bc5cdr_bigbio_kb").load_dataset())
print(bc5cdr_df[['pmid','start','end']].drop_duplicates().shape[0])
bc5cdr_df['cui'] = bc5cdr_df['db_name'] +':' + bc5cdr_df['db_id']
# bc5cdr_df['cui'] = 'ncbigene:' + bc5cdr_df.db_id.astype(str)
bc5cdr_output = evaluate_bern2_bigbio(bern2_filtered, bc5cdr_df, "bc5cdr", subsets=['test'])

Reusing dataset bc5cdr_dataset (/Users/david/.cache/huggingface/datasets/bc5cdr_dataset/bc5cdr_bigbio_kb/1.0.0/f01f16ea9b65ead985bedadf7335195c32297c8f1b09417fc607b102a6757d6f)


  0%|          | 0/3 [00:00<?, ?it/s]

28888


  0%|          | 0/228288 [00:00<?, ?it/s]

Initial candidate spans: 6855


Unnamed: 0,pmid,cui_bern2,type_bern2,start_bern2,end_bern2,prob,mention_id,text,type_bc5cdr,db_name,db_id,split,start_bc5cdr,end_bc5cdr,cui_bc5cdr,iou,match
138,35781,MESH:D003000,drug,126,135,0.999151,6840,clonidine,Chemical,MESH,D003000,test,126.0,135.0,MESH:D003000,1.0,True
172,35781,MESH:D009278,drug,137,148,0.999263,6841,naphazoline,Chemical,MESH,D009278,test,137.0,148.0,MESH:D009278,1.0,True
206,35781,MESH:C009695,drug,153,167,0.999446,6842,xylometazoline,Chemical,MESH,C009695,test,153.0,167.0,MESH:C009695,1.0,True
241,35781,MESH:D003061,drug,202,209,0.998451,6844,codeine,Chemical,MESH,D003061,test,202.0,209.0,MESH:D003061,1.0,True
275,35781,MESH:D005283,drug,211,219,0.997851,6845,fentanyl,Chemical,MESH,D005283,test,211.0,219.0,MESH:D005283,1.0,True


Filtered candidate spans: 5028
Linking accuracy on matched spans (lower bound): 0.854813046937152
type_bern2
disease    0.798285
drug       0.906127
gene       0.000000
species    0.000000
Name: match, dtype: float64
Filter to mentions of genes specifically


  print("Linking accuracy on matched spans (lower bound):", matches.mean()["match"])


In [16]:
bc5cdr_output[~bc5cdr_output.match]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,type_bern2,type_bc5cdr,match,start_bern2,end_bern2,iou,text,cui_bern2,cui_bc5cdr
pmid,start_bc5cdr,end_bc5cdr,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10087562,19.0,42.0,disease,Disease,False,0,42,0.547619,ventricular tachycardia,MESH:D016171,MESH:D017180
10087562,392.0,415.0,disease,Disease,False,373,415,0.547619,ventricular tachycardia,MESH:D016171,MESH:D017180
10087562,525.0,548.0,disease,Disease,False,506,548,0.547619,ventricular tachycardia,MESH:D016171,MESH:D017180
10087562,782.0,792.0,drug,Chemical,False,782,792,1.000000,Dubutamine,MESH:C576824,MESH:D004280
10219427,518.0,525.0,disease,Disease,False,518,525,1.000000,dyspnea,MESH:D001049,MESH:D004417
...,...,...,...,...,...,...,...,...,...,...,...
946593,1167.0,1198.0,disease,Disease,False,1167,1198,1.000000,central nervous system leukemia,MESH:D016543,MESH:D002493
9578276,809.0,830.0,disease,Disease,False,810,831,0.909091,hyper- or hypotension,MESH:D007022,MESH:D006973
9646784,46.0,61.0,disease,Disease,False,34,61,0.555556,thromboembolism,MESH:D019320,MESH:D013923
9646784,463.0,481.0,disease,Disease,False,463,481,1.000000,allergic reactions,MESH:D006967,MESH:D004342


## Evaluate on NCBI-Disease

In [6]:
ncbi_disease_df = dataset_to_df(conhelps.for_config_name(f"ncbi_disease_bigbio_kb").load_dataset())
ncbi_disease_df['db_name'] = ncbi_disease_df.db_name.map(lambda x: x.upper())
print(ncbi_disease_df[['pmid','start','end']].drop_duplicates().shape[0])
ncbi_disease_df['cui'] = ncbi_disease_df['db_name'] +':' + ncbi_disease_df['db_id']
# ncbi_disease_df['cui'] = 'ncbigene:' + ncbi_disease_df.db_id.astype(str)
output = evaluate_bern2_bigbio(bern2_filtered, ncbi_disease_df, "ncbi_disease")

Reusing dataset ncbi_disease_dataset (/Users/david/.cache/huggingface/datasets/ncbi_disease_dataset/ncbi_disease_bigbio_kb/1.0.0/10a393201e55b403e5d107701b719368f54f1bf3d3438a1233f99be0badeb034)


  0%|          | 0/3 [00:00<?, ?it/s]

6881


NameError: name 'bern2_filtered' is not defined

In [7]:
ncbi_disease_df[ncbi_disease_df.cui.map(lambda x: '|' in x and ':' in x)]

Unnamed: 0,pmid,mention_id,text,type,db_name,db_id,split,start,end,cui
5,10192393,10192393_D003110|D009369_5,colon and some other cancers,CompositeMention,MESH,D003110|D009369,train,670,698,MESH:D003110|D009369
33,10196379,10196379_D001943|D010051_6,breast or ovarian cancer,CompositeMention,MESH,D001943|D010051,train,796,820,MESH:D001943|D010051
77,10090885,10090885_D008232|D007154_3,disorder of lymphocyte homeostasis and immunol...,CompositeMention,MESH,D008232|D007154,train,145,207,MESH:D008232|D007154
218,8531967,8531967_D001943|D010051_1,breast and ovarian cancer,CompositeMention,MESH,D001943|D010051,train,165,190,MESH:D001943|D010051
223,8531967,8531967_D001943|D010051_6,breast or ovarian cancer,CompositeMention,MESH,D001943|D010051,train,1143,1167,MESH:D001943|D010051
225,8531967,8531967_D001943|D010051_8,breast or ovarian cancer,CompositeMention,MESH,D001943|D010051,train,1503,1527,MESH:D001943|D010051
559,1682919,1682919_D001943|D018307_4,breast and squamous cell neoplasms,CompositeMention,MESH,D001943|D018307,train,742,776,MESH:D001943|D018307
657,10417286,10417286_D002971|D002972_5,cleft lip/palate,SpecificDisease,MESH,D002971|D002972,train,278,294,MESH:D002971|D002972
658,10417286,10417286_D002971|D002972_6,CL/P,SpecificDisease,MESH,D002971|D002972,train,296,300,MESH:D002971|D002972
665,10417286,10417286_D002971|D002972_13,CL/P,SpecificDisease,MESH,D002971|D002972,train,454,458,MESH:D002971|D002972


# Elucidate error in NCBI-Disease Dataset
