# SciSpacy linking evaluation across datasets

In [1]:
import spacy
import numpy as np 
import pandas as pd
import ujson
import sys
import time
import pickle
import itertools

from tqdm import tqdm
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker, CandidateGenerator
from scispacy.umls_utils import UmlsKnowledgeBase
from bigbio.dataloader import BigBioConfigHelpers
from typing import List, Optional
from collections import defaultdict
from matplotlib import pyplot as plt

sys.path.append('..')
from bigbio_utils import dataset_to_df, DATASET_NAMES, CUIS_TO_EXCLUDE, CUIS_TO_REMAP, resolve_abbreviation, dataset_to_documents


%load_ext autoreload 
%autoreload 2

# spacy.prefer_gpu()
# spacy.prefer_cpu()

conhelps = BigBioConfigHelpers()

def get_candidates(df, cg, k=50, db='UMLS'):
    '''
        df: pandas.dataframe produced by dataset_to_df()
        cg: SciSpacy candidate generator 
        k: int, number of candidates to generate
    '''
    # Get candidates from SciSpacy
    print("Generating candidates")
    if 'mention_text' not in df.columns:
        df['mention_text'] = df['text']
    candidates = cg(df['mention_text'].tolist(), k=k)

    print("Processing candidates")
    processed_candidates = [[(cand.concept_id, round(max(cand.similarities)), 5) for cand in sorted(cand_list, key=lambda x: round(max(x.similarities), 5), reverse=True)] for cand_list in candidates]
    grouped_candidates = []
    for cand_list in processed_candidates:
        grouped_sublist = []
        for k, g in itertools.groupby(cand_list, key=lambda x: x[1]):
            grouped_sublist.append(list(g))
        grouped_candidates.append(grouped_sublist)
        
    candidate_cuis = [[[f"{db}:{x[0]}" for x in group] for group in cand_list] for cand_list in grouped_candidates]
    candidate_scores = [[x[1] for x in cand_list] for cand_list in processed_candidates]
    df['candidates'] = candidate_cuis
    df['scores'] = candidate_scores


def min_hit_index(gold_cuis, candidates):
    '''
    Find index of first hit in candidates
    '''
    for i, c in enumerate(candidates):
        if c in gold_cuis:
            return i

    return 1000000

def recall_at_k(df, max_k: int=10, filter_null=False):
    '''
    Compute recall@k for all values of k < max_k
    '''

    # Filter rows wilt null values after CUI remapping
    if filter_null:
        before_row_count = df.shape[0]
        df = df[df.db_ids.map(lambda x: len(x) > 0)]
        after_row_count = df.shape[0]
        print(f"Dropped {before_row_count - after_row_count} rows with null db_ids")


    df['min_hit_index'] = df[['db_ids','candidates']].apply(lambda x: min_hit_index(x[0], x[1]), axis=1)

    recall_at_k_dict = {}
    for k in range(1, max_k + 1):
        recall_at_k_dict[k] = (df.min_hit_index < k).mean()

    return recall_at_k_dict


def plot_recall_at_k(recall_dict, max_k=10, legend_key=None):
    
    plt.plot(recall_dict.keys(), recall_dict.values(), marker='o', label=legend_key)
    if legend_key is not None:
        plt.legend()



Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [2]:
kb = UmlsKnowledgeBase()

In [3]:
umls_cg = CandidateGenerator(name='umls', verbose=True)
mesh_cg = CandidateGenerator(name='mesh', verbose=True)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# umls2entrez = ujson.load(open('../data/umls2entrez.json'))
umls2mesh_omim = ujson.load(open('../data/umls_2017/umls2mesh_omim.json'))
entrez2umls = ujson.load(open('../data/entrez2umls.json'))

updated_entrez2umls = defaultdict(set)
for key, val in entrez2umls.items():
    updated_entrez2umls[key.replace('ncbigene', 'NCBIGene')] = set([f'UMLS:{val}'])
    

mesh_omim2umls = defaultdict(set)
for key, val in umls2mesh_omim.items():
     for v in val:
        mesh_omim2umls[v].add(f'UMLS:{key}')


mapping_dicts = {"bc5cdr": None,
    "medmentions_full": None,
    "medmentions_st21pv": None,
    "gnormplus": updated_entrez2umls,
    "nlmchem": None,
    "nlm_gene": updated_entrez2umls,
    "ncbi_disease": mesh_omim2umls,}

dataset2cg = {
    "bc5cdr": mesh_cg,
    "medmentions_full": umls_cg,
    "medmentions_st21pv": umls_cg,
    "gnormplus": umls_cg,
    "nlmchem": mesh_cg,
    "nlm_gene": umls_cg,
    "ncbi_disease": umls_cg,
}

dataset2db = {
    "bc5cdr": 'MESH',
    "medmentions_full": 'UMLS',
    "medmentions_st21pv": 'UMLS',
    "gnormplus": 'UMLS',
    "nlmchem": 'MESH',
    "nlm_gene": 'UMLS',
    "ncbi_disease": 'UMLS',
} 

abbreviation_dict = ujson.load(open('../data/abbreviations.json'))

In [5]:
# name = DATASET_NAMES[-1]
results = {}
for name in DATASET_NAMES:
# for name in ['nlm_gene']:
    results[name] = []
    print(name)

    # Load dataset and deduplicate + remove deprecated identifiers
    dataset = conhelps.for_config_name(f'{name}_bigbio_kb').load_dataset(from_hub=False)
    exclude = CUIS_TO_EXCLUDE[name]
    remap = CUIS_TO_REMAP[name]
    df = dataset_to_df(dataset, cuis_to_exclude=exclude, entity_remapping_dict=remap)
    df = df.query("split == 'test'")
    display(df.head())

    # Get candidate generator and 
    linked_db = dataset2db[name]
    cg = dataset2cg[name]
    cui_map = mapping_dicts[name]

     # Remap identifiers if necessary (since SciSpacy doesn't link to external ids)
    if cui_map is not None:
        df['remapped_db_ids'] = df.db_ids.map(lambda x: list(set().union(*[cui_map[cui] for cui in x])))


    for resolve_abbrevs in [False, True]:
        # Resolve abbreviations
        if resolve_abbrevs:
            df["mention_text"] = df[["document_id", "text"]].apply(
                        lambda x: resolve_abbreviation(x[0], x[1], abbreviation_dict), axis=1
                    )
        else: 
            df['mention_text'] = df['text']

        start_time = time.time()
        # compute linking candidates
        get_candidates(df=df, cg=cg, db=linked_db)

        elapsed = time.time() - start_time
        print("Total time to compute candidates:", elapsed)

        # # Visualize results
        # recall_dict = recall_at_k(df)
        # plt.title(name)
        # plt.ylim(.5,1)
        # plot_recall_at_k(recall_dict, legend_key=f'no_null_filter, abbrev_res={resolve_abbrevs}')
        # if resolve_abbrevs:
        #     recall_dict['resolve_abbrevs'] = True

        # else:
        #     recall_dict['resolve_abbrevs'] = False

        # recall_dict['time'] = elapsed
        # results[name].append(recall_dict)
        # print(recall_dict)

        # nonull_recall_dict = recall_at_k(df, filter_null=True)
        # plot_recall_at_k(nonull_recall_dict, legend_key=f"after_null_filtering, abbrev_res={resolve_abbrevs}")

        # Remap column names
        if resolve_abbrevs:
            df = df.rename({'candidates':'scispacy_resolve_abbrev'}, axis=1)
        else:
            df = df.rename({'candidates':'scispacy'}, axis=1)

    # Save file
    # df.reset_index().to_feather(f'../results/scispacy/{name}_scispacy.feather')
    # display(df.head())

       

    # plt.show()

# with open('../results/scispacy_overall_results.json', 'w') as f:
#     f.write(ujson.dumps(results, indent=2))



Found cached dataset bc5cdr (/nethome/dkartchner3/.cache/huggingface/datasets/bc5cdr/bc5cdr_bigbio_kb/1.0.0/68f03988d9e501c974d9f9987183bf06474858d1318ed0d4e51cfc4584f0f51f)


bc5cdr


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id
55,10087562,"[[0, 18]]",Torsade de pointes,[Disease],[MESH:D016171],test,10087562.1
58,10087562,"[[19, 42]]",ventricular tachycardia,[Disease],[MESH:D017180],test,10087562.2
70,10087562,"[[72, 82]]",dobutamine,[Chemical],[MESH:D004280],test,10087562.3
56,10087562,"[[111, 133]]",dilated cardiomyopathy,[Disease],[MESH:D002311],test,10087562.4
57,10087562,"[[138, 162]]",congestive heart failure,[Disease],[MESH:D006333],test,10087562.5


Generating candidates
Generating candidates for 9750 mentions
Number of empty vectors: 82


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 1.991395 seconds
Processing candidates
Total time to compute candidates: 3.866079092025757
Generating candidates
Generating candidates for 9750 mentions
Number of empty vectors: 18


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 0.97954 seconds
Processing candidates


Found cached dataset medmentions (/nethome/dkartchner3/.cache/huggingface/datasets/medmentions/medmentions_full_bigbio_kb/1.0.0/4ed5b6a69d807969022e559198c5a7386b9a978268a558758a090db6b451d6c4)


Total time to compute candidates: 2.8138649463653564
medmentions_full


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id
98,25847295,"[[0, 24]]",Nonylphenol diethoxylate,[T131],[UMLS:C1254354],test,25847295.1
134,25847295,"[[25, 33]]",inhibits,[T052],[UMLS:C3463820],test,25847295.2
141,25847295,"[[34, 43]]",apoptosis,[T043],[UMLS:C0162638],test,25847295.3
147,25847295,"[[44, 51]]",induced,[T169],[UMLS:C0205263],test,25847295.4
154,25847295,"[[55, 65]]",PC12 cells,[T025],[UMLS:C0085262],test,25847295.5


Generating candidates
Generating candidates for 70359 mentions
Number of empty vectors: 23


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 18.311251 seconds
Processing candidates
Total time to compute candidates: 52.31352758407593
Generating candidates
Generating candidates for 70359 mentions
Number of empty vectors: 22


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 8.532194 seconds
Processing candidates


Found cached dataset medmentions (/nethome/dkartchner3/.cache/huggingface/datasets/medmentions/medmentions_st21pv_bigbio_kb/1.0.0/4ed5b6a69d807969022e559198c5a7386b9a978268a558758a090db6b451d6c4)


Total time to compute candidates: 42.32234072685242
medmentions_st21pv


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id
82,25847295,"[[34, 43]]",apoptosis,[T038],[UMLS:C0162638],test,25847295.1
87,25847295,"[[55, 65]]",PC12 cells,[T017],[UMLS:C0085262],test,25847295.2
73,25847295,"[[137, 144]]",present,[T033],[UMLS:C0150312],test,25847295.3
78,25847295,"[[206, 219]]",toxic effects,[T037],[UMLS:C0600688],test,25847295.4
79,25847295,"[[259, 268]]",Apoptosis,[T038],[UMLS:C0162638],test,25847295.5


Generating candidates
Generating candidates for 40143 mentions
Number of empty vectors: 19


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 4.663302 seconds
Processing candidates
Total time to compute candidates: 28.869887828826904
Generating candidates
Generating candidates for 40143 mentions
Number of empty vectors: 18


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 4.13249 seconds
Processing candidates


Found cached dataset gnormplus (/nethome/dkartchner3/.cache/huggingface/datasets/gnormplus/gnormplus_bigbio_kb/1.0.0/97a2714b58185305591c949b067cea2febfca2447016096c3d08021d84bf7b69)


Total time to compute candidates: 14.749157905578613
gnormplus


  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id
0,10022127,"[[0, 9]]",TIF1gamma,[Gene],[NCBIGene:51592],test,10022127.1
7,10022127,"[[211, 220]]",TIF1gamma,[Gene],[NCBIGene:51592],test,10022127.2
8,10022127,"[[233, 242]]",TIF1alpha,[Gene],[NCBIGene:8805],test,10022127.3
9,10022127,"[[247, 255]]",TIF1beta,[Gene],[NCBIGene:10155],test,10022127.4
10,10022127,"[[274, 282]]",TIF1beta,[Gene],[NCBIGene:10155],test,10022127.5


Generating candidates
Generating candidates for 3222 mentions
Number of empty vectors: 0


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 0.350383 seconds
Processing candidates
Total time to compute candidates: 1.118152141571045
Generating candidates
Generating candidates for 3222 mentions
Number of empty vectors: 0


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 0.358304 seconds
Processing candidates


Found cached dataset nlmchem (/nethome/dkartchner3/.cache/huggingface/datasets/nlmchem/nlmchem_bigbio_kb/1.0.0/66bcefa38a4fe5d4ba1a0993a516040bad028699fbe3ef935f95532596668131)


Total time to compute candidates: 1.193880558013916
nlmchem


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id
190,1325037,"[[6667, 6676]]",67Gallium,[Chemical],[MESH:D005710],test,1325037.1
191,1325037,"[[7086, 7097]]",CHOP regime,[Chemical],"[MESH:D011239, MESH:D003520, MESH:D014750, MES...",test,1325037.2
184,1325037,"[[11306, 11322]]",cyclophosphamide,[Chemical],[MESH:D003520],test,1325037.3
185,1325037,"[[11324, 11336]]",daunorubicin,[Chemical],[MESH:D003630],test,1325037.4
186,1325037,"[[11338, 11348]]",vincrstine,[Chemical],[MESH:D014750],test,1325037.5


Generating candidates
Generating candidates for 11624 mentions
Number of empty vectors: 326


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 0.79052 seconds
Processing candidates
Total time to compute candidates: 2.840824842453003
Generating candidates
Generating candidates for 11624 mentions
Number of empty vectors: 307


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 15.147075 seconds
Processing candidates


Found cached dataset nlm_gene (/nethome/dkartchner3/.cache/huggingface/datasets/nlm_gene/nlm_gene_bigbio_kb/1.0.0/92249f0c0c401d2d902c7a0af6d76ca2e535383a56c0c4d4099036c7c0dc9581)


Total time to compute candidates: 17.53421664237976
nlm_gene


  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id
69,12486105,"[[70, 77]]",DEC-205,[GENERIF],[NCBIGene:17076],test,12486105.1
46,12486105,"[[131, 171]]",major histocompatibility complex class I,[STARGENE],[NCBIGene:14972],test,12486105.2
58,12486105,"[[196, 199]]",CD8,[Gene],[NCBIGene:12526],test,12486105.3
59,12486105,"[[319, 365]]",major histocompatibility complex (MHC) class I,[STARGENE],[NCBIGene:14972],test,12486105.4
60,12486105,"[[397, 404]]",DEC-205,[GENERIF],[NCBIGene:17076],test,12486105.5


Generating candidates
Generating candidates for 2729 mentions
Number of empty vectors: 5


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 0.27962 seconds
Processing candidates
Total time to compute candidates: 0.9559614658355713
Generating candidates
Generating candidates for 2729 mentions
Number of empty vectors: 5


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Finding neighbors took 0.339645 seconds
Processing candidates


Found cached dataset ncbi_disease (/nethome/dkartchner3/.cache/huggingface/datasets/ncbi_disease/ncbi_disease_bigbio_kb/1.0.0/e6b217666a5647d5abc614785b2caad62f1d72a94d1631b86c0f615b75dcc865)


Total time to compute candidates: 1.2165265083312988
ncbi_disease


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id
5863,9288106,"[[40, 61]]",ataxia-telangiectasia,[Modifier],[MESH:D001260],test,9288106.1
5872,9288106,"[[72, 97]]",sporadic T-cell leukaemia,[SpecificDisease],[MESH:D015458],test,9288106.2
5873,9288106,"[[99, 120]]",Ataxia-telangiectasia,[SpecificDisease],[MESH:D001260],test,9288106.3
5849,9288106,"[[122, 125]]",A-T,[SpecificDisease],[MESH:D001260],test,9288106.4
5852,9288106,"[[132, 163]]",recessive multi-system disorder,[DiseaseClass],[MESH:D030342],test,9288106.5


Generating candidates
Generating candidates for 960 mentions
Number of empty vectors: 0
Finding neighbors took 0.135327 seconds


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


Processing candidates
Total time to compute candidates: 0.5287775993347168
Generating candidates
Generating candidates for 960 mentions
Number of empty vectors: 0
Finding neighbors took 0.0938 seconds
Processing candidates
Total time to compute candidates: 0.2853991985321045


  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


In [9]:
data = [("apple", 5), ("banana", 2), ("cherry", 5), ("date", 3), ("elderberry", 1), ("fig", 5)]

# sort the list of tuples by the second element of each tuple
sorted_data = sorted(data, key=lambda x: x[1])

# group the sorted data by the same second element value
result = []
for k, g in itertools.groupby(sorted_data, key=lambda x: x[1]):
    result.append(list(g))

print(result)

[[('elderberry', 1)], [('banana', 2)], [('date', 3)], [('apple', 5), ('cherry', 5), ('fig', 5)]]


In [13]:
df

Unnamed: 0,document_id,offsets,text,type,db_ids,split,first_offset,last_offset,mention_id,mention_text,scispacy,scores,min_hit_index,scispacy_resolve_abbrev
0,10022127,"[[0, 9]]",TIF1gamma,[Gene],[],test,0,9,10022127.1,TIF1gamma,"[UMLS:C1424158, UMLS:C3273663, UMLS:C1504843, ...","[1.0, 1.0, 0.8456882238388062, 0.8403988480567...",1000000,"[UMLS:C1424158, UMLS:C3273663, UMLS:C1504843, ..."
7,10022127,"[[211, 220]]",TIF1gamma,[Gene],[],test,211,220,10022127.2,TIF1gamma,"[UMLS:C1424158, UMLS:C3273663, UMLS:C1504843, ...","[1.0, 1.0, 0.8456882238388062, 0.8403988480567...",1000000,"[UMLS:C1424158, UMLS:C3273663, UMLS:C1504843, ..."
8,10022127,"[[233, 242]]",TIF1alpha,[Gene],[],test,233,242,10022127.3,TIF1alpha,"[UMLS:C0906566, UMLS:C3888335, UMLS:C0965644, ...","[1.0, 1.0, 0.7551559805870056, 0.6794222593307...",1000000,"[UMLS:C0906566, UMLS:C3888335, UMLS:C0965644, ..."
9,10022127,"[[247, 255]]",TIF1beta,[Gene],[],test,247,255,10022127.4,TIF1beta,"[UMLS:C1506870, UMLS:C1704839, UMLS:C1456422, ...","[0.8167135119438171, 0.8167135119438171, 0.806...",1000000,"[UMLS:C1506870, UMLS:C1704839, UMLS:C1456422, ..."
10,10022127,"[[274, 282]]",TIF1beta,[Gene],[],test,274,282,10022127.5,TIF1beta,"[UMLS:C1506870, UMLS:C1704839, UMLS:C1456422, ...","[0.8167135119438171, 0.8167135119438171, 0.806...",1000000,"[UMLS:C1506870, UMLS:C1704839, UMLS:C1456422, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6243,9932288,"[[606, 609]]",XT3,[Gene],[],test,606,609,9932288.3,XT3,"[UMLS:C1429013, UMLS:C1619921, UMLS:C1414488, ...","[1.0, 0.7857595086097717, 0.7801902890205383, ...",1000000,"[UMLS:C1429013, UMLS:C1619921, UMLS:C1414488, ..."
6244,9932288,"[[739, 742]]",XT2,[Gene],[],test,739,742,9932288.4,XT2,"[UMLS:C1423547, UMLS:C1308965, UMLS:C0529768, ...","[1.0, 0.7665935158729553, 0.7482038736343384, ...",1000000,"[UMLS:C1423547, UMLS:C1308965, UMLS:C0529768, ..."
6245,9932288,"[[747, 750]]",XT3,[Gene],[],test,747,750,9932288.5,XT3,"[UMLS:C1429013, UMLS:C1619921, UMLS:C1414488, ...","[1.0, 0.7857595086097717, 0.7801902890205383, ...",1000000,"[UMLS:C1429013, UMLS:C1619921, UMLS:C1414488, ..."
6239,9932288,"[[1139, 1142]]",XT2,[Gene],[],test,1139,1142,9932288.6,XT2,"[UMLS:C1423547, UMLS:C1308965, UMLS:C0529768, ...","[1.0, 0.7665935158729553, 0.7482038736343384, ...",1000000,"[UMLS:C1423547, UMLS:C1308965, UMLS:C0529768, ..."


In [7]:
df.index

Int64Index([   55,    58,    70,    56,    57,    59,    60,    61,    62,
               63,
            ...
            28749, 28750, 28751, 28753, 28754, 28734, 28735, 28737, 28738,
            28739],
           dtype='int64', length=9750)

In [69]:
df

Unnamed: 0,document_id,offsets,text,type,db_ids,split,first_offset,last_offset,mention_id,candidates,scores,min_hit_index
2,10021369,"[[43, 76]]",adenomatous polyposis coli tumour,[Modifier],"[UMLS:C0032580, UMLS:C2713443, UMLS:C2713442]",train,43,76,10021369.1,"[MESH:D011125, MESH:D025601, MESH:D017491, MES...","[0.8481286764144897, 0.7981353998184204, 0.780...",1000000
3,10021369,"[[93, 132]]",adenomatous polyposis coli (APC) tumour,[Modifier],"[UMLS:C0032580, UMLS:C2713443, UMLS:C2713442]",train,93,132,10021369.2,"[MESH:D011125, MESH:D025601, MESH:D017491, MES...","[0.7652613520622253, 0.7201526761054993, 0.704...",1000000
1,10021369,"[[357, 372]]",colon carcinoma,[Modifier],"[UMLS:C0009375, UMLS:C0007102]",train,357,372,10021369.3,"[MESH:D002277, MESH:D015179, MESH:D018267, MES...","[0.8241872191429138, 0.7547714114189148, 0.678...",1000000
4,10021369,"[[955, 970]]",colon carcinoma,[Modifier],"[UMLS:C0009375, UMLS:C0007102]",train,955,970,10021369.4,"[MESH:D002277, MESH:D015179, MESH:D018267, MES...","[0.8241872191429138, 0.7547714114189148, 0.678...",1000000
0,10021369,"[[1090, 1096]]",cancer,[SpecificDisease],"[UMLS:C0006826, UMLS:C0086692, UMLS:C4282132, ...",train,1090,1096,10021369.5,"[MESH:D009369, MESH:D001005, MESH:D009062, MES...","[1.0, 0.8050955533981323, 0.7720818519592285, ...",1000000
...,...,...,...,...,...,...,...,...,...,...,...,...
6804,9988281,"[[996, 1015]]",breast malignancies,[SpecificDisease],"[UMLS:C1257930, UMLS:C1257931, UMLS:C1458155, ...",test,996,1015,9988281.7,"[MESH:D001943, MESH:D019337, MESH:D009369, MES...","[0.6743446588516235, 0.6187905073165894, 0.596...",1000000
6794,9988281,"[[1123, 1147]]",invasive lobular cancers,[DiseaseClass],[UMLS:C0206692],test,1123,1147,9988281.8,"[MESH:D002820, MESH:D058365, MESH:D000072742, ...","[0.565710723400116, 0.5440589189529419, 0.5105...",1000000
6795,9988281,"[[1152, 1179]]",low-grade ductal carcinomas,[SpecificDisease],[UMLS:C1176475],test,1152,1179,9988281.9,"[MESH:D044584, MESH:D002285, MESH:D018270, MES...","[0.6658602356910706, 0.5461992025375366, 0.534...",1000000
6797,9988281,"[[1269, 1286]]",ductal carcinomas,[SpecificDisease],[UMLS:C1176475],test,1269,1286,9988281.10,"[MESH:D044584, MESH:D002285, MESH:D018270, MES...","[0.8752213716506958, 0.7109460234642029, 0.703...",1000000


### Load BigScience Biomedical dataset utils

In [13]:
df.text.tolist()[:1]

['22-oxacalcitriol']

In [18]:
df.shape
df['scispacy_candidates'] = candidates

In [19]:
df.head()

Unnamed: 0,document_id,offsets,text,type,db_ids,split,first_offset,last_offset,mention_id,scispacy_candidates
0,10027919,"[[0, 16]]",22-oxacalcitriol,[Chemical],[MESH:C051883],validation,0,16,10027919.1,"[(C0006668, [Calcitrin], [0.48612040281295776]..."
23,10027919,"[[28, 57]]",secondary hyperparathyroidism,[Disease],[MESH:D006962],validation,28,57,10027919.2,"[(C0020502, [hyperparathyroidism, HYPERPARATHY..."
32,10027919,"[[75, 92]]",low bone turnover,[Disease],[MESH:D001851],validation,75,92,10027919.3,"[(C0031227, [turnover, turnovers], [0.80338394..."
1,10027919,"[[106, 119]]",renal failure,[Disease],[MESH:D051437],validation,106,119,10027919.4,"[(C0001623, [adrenal failure], [0.843946099281..."
5,10027919,"[[133, 143]]",Calcitriol,[Chemical],[MESH:D002117],validation,133,143,10027919.5,"[(C0006668, [Calcitrin], [0.779952347278595]),..."


In [26]:


get_candidates(df, mesh_cg)

Generating candidates for 28816 mentions
Number of empty vectors: 192
Finding neighbors took 1.486444 seconds


In [27]:
df

Unnamed: 0,document_id,offsets,text,type,db_ids,split,first_offset,last_offset,mention_id,scispacy_candidates,candidates,scores
0,10027919,"[[0, 16]]",22-oxacalcitriol,[Chemical],[MESH:C051883],validation,0,16,10027919.1,"[(C0006668, [Calcitrin], [0.48612040281295776]...","[D002117, D018167, D002116, D002762, D065668, ...","[0.6363470554351807, 0.5304351449012756, 0.480..."
23,10027919,"[[28, 57]]",secondary hyperparathyroidism,[Disease],[MESH:D006962],validation,28,57,10027919.2,"[(C0020502, [hyperparathyroidism, HYPERPARATHY...","[D006962, D006961, D007037, D049950, D007011, ...","[1.0, 0.8202486634254456, 0.7772029042243958, ..."
32,10027919,"[[75, 92]]",low bone turnover,[Disease],[MESH:D001851],validation,75,92,10027919.3,"[(C0031227, [turnover, turnovers], [0.80338394...","[D016723, D010563, D063269, D001851, D001842, ...","[0.8778586983680725, 0.5458993315696716, 0.521..."
1,10027919,"[[106, 119]]",renal failure,[Disease],[MESH:D051437],validation,106,119,10027919.4,"[(C0001623, [adrenal failure], [0.843946099281...","[D051437, D058186, D007676, D006333, D017093, ...","[1.0, 0.8490089178085327, 0.7505104541778564, ..."
5,10027919,"[[133, 143]]",Calcitriol,[Chemical],[MESH:D002117],validation,133,143,10027919.5,"[(C0006668, [Calcitrin], [0.779952347278595]),...","[D002117, D018167, D002116, D002762, D002119, ...","[1.0, 0.8335597515106201, 0.7767539024353027, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
28802,9952311,"[[1145, 1156]]",bradycardia,[Disease],[MESH:D001919],validation,1145,1156,9952311.13,"[(C0085610, [bradycardia sinus, Sinus Bradycar...","[D001919, D002299, D013610, D009543, D009615, ...","[1.0, 0.5819385647773743, 0.4919999837875366, ..."
28803,9952311,"[[1332, 1340]]",swelling,[Disease],[MESH:D004487],validation,1332,1340,9952311.14,"[(C0013604, [SWELLING], [1.0]), (C0038999, [SW...","[D001929, D008933, D029743, D013546, D034241, ...","[0.8281610608100891, 0.6587820649147034, 0.612..."
28804,9952311,"[[1612, 1615]]",ADR,[Chemical],[MESH:D004317],validation,1612,1615,9952311.15,"[(C0013089, [ADR, adr], [1.0, 1.0]), (C0031831...","[D003348, D000070780, D005039, D004837, D00547...","[0.37323009967803955, 0.36630523204803467, 0.3..."
28807,9952311,"[[1799, 1802]]",ADR,[Chemical],[MESH:D004317],validation,1799,1802,9952311.16,"[(C0013089, [ADR, adr], [1.0, 1.0]), (C0031831...","[D003348, D000070780, D005039, D004837, D00547...","[0.37323009967803955, 0.36630523204803467, 0.3..."


In [16]:
[(cand.concept_id, max(cand.similarities)) for cand in sorted(c, key=lambda x: max(x.similarities), reverse=True)]

[('C0043872', 1.0),
 ('C0654280', 0.68692547082901),
 ('C0006674', 0.6379262804985046),
 ('C2698951', 0.6379262804985046),
 ('C3714610', 0.6379262804985046),
 ('C3207494', 0.5465289950370789),
 ('C3207493', 0.5359896421432495),
 ('C3821496', 0.5242783427238464),
 ('C2825373', 0.5113562941551208),
 ('C0046706', 0.5093730688095093),
 ('C4074702', 0.5073736310005188),
 ('C5203521', 0.5073736310005188),
 ('C0108082', 0.4992302656173706),
 ('C3657722', 0.4992302656173706),
 ('C3207492', 0.49131691455841064),
 ('C0006668', 0.48612040281295776)]

In [2]:
conhelps = BigBioConfigHelpers()
print("found {} dataset configs from {} datasets".format(
    len(conhelps),
    len(conhelps.available_dataset_names)
))



found 453 dataset configs from 127 datasets


### Load SciSpacy models and UMLS KB

**Models considered**
* SciSpacy Large
* SciSpacy + SciBERT

Depending on the dataset in question, we link entities to either MeSH or UMLS

In [36]:
nlp_lg = spacy.load("en_core_sci_lg")
nlp_scibert = spacy.load("en_core_sci_scibert")
nlp_lg.add_pipe("abbreviation_detector")
nlp_scibert.add_pipe("abbreviation_detector")


<scispacy.abbreviation.AbbreviationDetector at 0x7fc870c25150>

In [37]:
nlp_lg.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls", 'filter_for_definitions':False, "max_entities_per_mention":100, "no_definition_threshold":.7, "threshold":0.4})
nlp_scibert.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls", 'filter_for_definitions':False, "max_entities_per_mention":100, "no_definition_threshold":.7, "threshold":0.4})

<scispacy.linking.EntityLinker at 0x7fc870bcf5e0>

In [38]:
lg_mesh = spacy.load("en_core_sci_lg")
lg_mesh.add_pipe("abbreviation_detector")
lg_mesh.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, 
                                            "linker_name": "mesh", 
                                            'filter_for_definitions':False, 
                                            "max_entities_per_mention":100, 
                                            "no_definition_threshold":.7, "threshold":0.4})
scibert_mesh = spacy.load("en_core_sci_scibert")
scibert_mesh.add_pipe("abbreviation_detector")
scibert_mesh.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, 
                                            "linker_name": "mesh", 
                                            'filter_for_definitions':False, 
                                            "max_entities_per_mention":100, 
                                            "no_definition_threshold":.7, "threshold":0.4})

<scispacy.linking.EntityLinker at 0x7fbec1e8c5b0>

### Load UMLS semantic type mappings + tree

In [3]:
st21pv_types = {
    'T005': "Virus" ,
    'T007': "Bacterium",
    'T017': "Anatomical Structure",
    'T022': "Body System",
    'T031': "Body Substance",
    'T033': "Finding",
    'T037': "Injury or Poisoning",
    'T038': "Biologic Function",
    'T058': "Health Care Activity",
    'T062': "Research Activity",
    'T074': "Medical Device",
    'T082': "Spatial Concept",
    'T091': "Biomedical Occupation or Discipline",
    'T092': "Organization",
    'T097': "Professional or Occupational Group",
    'T098': "Population Group",
    'T103': "Chemical",
    'T168': "Food",
    'T170': "Intellectual Product",
    'T201': "Clinical Attribute",
    'T204': "Eukaryote",
}

tree = kb.semantic_type_tree

st21pv_subtypes = list(st21pv_types.keys())
for key in st21pv_types.keys():
    node = tree.get_node_from_id(key)
    st21pv_subtypes.extend([x.type_id for x in tree.get_children(node)])

with open('st21pv_subtypes.json', 'w') as f:
    f.write(ujson.dumps(st21pv_subtypes, indent=2))

type2name = {x.type_id:x.full_name for x in tree.flat_nodes}

In [10]:
# def get_semantic_type_hierarchy(umls_dir, tui)
umls_dir = '/mitchell/entity-linking/2017AA/META'

sem_network_cols = [
    "row_type",
    "tui",
    "name",
    "tree_index",
    "desc",
    "_1",
    "clarification",
    "_2",
    "abbrev",
    "inverse_relation",
    "_3",
]

semantic_network = pd.read_csv(
    os.path.join(umls_dir, "semantic_network/SRDEF"),
    sep="|",
    names=sem_network_cols,
    usecols=["row_type", "tui", "name", "tree_index"],
)

semantic_network = semantic_network.query('row_type == "STY"')

tree_index_to_name = semantic_network.set_index('tree_index')['name'].to_dict()

tree_index_to_parent = {x: '.'.join(x.split('.')[:-1]) for x in semantic_network.tree_index if len(x.split('.')) > 1}

tui_to_tree_index = semantic_network.set_index('tui')['tree_index'].to_dict()

tui_to_name = semantic_network.set_index('tui')['name'].to_dict()

name_to_tui

def get_tui_geneology(tui, min_level=1):
    types = []
    tree_index = tui_to_tree_index[tui]
    split_index = tree_index.split('.')
    for i in range(min_level, len(split_index) + 1):
        types.append(tree_index_to_name['.'.join(split_index[:i])])

    return types
    


tui_geneologies = {tui: get_tui_geneology(tui) for tui in tui_to_name.keys()}
tui_geneologies

with open('../data/tui2type_hierarchy.json', 'w') as f:
    f.write(ujson.dumps(tui_geneologies, indent=2))
# print(tui_to_name)

In [20]:
tui_geneologies['T170']


['Conceptual Entity', 'Intellectual Product']

### Helper functions for processing data

In [40]:
def filter_candidates(candidates, filter_types):
    final_candidates = [e for e in candidates if any([x in filter_types for x in kb.cui_to_entity[e['db_id'].split(':')[-1]].types])]
    return final_candidates

In [42]:
def process_abstract_scispacy(text: str,
                    model,
                    eps: float=1e-6, 
                    db_name: str='UMLS', 
                    filter_types: List=None):
    '''
    SciSpacy processing of a single abstract
    '''
    doc = model(text)
    ent_list = []
    for ent in doc.ents:
        offsets = [(ent.start_char, ent.end_char)]
        text = ent.text
        candidates = [{'db_id': db_name + ':'+e[0], 'score':e[1]} for e in ent._.kb_ents]

        # Filter to candidates of particular types when specified
        if filter_types is not None:
            candidates = filter_candidates(candidates, filter_types)
        if len(candidates) == 0:
            continue
        
        max_score = max([x['score'] for x in candidates])
        choice = np.random.choice([x['db_id'] for x in candidates if x['score'] > max_score - eps])
        ent_list.append({
            "offsets":offsets, 
            "text":text, 
            "candidates": candidates, 
            "normalized":[{'db_name': db_name, 'db_id': choice}]})
    return ent_list
    
    

In [43]:
dataset_names = ['medmentions_st21pv', 'bc5cdr','gnormplus','nlmchem','nlm_gene','medmentions_full',]
dataset_to_models = {'bc5cdr':{'scispacy_large':lg_mesh,
                               'scibert':scibert_mesh},
                     'medmentions_full':{'scispacy_large':nlp_lg,
                                         'scibert':nlp_scibert},
                     'medmentions_st21pv':{'scispacy_large':nlp_lg,
                                           'scibert':nlp_scibert},
                     'gnormplus':{'scispacy_large':nlp_lg,
                                  'scibert':nlp_scibert},
                     'nlmchem':{'scispacy_large':lg_mesh,
                                'scibert':scibert_mesh},
                     'nlm_gene':{'scispacy_large':nlp_lg,
                                 'scibert':nlp_scibert}
                    }

dataset_to_linked_ontology = {'bc5cdr':'MESH',
                              'medmentions_full':'UMLS',
                              'medmentions_st21pv':'UMLS',
                              'gnormplus':'UMLS',
                              'nlmchem':"MESH",
                              'nlm_gene':'UMLS'}

processed = {name:defaultdict(list) for name in dataset_names}
for dataset in tqdm(dataset_names):
    data = conhelps.for_config_name(f'{dataset}_bigbio_kb').load_dataset()
    for model_name, model in dataset_to_models[dataset].items():
#         for split in ['test','validation','train',]:
        for split in ['test']:
            if split not in data:
                print(f"No {split} found in {dataset}")
                continue
            for document in tqdm(data[split]):
                text = ' '.join([t for x in document['passages'] for t in x['text']])
                pmid = document['document_id']
                gold_entities = document['entities']
                
                filter_types = None
                if 'st21pv' in dataset:
                    filter_types = st21pv_subtypes
                
                scispacy_entities = process_abstract_scispacy(text, 
                                                              model, 
                                                              db_name=dataset_to_linked_ontology[dataset], 
                                                              filter_types=filter_types)
                processed[dataset][model_name].append({'pmid':pmid, 
                                  'text':text, 
                                  'dataset':dataset, 
                                  'split':split, 
                                  'gold_entities':gold_entities, 
                                  'predictions':scispacy_entities})
                
with open(f'scispacy_outputs.json', 'w') as f:
    f.write(ujson.dumps(processed, indent=2))

  0%|          | 0/6 [00:00<?, ?it/s]

Reusing dataset med_mentions_dataset (/home/ubuntu/.cache/huggingface/datasets/med_mentions_dataset/medmentions_st21pv_bigbio_kb/1.0.0/3fc6b8a3681d540ae6c7497c238636b543b90764247b5ff3642d243474000794)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/879 [00:00<?, ?it/s]

Reusing dataset bc5cdr_dataset (/home/ubuntu/.cache/huggingface/datasets/bc5cdr_dataset/bc5cdr_bigbio_kb/1.0.0/f01f16ea9b65ead985bedadf7335195c32297c8f1b09417fc607b102a6757d6f)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Reusing dataset gnormplus_dataset (/home/ubuntu/.cache/huggingface/datasets/gnormplus_dataset/gnormplus_bigbio_kb/1.0.0/a07d375cacec149beba70fa6c1ea3ac78628e6652dca5345f3478354e8c0ae96)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

Reusing dataset nlm_chem_dataset (/home/ubuntu/.cache/huggingface/datasets/nlm_chem_dataset/nlmchem_bigbio_kb/1.0.0/d91131823c66b7dd1162027991ea47c342e478209b37cf261c5f122d30409594)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Reusing dataset nlm_gene_dataset (/home/ubuntu/.cache/huggingface/datasets/nlm_gene_dataset/nlm_gene_bigbio_kb/1.0.0/961ce939e0f3a3315d6a179b7afa33173e73bcb3d781a26c058484fbb1a944ca)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Reusing dataset med_mentions_dataset (/home/ubuntu/.cache/huggingface/datasets/med_mentions_dataset/medmentions_full_bigbio_kb/1.0.0/3fc6b8a3681d540ae6c7497c238636b543b90764247b5ff3642d243474000794)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/879 [00:00<?, ?it/s]

## Helper functions for output evaluation

In [67]:
def _hit_at_k(gold_entity, candidate_entities, k, mode='strict'):
    '''
    Determine if correct entity link is in top k entity candidates
    '''
    gold_cuis = gold_entity['db_ids']
    top_candidates = sorted(candidate_entities, key=lambda x: x['score'])[::-1][:k]
    
    for x in top_candidates:
        if x['db_id'] in gold_cuis:
            return True

    return False
#     if gold_cui in [x['db_id'] for x in top_candidates]:
#         return True 
#     else:
#         return False
    


def hits_by_type(gold_lookup, model_lookup, ks=[1, 2, 4, 8, 16, 32, 64], custom_types=None):
    '''
    Get hits@k grouped by entity type.  

    By default, bases type annotations on 
    '''
    # Hits @ k
    hits_dict = {}
    entity_types = set([])
    for k in ks:
        hits = defaultdict(int)
        # Get hits @ k for each entity type
        for key, gold_entity in gold_lookup.items():
            ent_type = gold_entity['type']

            
            # This will break after recent code updates
            if custom_types is not None:
                cuis = [x.split(':')[-1] for x in gold_entity['db_id']]

                # Make sure CUI is in SciSpacy KB
                for cui in cuis:
                    if cui in kb.cui_to_entity:
                        pred_types = kb.cui_to_entity[cui].types
                        for group_name, vals in custom_types.items():
                            if any([x in vals for x in pred_types]):
                                # TODO: Handle case where an entity falls in more than 1 category in custom dict
                                ent_type = group_name
                                entity_types.add(ent_type)
                                break
            else:
                entity_types.add(ent_type)
            

            if key in model_lookup:
                hits[ent_type] += _hit_at_k(gold_entity, model_lookup[key]['candidates'], k)
                hits[ent_type + "_total"] += 1
            else:
                hits[ent_type + '_missing'] += 1

        overall_hits = sum(hits[x] for x in entity_types)/sum(hits[x + '_total'] for x in entity_types)
        overall_recall = sum(hits[x] for x in entity_types)/sum(hits[x + '_total'] + hits[x + '_missing'] for x in entity_types)
        hits['overall (exclude missing)'] = overall_hits
        hits['overall (include missing)'] = overall_recall
        hits_dict[k] = hits
        
    hits_dict['types'] = entity_types
    return hits_dict


def evaluate_linking(processed_data, model='scispacy', custom_types=None):
    '''
    Evaluate a number of metrics on entity linking models:
        * Hits@k for k = 1, 5, 10
    '''
    # preformat data into dict with keys of form (pmid, offset_start, offset_end) for faster access
    
    gold_lookup = {(doc['pmid'], x['offsets'][0][0], x['offsets'][0][1]): {'db_ids': [y['db_name'] + ':' + y['db_id'] for y in x['normalized']], 'type':x['type']} 
                        for doc in processed_data for x in doc['gold_entities'] if len(x['normalized']) > 0}
    model_lookup = {(doc['pmid'], x['offsets'][0][0], x['offsets'][0][1]): {'normalized': x['normalized'][0]['db_id'], 'candidates':x['candidates']} 
                        for doc in processed_data for x in doc['predictions']}

    # for doc in processed_data:
    #     pmid = doc['pmid']
    #     for x in doc['predictions']['model']

    hits_dict = hits_by_type(gold_lookup, model_lookup, custom_types=custom_types)

    return hits_dict
    

def format_output(hits_dict, type_to_name_mapping=type2name):
    '''
    Format entity linking results into pandas DataFrame
    '''
    entity_types = hits_dict['types']
    ks = [k for k in hits_dict.keys() if type(k) == int]
    rows = []
    for t in entity_types:
        if t not in type_to_name_mapping:
            name = t.upper()
        else:
            name = type_to_name_mapping[t]
        rows.append({**{'Semantic Type':name}, **{f'recall@{k}':hits_dict[k][t]/hits_dict[k][t + '_total'] for k in ks}, **{'Spans Matched':hits_dict[ks[0]][t + '_total'], 'Missing':hits_dict[ks[0]][t + '_missing']}})
    
    # Add overall scores
    rows.append({**{'Semantic Type':"**Overall**"}, **{f'recall@{k}':hits_dict[k]["overall (exclude missing)"] for k in ks}, **{'Spans Matched':np.sum([hits_dict[ks[0]][t + '_total'] for t in entity_types]), 'Missing':np.sum([hits_dict[ks[0]][t + '_missing'] for t in entity_types])}})
    rows.append({**{'Semantic Type':"**Overall (including missing spans)**"}, **{f'recall@{k}':hits_dict[k]["overall (include missing)"] for k in ks}, **{'Spans Matched':np.sum([hits_dict[ks[0]][t + '_total'] for t in entity_types]), 'Missing':np.sum([hits_dict[ks[0]][t + '_missing'] for t in entity_types])}})


    results = pd.DataFrame.from_records(rows)
    results['Total'] = results[['Spans Matched','Missing']].sum(axis=1)
    results['Missing Proportion'] = (results['Missing']/(results['Total']))
    return results.round(4)
            


In [65]:
processed['medmentions_st21pv']['scibert'][0]

{'pmid': '25847295',
 'text': 'Nonylphenol diethoxylate inhibits apoptosis induced in PC12 cells Nonylphenol and short-chain nonylphenol ethoxylates such as NP2 EO are present in aquatic environment as wastewater contaminants, and their toxic effects on aquatic species have been reported. Apoptosis has been shown to be induced by serum deprivation or copper treatment. To understand the toxicity of nonylphenol diethoxylate, we investigated the effects of NP2 EO on apoptosis induced by serum deprivation and copper by using PC12 cell system. Nonylphenol diethoxylate itself showed no toxicity and recovered cell viability from apoptosis. In addition, nonylphenol diethoxylate decreased DNA fragmentation caused by apoptosis in PC12 cells. This phenomenon was confirmed after treating apoptotic PC12 cells with nonylphenol diethoxylate, whereas the cytochrome c release into the cytosol decreased as compared to that in apoptotic cells not treated with nonylphenol diethoxylate s. Furthermore, Bax 

In [68]:
for dataset in dataset_names:
    for model in ['scibert','scispacy_large']:
        print(dataset, model)
        hits_dict = evaluate_linking(processed[dataset][model])
        display(format_output(hits_dict))
        

medmentions_st21pv scibert


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,Health Care Activity,0.4884,0.6209,0.7233,0.7931,0.807,0.808,0.8083,3094,1693,4787,0.3537
1,Clinical Attribute,0.5388,0.7481,0.845,0.8488,0.8605,0.8605,0.8605,258,65,323,0.2012
2,Chemical,0.4567,0.5994,0.7027,0.7717,0.7954,0.8005,0.8027,5073,2334,7407,0.3151
3,Organization,0.6164,0.6681,0.7672,0.7716,0.7759,0.7759,0.7759,232,150,382,0.3927
4,Virus,0.7107,0.8182,0.9421,0.9421,0.9504,0.9504,0.9504,121,51,172,0.2965
5,Finding,0.4329,0.6164,0.7149,0.7537,0.7678,0.7704,0.7704,1908,1303,3211,0.4058
6,Anatomical Structure,0.5967,0.7208,0.7988,0.8281,0.8376,0.8384,0.8389,2321,1452,3773,0.3848
7,Injury or Poisoning,0.6625,0.7167,0.7792,0.7958,0.8042,0.8042,0.8042,240,117,357,0.3277
8,Bacterium,0.6517,0.7865,0.839,0.839,0.839,0.839,0.839,267,182,449,0.4053
9,Spatial Concept,0.6373,0.7431,0.8075,0.8558,0.8604,0.8604,0.8604,1304,1105,2409,0.4587


medmentions_st21pv scispacy_large


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,Health Care Activity,0.4891,0.6213,0.7241,0.7924,0.8059,0.8065,0.8069,3179,1608,4787,0.3359
1,Clinical Attribute,0.5285,0.7529,0.8403,0.8441,0.8555,0.8555,0.8555,263,60,323,0.1858
2,Chemical,0.4656,0.6071,0.7095,0.7778,0.7997,0.8051,0.8068,5243,2164,7407,0.2922
3,Organization,0.5944,0.6546,0.751,0.755,0.759,0.759,0.759,249,133,382,0.3482
4,Virus,0.6984,0.8175,0.9444,0.9444,0.9524,0.9524,0.9524,126,46,172,0.2674
5,Finding,0.4352,0.6196,0.7138,0.7537,0.7696,0.7721,0.7721,1953,1258,3211,0.3918
6,Anatomical Structure,0.5916,0.7186,0.7916,0.8204,0.8304,0.8312,0.8317,2299,1474,3773,0.3907
7,Injury or Poisoning,0.6423,0.7,0.7538,0.7692,0.7808,0.7808,0.7808,260,97,357,0.2717
8,Bacterium,0.6464,0.7714,0.8214,0.8214,0.8214,0.8214,0.8214,280,169,449,0.3764
9,Spatial Concept,0.6351,0.7344,0.7991,0.8488,0.8518,0.8518,0.8518,1329,1080,2409,0.4483


bc5cdr scibert


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,CHEMICAL,0.8952,0.8986,0.8989,0.8989,0.8989,0.8989,0.8989,2909,2455,5364,0.4577
1,DISEASE,0.8181,0.8669,0.8789,0.8829,0.8861,0.8861,0.8861,2749,1720,4469,0.3849
2,**Overall**,0.8577,0.8832,0.8892,0.8911,0.8927,0.8927,0.8927,5658,4175,9833,0.4246
3,**Overall (including missing spans)**,0.4935,0.5082,0.5116,0.5128,0.5137,0.5137,0.5137,5658,4175,9833,0.4246


bc5cdr scispacy_large


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,CHEMICAL,0.8931,0.896,0.896,0.896,0.896,0.896,0.896,3088,2276,5364,0.4243
1,DISEASE,0.8201,0.865,0.8767,0.8807,0.8833,0.8833,0.8833,2741,1728,4469,0.3867
2,**Overall**,0.8588,0.8815,0.8869,0.8888,0.89,0.89,0.89,5829,4004,9833,0.4072
3,**Overall (including missing spans)**,0.5091,0.5225,0.5258,0.5269,0.5276,0.5276,0.5276,5829,4004,9833,0.4072


gnormplus scibert


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,GENE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2148,1075,3223,0.3335
1,**Overall**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2148,1075,3223,0.3335
2,**Overall (including missing spans)**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2148,1075,3223,0.3335


gnormplus scispacy_large


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,GENE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2266,957,3223,0.2969
1,**Overall**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2266,957,3223,0.2969
2,**Overall (including missing spans)**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2266,957,3223,0.2969


nlmchem scibert


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,CHEMICAL,0.752,0.7586,0.7598,0.7598,0.7598,0.7598,0.7598,4064,7596,11660,0.6515
1,**Overall**,0.752,0.7586,0.7598,0.7598,0.7598,0.7598,0.7598,4064,7596,11660,0.6515
2,**Overall (including missing spans)**,0.2621,0.2644,0.2648,0.2648,0.2648,0.2648,0.2648,4064,7596,11660,0.6515


nlmchem scispacy_large


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,CHEMICAL,0.7539,0.7614,0.7626,0.7626,0.7626,0.7626,0.7626,4254,7406,11660,0.6352
1,**Overall**,0.7539,0.7614,0.7626,0.7626,0.7626,0.7626,0.7626,4254,7406,11660,0.6352
2,**Overall (including missing spans)**,0.275,0.2778,0.2782,0.2782,0.2782,0.2782,0.2782,4254,7406,11660,0.6352


nlm_gene scibert


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,GENERIF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,387,277,664,0.4172
1,DOMAIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,1,10,0.1
2,GENE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1087,648,1735,0.3735
3,OTHER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,13,14,0.9286
4,STARGENE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,154,178,332,0.5361
5,**Overall**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1638,1117,2755,0.4054
6,**Overall (including missing spans)**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1638,1117,2755,0.4054


nlm_gene scispacy_large


Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,recall@64,Spans Matched,Missing,Total,Missing Proportion
0,GENERIF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,408,256,664,0.3855
1,DOMAIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,2,10,0.2
2,GENE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1114,621,1735,0.3579
3,OTHER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,12,14,0.8571
4,STARGENE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,176,156,332,0.4699
5,**Overall**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1708,1047,2755,0.38
6,**Overall (including missing spans)**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1708,1047,2755,0.38


medmentions_full scibert


ZeroDivisionError: division by zero

In [11]:
test_preds = ujson.load(open('scispacy_test_processed.json', 'r'))
train_preds = ujson.load(open('scispacy_train_processed.json', 'r'))
val_preds = ujson.load(open('scispacy_validation_processed.json', 'r'))

all_preds = [*train_preds, *test_preds, *val_preds]


all_hits_dict = evaluate_linking(all_preds)
format_output(all_hits_dict)

  0%|          | 0/203185 [00:00<?, ?it/s]

  overall_hits = np.sum(hits[x] for x in entity_types)/np.sum(hits[x + '_total'] for x in entity_types)
  overall_recall = np.sum(hits[x] for x in entity_types)/np.sum(hits[x + '_total'] + hits[x + '_missing'] for x in entity_types)


  0%|          | 0/203185 [00:00<?, ?it/s]

  0%|          | 0/203185 [00:00<?, ?it/s]

  0%|          | 0/203185 [00:00<?, ?it/s]

Unnamed: 0,Semantic Type,Hits @ 1,Hits @ 5,Hits @ 10,Hits @ 25,Spans Matched,Missing,Total,Missing Proportion
0,Virus,0.6174,0.8558,0.8605,0.8616,860,238,1098,0.2168
1,Clinical Attribute,0.5312,0.8423,0.8466,0.8545,1395,377,1772,0.2128
2,Biomedical Occupation or Discipline,0.6103,0.8985,0.9074,0.9221,680,236,916,0.2576
3,Organization,0.4584,0.7642,0.7739,0.7788,1442,701,2143,0.3271
4,Food,0.3409,0.69,0.8105,0.8157,971,380,1351,0.2813
5,Research Activity,0.5476,0.7771,0.8113,0.8265,6704,2463,9167,0.2687
6,Intellectual Product,0.3659,0.7317,0.7559,0.7653,6076,4246,10322,0.4114
7,Anatomical Structure,0.5519,0.8104,0.8312,0.8407,14848,5640,20488,0.2753
8,Chemical,0.4342,0.7289,0.777,0.7918,28490,8890,37380,0.2378
9,Bacterium,0.6066,0.8404,0.8468,0.8475,1416,633,2049,0.3089


In [69]:
hits_dict

{1: defaultdict(int,
             {'T131_missing': 111,
              'T052': 669,
              'T052_total': 920,
              'T043': 167,
              'T043_total': 343,
              'T169': 1379,
              'T169_total': 2672,
              'T025': 348,
              'T025_total': 561,
              'T131': 108,
              'T131_total': 208,
              'T033_missing': 1338,
              'T067': 111,
              'T067_total': 259,
              'T069': 24,
              'T069_total': 26,
              'T167': 132,
              'T167_total': 242,
              'T037_missing': 118,
              'T001': 59,
              'T001_total': 149,
              'T031_missing': 57,
              'T080_missing': 2850,
              'T196_missing': 149,
              'T169_missing': 2002,
              'T080': 1823,
              'T080_total': 3503,
              'T196': 56,
              'T196_total': 236,
              'T081': 956,
              'T081_total': 2361,
           

In [140]:
hits_dict = evaluate_linking(test_preds)
format_output(hits_dict)

Unnamed: 0,Semantic Type,Hits @ 1,Hits @ 5,Hits @ 10,Hits @ 25,Spans Matched,Missing,Total,Missing Proportion
0,Intellectual Product,0.3485,0.7194,0.741,0.7538,1251,1114,2365,0.471
1,Organization,0.4858,0.7449,0.753,0.7571,247,135,382,0.3534
2,Clinical Attribute,0.494,0.8606,0.8606,0.8725,251,72,323,0.2229
3,Eukaryote,0.4828,0.8474,0.8629,0.8681,1160,590,1750,0.3371
4,Finding,0.3941,0.7076,0.7482,0.7667,1946,1265,3211,0.394
5,Injury or Poisoning,0.6432,0.7759,0.7925,0.805,241,116,357,0.3249
6,Research Activity,0.5574,0.8019,0.8333,0.8475,1272,575,1847,0.3113
7,Body System,0.2807,0.7368,0.7368,0.8596,57,33,90,0.3667
8,Virus,0.6356,0.9068,0.9068,0.9153,118,54,172,0.314
9,Food,0.246,0.6043,0.738,0.7487,187,135,322,0.4193


In [141]:
p = preds[0]

In [142]:
p['predictions']['scispacy']

[{'offsets': [[0, 11]],
  'text': 'Nonylphenol',
  'candidates': [{'db_id': 'UMLS:C0068946', 'score': 1.0},
   {'db_id': 'UMLS:C0953814', 'score': 0.8478714227676392},
   {'db_id': 'UMLS:C0953815', 'score': 0.8301727175712585},
   {'db_id': 'UMLS:C0970196', 'score': 0.7795582413673401},
   {'db_id': 'UMLS:C0097405', 'score': 0.767101526260376},
   {'db_id': 'UMLS:C3857939', 'score': 0.757386326789856},
   {'db_id': 'UMLS:C0953813', 'score': 0.7489328384399414},
   {'db_id': 'UMLS:C4236359', 'score': 0.7475515604019165},
   {'db_id': 'UMLS:C0132779', 'score': 0.7382228374481201}],
  'normalized': [{'db_name': 'UMLS', 'db_id': 'UMLS:C0068946'}]},
 {'offsets': [[25, 33]],
  'text': 'inhibits',
  'candidates': [{'db_id': 'UMLS:C0021467', 'score': 0.7524629831314087},
   {'db_id': 'UMLS:C0021469', 'score': 0.7524629831314087},
   {'db_id': 'UMLS:C3463820', 'score': 0.7524629831314087},
   {'db_id': 'UMLS:C0074775', 'score': 0.7493148446083069},
   {'db_id': 'UMLS:C1999216', 'score': 0.73101

In [143]:
p['gold_entities'][0]['normalized']

[{'db_name': 'UMLS', 'db_id': 'UMLS:C0162638'}]

In [11]:
PLANT_CATEGORIES = [
    'T096',  # Group
    'T002',  # Plant
]

DISEASE_CATEGORIES = [
    'T020',  # Acquired abnormality
    'T190',  # Anatomical abnormaility
    'T049',  # Cell or Molecular dysfunction
    'T019',  # Congential abnormality
    'T047',  # Disease or Syndrome
    # 'T037',  # Injury or Poising
    'T048',  # Mental or behavioural dysfunction
    'T191',  # Neoplastic process
    'T046',  # Pathologic function
    'T184',  # Sign or Symptom
]

PROTEIN_CATEGORIES = [
    'T192',  # Receptor
    'T126',  # Enzyme
    'T116',  # Amino acid, peptide, protein
]

PROCESS_CATEGORIES = [
    'T038',  # Biological process
    'T043',  # Cell function
    'T044',  # Molecular function
    'T039',  # Physiological function
]

categories = {'plant_categories':PLANT_CATEGORIES, 'disease_categories':DISEASE_CATEGORIES, 'protein_categories':PROTEIN_CATEGORIES, 'process_categories':PROCESS_CATEGORIES}


in_mmst21pv = {}
not_in_mmst21pv = {}
for key, val in categories.items():
    in_mmst21pv[key.upper()] = {x: tree.get_canonical_name(x) for x in val if x in st21pv_subtypes}
    not_in_mmst21pv[key] = {x: tree.get_canonical_name(x) for x in val if x not in st21pv_subtypes}

print("Our Categories covered by MedMentions ST21PV")
display(in_mmst21pv)

print("Our Categories NOT covered by MedMentions ST21PV")
display(not_in_mmst21pv)

Our Categories covered by MedMentions ST21PV


{'PLANT_CATEGORIES': {'T002': 'Plant'},
 'DISEASE_CATEGORIES': {'T020': 'Acquired Abnormality',
  'T190': 'Anatomical Abnormality',
  'T049': 'Cell or Molecular Dysfunction',
  'T019': 'Congenital Abnormality',
  'T047': 'Disease or Syndrome',
  'T048': 'Mental or Behavioral Dysfunction',
  'T191': 'Neoplastic Process',
  'T046': 'Pathologic Function',
  'T184': 'Sign or Symptom'},
 'PROTEIN_CATEGORIES': {'T192': 'Receptor',
  'T126': 'Enzyme',
  'T116': 'Amino Acid, Peptide, or Protein'},
 'PROCESS_CATEGORIES': {'T038': 'Biologic Function',
  'T043': 'Cell Function',
  'T044': 'Molecular Function',
  'T039': 'Physiologic Function'}}

Our Categories NOT covered by MedMentions ST21PV


{'plant_categories': {'T096': 'Group'},
 'disease_categories': {},
 'protein_categories': {},
 'process_categories': {}}

In [16]:
test_preds = ujson.load(open('/efs/davidkartchner/el-robustness-comparison/output/scispacy_test_processed.json', 'r'))
# train_preds = ujson.load(open('/efs/davidkartchner/el-robustness-comparison/output/scispacy_train_processed.json', 'r'))
# val_preds = ujson.load(open('/efs/davidkartchner/el-robustness-comparison/output/scispacy_validation_processed.json', 'r'))
# all_preds = [*train_preds, *test_preds, *val_preds]

In [35]:
all_hits_dict = evaluate_linking(all_preds, custom_types=categories)
format_output(all_hits_dict)



  0%|          | 0/203185 [00:00<?, ?it/s]

  overall_hits = np.sum(hits[x] for x in entity_types)/np.sum(hits[x + '_total'] for x in entity_types)
  overall_recall = np.sum(hits[x] for x in entity_types)/np.sum(hits[x + '_total'] + hits[x + '_missing'] for x in entity_types)


  0%|          | 0/203185 [00:00<?, ?it/s]

  0%|          | 0/203185 [00:00<?, ?it/s]

  0%|          | 0/203185 [00:00<?, ?it/s]

Unnamed: 0,Semantic Type,Hits @ 1,Hits @ 5,Hits @ 10,Hits @ 25,Spans Matched,Missing,Total,Missing Proportion
0,PLANT_CATEGORIES,0.5542,0.8074,0.8287,0.834,1319,612,1931,0.3169
1,PROCESS_CATEGORIES,0.4062,0.6955,0.7476,0.7732,6137,2192,8329,0.2632
2,DISEASE_CATEGORIES,0.5291,0.8572,0.8878,0.9033,20966,3942,24908,0.1583
3,PROTEIN_CATEGORIES,0.2628,0.6627,0.744,0.7664,11513,3370,14883,0.2264
4,**Overall**,0.4343,0.7747,0.8229,0.8415,39935,10116,50051,0.2021
5,**Overall (including missing spans)**,0.3465,0.6181,0.6566,0.6715,39935,10116,50051,0.2021


In [33]:
format_output(all_hits_dict)

Unnamed: 0,Semantic Type,Hits @ 1,Hits @ 5,Hits @ 10,Hits @ 25,Spans Matched,Missing,Total,Missing Proportion
0,Eukaryote,0.4932,0.8438,0.8525,0.8562,5057,1563,6620,0.2361
1,DISEASE_CATEGORIES,0.5291,0.8572,0.8878,0.9033,20966,3942,24908,0.1583
2,Population Group,0.5642,0.8176,0.885,0.8882,5027,1119,6146,0.1821
3,PLANT_CATEGORIES,0.5542,0.8074,0.8287,0.834,1319,612,1931,0.3169
4,Injury or Poisoning,0.6077,0.7634,0.7819,0.7901,1458,390,1848,0.211
5,Biomedical Occupation or Discipline,0.6103,0.8985,0.9074,0.9221,680,236,916,0.2576
6,Anatomical Structure,0.5507,0.8114,0.8327,0.842,14234,5471,19705,0.2776
7,Chemical,0.5505,0.7738,0.7995,0.809,16979,5520,22499,0.2453
8,Virus,0.6174,0.8558,0.8605,0.8616,860,238,1098,0.2168
9,PROCESS_CATEGORIES,0.4062,0.6955,0.7476,0.7732,6137,2192,8329,0.2632


In [36]:
# test_hits_dict = evaluate_linking(test_preds, custom_types=categories)

# scispacy_results = format_output(test_hits_dict)
# scispacy_results['Model'] = 'Scispacy_en_sm'
# cols = ['Semantic Type'] + [col for col in scispacy_results.columns if col.startswith('recall')] + ['Total','Missing', 'Model']
scispacy_results[cols].to_pickle('/efs/davidkartchner/el-robustness-comparison/output/scispacy_custom_categories.pickle')



In [38]:
test_hits_dict = evaluate_linking(test_preds)
scispacy_results = format_output(test_hits_dict)
scispacy_results['Model'] = 'Scispacy_en_sm'
cols = ['Semantic Type'] + [col for col in scispacy_results.columns if col.startswith('recall')] + ['Total','Missing', 'Model']
scispacy_results[cols].to_pickle('/efs/davidkartchner/el-robustness-comparison/output/scispacy_output.pickle')



  0%|          | 0/40143 [00:00<?, ?it/s]

  overall_hits = np.sum(hits[x] for x in entity_types)/np.sum(hits[x + '_total'] for x in entity_types)
  overall_recall = np.sum(hits[x] for x in entity_types)/np.sum(hits[x + '_total'] + hits[x + '_missing'] for x in entity_types)


  0%|          | 0/40143 [00:00<?, ?it/s]

  0%|          | 0/40143 [00:00<?, ?it/s]

  0%|          | 0/40143 [00:00<?, ?it/s]

  0%|          | 0/40143 [00:00<?, ?it/s]

  0%|          | 0/40143 [00:00<?, ?it/s]

In [34]:
!pip install --upgrade pandas

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting pandas
  Downloading pandas-1.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.4.2
    Uninstalling pandas-1.4.2:
      Successfully uninstalled pandas-1.4.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyenveda 0.0.6 requires statsmodels, which is not installed.[0m[31m
[0mSuccessfully installed pandas-1.4.3


In [29]:
scispacy_results

Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,Spans Matched,Missing,Total,Missing Proportion,Model
0,PROCESS_CATEGORIES,0.4513,0.643,0.7302,0.7759,0.8002,0.8063,986,542,1528,0.3547,Scispacy_en_sm
1,PROTEIN_CATEGORIES,0.2623,0.4179,0.6223,0.7392,0.7814,0.7916,1967,845,2812,0.3005,Scispacy_en_sm
2,DISEASE_CATEGORIES,0.522,0.726,0.8535,0.8809,0.9069,0.9074,3803,1071,4874,0.2197,Scispacy_en_sm
3,PLANT_CATEGORIES,0.4455,0.6818,0.7727,0.8409,0.8455,0.8455,220,191,411,0.4647,Scispacy_en_sm
4,**Overall**,0.4364,0.626,0.7683,0.8248,0.8545,0.8585,6976,2649,9625,0.2752,Scispacy_en_sm
5,**Overall (including missing spans)**,0.3163,0.4537,0.5569,0.5978,0.6193,0.6222,6976,2649,9625,0.2752,Scispacy_en_sm


In [22]:
test_hits_dict = evaluate_linking(test_preds)
format_output(test_hits_dict)

  0%|          | 0/40143 [00:00<?, ?it/s]

  overall_hits = np.sum(hits[x] for x in entity_types)/np.sum(hits[x + '_total'] for x in entity_types)
  overall_recall = np.sum(hits[x] for x in entity_types)/np.sum(hits[x + '_total'] + hits[x + '_missing'] for x in entity_types)


  0%|          | 0/40143 [00:00<?, ?it/s]

  0%|          | 0/40143 [00:00<?, ?it/s]

  0%|          | 0/40143 [00:00<?, ?it/s]

  0%|          | 0/40143 [00:00<?, ?it/s]

  0%|          | 0/40143 [00:00<?, ?it/s]

Unnamed: 0,Semantic Type,recall@1,recall@2,recall@4,recall@8,recall@16,recall@32,Spans Matched,Missing,Total,Missing Proportion
0,Organization,0.4858,0.5911,0.7085,0.753,0.753,0.7571,247,135,382,0.3534
1,Chemical,0.4488,0.5911,0.7041,0.7713,0.7957,0.8024,5082,2325,7407,0.3139
2,Eukaryote,0.4828,0.7129,0.8328,0.8629,0.8681,0.8698,1160,590,1750,0.3371
3,Finding,0.3941,0.5329,0.6331,0.7446,0.7631,0.7667,1946,1265,3211,0.394
4,Body Substance,0.2338,0.2857,0.7143,0.8182,0.8312,0.8312,154,58,212,0.2736
5,Virus,0.6356,0.7881,0.9068,0.9068,0.9153,0.9153,118,54,172,0.314
6,Medical Device,0.4294,0.5706,0.6893,0.791,0.7966,0.7966,177,178,355,0.5014
7,Injury or Poisoning,0.6432,0.7012,0.7676,0.7925,0.805,0.805,241,116,357,0.3249
8,Bacterium,0.6292,0.764,0.8315,0.8315,0.8315,0.8315,267,182,449,0.4053
9,Anatomical Structure,0.5779,0.7191,0.7929,0.8276,0.838,0.8393,2303,1470,3773,0.3896


In [7]:
ujson.loads('"Identification of APC2, a homologue of the START adenomatous polyposis coli tumour END"')

'Identification of APC2, a homologue of the START adenomatous polyposis coli tumour END'