In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import ujson
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")



from tqdm import tqdm
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from datasets import load_dataset

from bigbio.dataloader import BigBioConfigHelpers

sys.path.append('..')
from bigbio_utils import dataset_to_df, DATASET_NAMES, CUIS_TO_EXCLUDE, CUIS_TO_REMAP, VALIDATION_DOCUMENT_IDS, load_dataset_df


from bioel.ontology import BiomedicalOntology


conhelps = BigBioConfigHelpers()

ner_helpers = conhelps.filtered(
    lambda x:
        x.is_bigbio_schema
        and not x.is_local
        and "NAMED_ENTITY_DISAMBIGUATION" in x.tasks
        and not x.is_large
)


# Filter out all warnings


In [4]:
data = load_dataset("../../biomedical/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py", name="sourcedata_nlp_bigbio_kb")



Downloading data:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [8]:
tax_counts = defaultdict(set)
db_counts = defaultdict(int)

for split in ['train','validation','test']:
    for doc in data[split]:
        doi = doc['document_id']
        for e in doc['entities']:
            
            for x in e['normalized']:
                db_counts[x['db_name']] += 1
                if e['type'] == 'ORGANISM':
                    tax_counts[doi].add(x['db_id'])


In [9]:
db_counts

defaultdict(int,
            {'CL': 33276,
             'CHEBI': 64173,
             'BAO': 69170,
             'OBI': 23226,
             'PO': 1481,
             'NCBI': 44898,
             'GO': 39059,
             'NCBIGENE': 100691,
             'UNIPROT': 149643,
             'PUBCHEM': 10942,
             'UBERON': 34884,
             'CVCL': 27244,
             'MESH': 1618,
             'DOID': 5213,
             'RFAM': 175,
             'CORUM': 51})

In [20]:
s = pd.Series({key:len(val) for key, val in tax_counts.items()})
all_organisms = set.union(*tax_counts.values())
len(all_organisms)
org_list = list[all_organisms]

In [None]:
total_entities = 0
for split in data:
    for ent_list in data[split]['entities']:
        total_entities += len(ent_list)

total_entities

452520

In [None]:
df = load_dataset_df('sourcedata_nlp')
df['type'] = df['type'].map(lambda x: x[0])
df

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id
0,10.1002/eji.200323730,"[[1529, 1538]]",EBV‐B1.25,CELL_TYPE,[CL:0000236],train,10.1002/eji.200323730.1
1,10.1002/eji.200323730,"[[1666, 1675]]",EBV‐B1.11,CELL_TYPE,[CL:0000236],train,10.1002/eji.200323730.2
2,10.1002/eji.200323730,"[[1704, 1711]]",T cells,CELL_TYPE,[CL:0000084],train,10.1002/eji.200323730.3
3,10.1002/eji.200323730,"[[1820, 1829]]",EBV‐B1.11,CELL_TYPE,[CL:0000236],train,10.1002/eji.200323730.4
4,10.1002/eji.200323730,"[[1873, 1882]]",EBV‐B1.11,CELL_TYPE,[CL:0000236],train,10.1002/eji.200323730.5
...,...,...,...,...,...,...,...
389614,10.15252/msb.202211475,"[[4774, 4787]]",carbenicillin,SMALL_MOLECULE,[CHEBI:3393],train,10.15252/msb.202211475.18
389615,10.4161/auto.29468,"[[8664, 8667]]",LC3,GENEPROD,"[UNIPROT:Q9CQV6, UNIPROT:Q91VR7]",train,10.4161/auto.29468.1
389616,10.4161/auto.29468,"[[8861, 8865]]",Atg3,GENEPROD,[UNIPROT:Q9CPX6],train,10.4161/auto.29468.2
389617,10.4161/auto.29468,"[[9061, 9064]]",GL2,GENEPROD,[UNIPROT:P60521],train,10.4161/auto.29468.3


In [None]:
df.split.value_counts()

split
train         307991
validation     44162
test           37466
Name: count, dtype: int64

In [None]:
entity_totals= df.groupby('split')['type'].value_counts()#.unstack('split')


In [None]:
entity_type_totals = df.groupby(['type']).agg({'db_ids': lambda x: len(set([z for y in x for z in y]))}).rename(columns={'db_ids':"total"})
entity_type_totals['total'] = entity_type_totals['total'].map(lambda x: format(x, ','))
entity_type_totals

Unnamed: 0_level_0,total
type,Unnamed: 1_level_1
CELL_LINE,732
CELL_TYPE,450
DISEASE,489
EXP_ASSAY,623
GENEPROD,15185
ORGANISM,431
SMALL_MOLECULE,3663
SUBCELLULAR,754
TISSUE,1195


In [None]:

unique_entities = df.groupby(['split','type']).agg({'db_ids': lambda x: len(set([z for y in x for z in y]))}).rename(columns={'db_ids':"unique_ents"})#.unstack('split')
unique_entities

Unnamed: 0_level_0,Unnamed: 1_level_0,unique_ents
split,type,Unnamed: 2_level_1
test,CELL_LINE,134
test,CELL_TYPE,171
test,DISEASE,107
test,EXP_ASSAY,275
test,GENEPROD,2620
test,ORGANISM,88
test,SMALL_MOLECULE,732
test,SUBCELLULAR,237
test,TISSUE,332
train,CELL_LINE,643


In [None]:
stats = pd.concat([entity_totals, unique_entities], axis=1)
stats['percent'] = 100 * (stats['unique_ents']/stats['count']).round(3)
for col in ['unique_ents','count']:
    stats[col] = stats[col].map(lambda x: format(x, ','))
stats = stats.unstack('split').swaplevel(axis=1)

In [None]:
output_stats.type.unique()

array(['CELL_LINE', 'CELL_TYPE', 'DISEASE', 'EXP_ASSAY', 'GENEPROD',
       'ORGANISM', 'SMALL_MOLECULE', 'SUBCELLULAR', 'TISSUE'],
      dtype=object)

In [None]:
output_stats = pd.concat([stats[split] for split in df['split'].unique()] + [entity_type_totals], axis=1).reset_index()
output_stats.index = [''] * output_stats.shape[0]
output_stats['type'] = output_stats['type'].map({'CELL_LINE': "Cell Line", 'CELL_TYPE': "Cell Type", 'DISEASE': "Disease", 'EXP_ASSAY': "Exp. assay", 'GENEPROD': "Gene products",
       'ORGANISM':'Organism', 'SMALL_MOLECULE':"Small Mol.", 'SUBCELLULAR':"Subcellular", 'TISSUE':"Tissue"})
print(output_stats.round(1).to_latex(escape=False,float_format=lambda x: f'{x:.1f}'))


\begin{tabular}{llllrllrllrl}
\toprule
 & type & count & unique_ents & percent & count & unique_ents & percent & count & unique_ents & percent & total \\
\midrule
 & Cell Line & 15,383 & 643 & 4.2 & 1,712 & 134 & 7.8 & 2,325 & 164 & 7.1 & 732 \\
 & Cell Type & 17,621 & 409 & 2.3 & 2,198 & 171 & 7.8 & 2,490 & 165 & 6.6 & 450 \\
 & Disease & 4,577 & 419 & 9.2 & 529 & 107 & 20.2 & 581 & 99 & 17.0 & 489 \\
 & Exp. assay & 59,926 & 580 & 1.0 & 7,118 & 275 & 3.9 & 7,995 & 263 & 3.3 & 623 \\
 & Gene products & 102,099 & 12,902 & 12.6 & 12,673 & 2,620 & 20.7 & 15,183 & 2,837 & 18.7 & 15,185 \\
 & Organism & 24,192 & 366 & 1.5 & 3,287 & 88 & 2.7 & 3,230 & 115 & 3.6 & 431 \\
 & Small Mol. & 43,827 & 3,176 & 7.2 & 4,656 & 732 & 15.7 & 6,069 & 822 & 13.5 & 3,663 \\
 & Subcellular & 20,312 & 671 & 3.3 & 2,689 & 237 & 8.8 & 3,104 & 244 & 7.9 & 754 \\
 & Tissue & 20,054 & 1,045 & 5.2 & 2,604 & 332 & 12.7 & 3,185 & 359 & 11.3 & 1,195 \\
\bottomrule
\end{tabular}



In [None]:
output_stats

Unnamed: 0_level_0,count,unique_ents,percent,count,unique_ents,percent,count,unique_ents,percent,total
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CELL_LINE,15383,643,4.2,1712,134,7.8,2325,164,7.1,732
CELL_TYPE,17621,409,2.3,2198,171,7.8,2490,165,6.6,450
DISEASE,4577,419,9.2,529,107,20.2,581,99,17.0,489
EXP_ASSAY,59926,580,1.0,7118,275,3.9,7995,263,3.3,623
GENEPROD,102099,12902,12.6,12673,2620,20.7,15183,2837,18.7,15185
ORGANISM,24192,366,1.5,3287,88,2.7,3230,115,3.6,431
SMALL_MOLECULE,43827,3176,7.2,4656,732,15.7,6069,822,13.5,3663
SUBCELLULAR,20312,671,3.3,2689,237,8.8,3104,244,7.9,754
TISSUE,20054,1045,5.2,2604,332,12.7,3185,359,11.3,1195


In [None]:
format(123445, ',')

'123,445'

In [None]:
for split in ['train','test','validation']:
    print(split)
    display(len(set([y for x in df.loc[df.split==split, 'db_ids']for y in x] )))
    for ent_type in ['CELL_TYPE', 'GENEPROD', 'SUBCELLULAR', 'SMALL_MOLECULE','EXP_ASSAY', 'CELL_LINE', 'TISSUE', 'ORGANISM', 'DISEASE']:
        

20204

4696

5067

In [None]:
test_cases = [
        # {'filepath': 'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo',
        # 'name': 'disease ontology',
        # 'prefix_to_keep': None,
        # 'entity_type': 'DISEASE',
        # 'abbrev': 'DOID'},
        # {'filepath': 'http://purl.obolibrary.org/obo/cl.obo',
        # 'name': 'cell ontology',
        # 'prefix_to_keep': 'CL',
        # 'entity_type': 'CELL_TYPE',
        # 'abbrev': 'CL'},
        {'filepath': 'http://purl.obolibrary.org/obo/uberon.obo',
        'name': 'uberon',
        'prefix_to_keep': 'UBERON',
        'entity_type': 'TISSUE',
        'abbrev': 'UBERON'},
        # {'filepath': 'http://purl.obolibrary.org/obo/obi.obo',
        # 'name': 'ontology of biological investigations',
        # 'prefix_to_keep': 'OBI',
        # 'entity_type': 'EXP_ASSAY',
        # 'abbrev': 'OBI'},
        # {'filepath': 'https://ftp.expasy.org/databases/cellosaurus/cellosaurus.obo',
        # 'name': 'cellosaurus',
        # 'prefix_to_keep': None,
        # 'entity_type': 'CELL_LINE',
        # 'abbrev': 'CVCL'},
        # {'filepath': 'http://purl.obolibrary.org/obo/go.obo',
        # 'name': 'gene ontology',
        # 'prefix_to_keep': None,
        # 'entity_type': 'SUBCELLULAR',
        # 'abbrev': 'GO'},
        # {'filepath': 'http://purl.obolibrary.org/obo/po.obo',
        # 'name': 'plant ontology',
        # 'prefix_to_keep': "PO",
        # 'entity_type': 'TISSUE',
        # 'abbrev': 'PO'},
        # {'filepath': 'http://purl.obolibrary.org/obo/chebi.obo',
        # 'name': 'ChEBI',
        # 'prefix_to_keep': None,
        # 'entity_type': 'SMALL_MOLECULE',
        # 'abbrev': 'CHEBI'},
        ]

for obo_dict in tqdm(test_cases):
    ontology = BiomedicalOntology.load_obo(**obo_dict)
    display(ontology.entities[:10])
    aliases = ontology.get_aliases()
    with open(os.path.join('..','data','soda_aliases',obo_dict['entity_type']), 'w') as f:
        for key, val in tqdm(aliases.items()):
            for a in val:
                f.write(f"{key}||{a}\n")

                 
                     
            
    
    

  0%|          | 0/1 [00:00<?, ?it/s][2024-04-06 22:06:08] [ontology.py] [INFO] Reading OBO ontology from http://purl.obolibrary.org/obo/uberon.obo


100%|██████████| 25100/25100 [00:00<00:00, 353010.91it/s]


[BiomedicalEntity(cui='UBERON:0000000', name='processual entity', types=['TISSUE'], aliases=[], definition='An occurrent [span:Occurrent] that exists in time by occurring or happening, has temporal parts and always involves and depends on some entity.', equivalant_cuis=None, taxonomy=None, metadata=None),
 BiomedicalEntity(cui='UBERON:0000002', name='uterine cervix', types=['TISSUE'], aliases=['canalis cervicis uteri', 'caudal segment of uterus', 'cervical canal', 'cervical canal of uterus', 'cervix', 'cervix of uterus', 'cervix uteri', 'neck of uterus', 'uterine cervix'], definition='Lower, narrow portion of the uterus where it joins with the top end of the vagina.', equivalant_cuis=None, taxonomy=None, metadata=None),
 BiomedicalEntity(cui='UBERON:0000003', name='naris', types=['TISSUE'], aliases=[], definition='Orifice of the olfactory system. The naris is the route by which odorants enter the olfactory system[MAH].', equivalant_cuis=None, taxonomy=None, metadata=None),
 BiomedicalE

100%|██████████| 14480/14480 [00:00<00:00, 641833.79it/s]
100%|██████████| 1/1 [00:06<00:00,  6.01s/it]


In [None]:
corum_ents = []
for doc in data['train']['entities']:
    for x in doc:
        for y in x['normalized']:
            if 'corum' in y['db_name'].lower():
                corum_ents.append(x)
                continue

[{'id': '312824',
  'type': 'SUBCELLULAR',
  'text': ['MRN'],
  'offsets': [[4559, 4562]],
  'normalized': [{'db_name': 'Corum', 'db_id': '71'}]},
 {'id': '312825',
  'type': 'SUBCELLULAR',
  'text': ['9-1-1'],
  'offsets': [[4579, 4584]],
  'normalized': [{'db_name': 'Corum', 'db_id': '267'}]},
 {'id': '312829',
  'type': 'SUBCELLULAR',
  'text': ['MRN'],
  'offsets': [[4799, 4802]],
  'normalized': [{'db_name': 'Corum', 'db_id': '71'}]},
 {'id': '312830',
  'type': 'SUBCELLULAR',
  'text': ['9-1-1'],
  'offsets': [[4819, 4824]],
  'normalized': [{'db_name': 'Corum', 'db_id': '267'}]},
 {'id': '325230',
  'type': 'SUBCELLULAR',
  'text': ['GSEC'],
  'offsets': [[1407, 1411]],
  'normalized': [{'db_name': 'Corum', 'db_id': '5871'}]},
 {'id': '325242',
  'type': 'SUBCELLULAR',
  'text': ['GSECs'],
  'offsets': [[1561, 1566]],
  'normalized': [{'db_name': 'Corum', 'db_id': '5871'}]},
 {'id': '325247',
  'type': 'SUBCELLULAR',
  'text': ['GSEC'],
  'offsets': [[1679, 1683]],
  'normalized

In [None]:
from transformers import (
    AutoTokenizer, 
    AutoModel, 
)

In [None]:
# AutoModel.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext", use_fast=True, do_lower_case = False)

PreTrainedTokenizerFast(name_or_path='microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

# Process BAO ontology

In [None]:
df = pd.read_csv('../data/soda_ontologies/BAO.csv')
df.columns = [x.lower() for x in df.columns]
bao = df[df['class id'].map(lambda x: "BAO" in x)]

bao['curie'] = bao['class id'].map(lambda x: x.split("#")[-1])
bao = bao.set_index('curie')

In [None]:
cols = []
for col in [x.lower() for x in bao.columns.tolist()]:
    for x in ['label','alias','synonym','definition','description',"term"]:
        if x in col:
            cols.append(col)

            print(col)
            display(df.loc[~df[col].isnull(), ["preferred label", col]].head())
            print('\n\n')

cols

In [None]:
synonym_columns = ['preferred label',
 'synonyms',
 'alternative term',
 'editor preferred term',
 'has exact synonym',
 'has_broad_synonym',
 'has_narrow_synonym',
 'has_related_synonym',
 'iedb alternative term',
 'label',
 ]

definition_cols = [
 'definitions',
 'alt_definition',
 'external_definition',
 'textual definition'
]


bao_synonyms = bao[synonym_columns].apply(lambda x: '|'.join([y for y in x if not pd.isna(y)]), axis=1)

# with open(os.path.join('..','data','soda_aliases','EXP_ASSAY.txt'), 'a') as f:
#     for key, val in bao_synonyms.map(lambda x: list(set(x.split('|')))).to_dict().items():
#         for v in val:
#             f.write(f"{key}||{v}\n")



# Process NCBI Taxonomy

In [None]:
ncbi = pd.read_csv('../data/ncbi_taxonomy_names.txt', delimiter='|', na_filter=False, index_col=False, names=['tax_id','name','unique_name','name_class'])
for col in ncbi.columns:
    if ncbi[col].dtype not in [int, float]:
        ncbi[col] = ncbi[col].map(lambda x: x.strip())
        
ncbi_scientific_names = ncbi.query('name_class == "scientific name"')
tax2name = ncbi_scientific_names.set_index('tax_id')['name'].to_dict()

with open('../data/tax2name.json','w') as f:
    f.write(ujson.dumps(tax2name, indent=2))

# Keep only name classes that encode useful information about aliases
ncbi = ncbi[ncbi.name_class.isin(['synonym','scientific name','blast name','equivalent name','genbank common name','common name','acronym','genbank acronym'])]
ncbi['tax_id'] = ncbi['tax_id'].map(lambda x: f'NCBI:{x}')

ncbi2alias = ncbi.groupby('tax_id').agg({'name':lambda x: list(set(x))}).to_dict()['name']

with open('../data/ncbi_to_alias.json','w') as f:
    f.write(ujson.dumps(ncbi2alias))

In [None]:
with open(os.path.join('..','data','soda_aliases','ORGANISM.txt'), 'w') as f:
    for key, val in ncbi2alias.items():
        for v in val:
            f.write(f"{key}||{v}\n")

# Entrez (NCBI Gene)

In [21]:
ontology = BiomedicalOntology.load_ncbi_gene(taxa=org_list)
len(ontology)

[2024-04-17 23:31:08] [ontology.py] [INFO] Loading NCBI Gene (Entrez).  This ontology is large and may take a few minutes.
[2024-04-17 23:35:24] [ontology.py] [INFO] Filtering to desired taxa and gene types


TypeError: 'types.GenericAlias' object is not iterable

In [None]:
aliases = ontology.get_aliases()
with open(os.path.join('..','data','soda_aliases',"GENEPROD.txt"), 'w') as f:
    for key, val in tqdm(aliases.items()):
        for a in val:
            f.write(f"{key}||{a}\n")

100%|██████████| 45510673/45510673 [00:53<00:00, 849525.96it/s]
