In [2]:
import obonet
import ujson
import sys
import logging
import pandas as pd

from tqdm.auto import tqdm
from collections import defaultdict
from bigbio.dataloader import BigBioConfigHelpers


sys.path.insert(0, '..')
from umls_utils import UmlsMappings
from bigbio_utils import dataset_to_df



logger = logging.getLogger()
logger.setLevel(logging.INFO)

tqdm.pandas()
pd.set_option('display.max_rows', 200)

conhelps = BigBioConfigHelpers()

%load_ext autoreload
%autoreload 2

In [3]:
data = conhelps.for_config_name(f"nlm_gene_bigbio_kb").load_dataset(from_hub=False)
df = dataset_to_df(data)
all_nlmgene_entities = set([])
for normalizations in df['db_ids']:
    if any([x for x in normalizations if x.endswith(':') ]):
        print(normalizations)
    elif any([x for x in normalizations if ',' in x]):
        print(normalizations)

    all_nlmgene_entities.update([x for x in normalizations])


Downloading and preparing dataset nlm_gene/nlm_gene_bigbio_kb to /home/dkartchner3/.cache/huggingface/datasets/nlm_gene/nlm_gene_bigbio_kb/1.0.0/92249f0c0c401d2d902c7a0af6d76ca2e535383a56c0c4d4099036c7c0dc9581...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nlm_gene downloaded and prepared to /home/dkartchner3/.cache/huggingface/datasets/nlm_gene/nlm_gene_bigbio_kb/1.0.0/92249f0c0c401d2d902c7a0af6d76ca2e535383a56c0c4d4099036c7c0dc9581. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def extract_synonyms(data):
    if 'synonym' in data:
        return [syn.split('"')[1]for syn in data['synonym']]
    else:
        # print(data)
        return []
        
def term_to_synonyms(graph):
    node_dict = {}
    for curie, data in tqdm(graph.nodes(data=True)):
        synonyms = [data['name']] + extract_synonyms(data)
        node_dict[curie] = synonyms

    return node_dict


In [5]:

url = 'http://purl.obolibrary.org/obo/chebi.obo'
graph = obonet.read_obo(url)
synonyms = term_to_synonyms(graph)

  0%|          | 0/163657 [00:00<?, ?it/s]

In [None]:
# for curie, data in enumerate(tqdm(graph.nodes(data=True))):
#     if curie < 10:
#         print(data)

## Get MeSH synonyms

In [7]:
# # 2022 UMLS
umls = UmlsMappings(umls_dir='/mitchell/entity-linking/2022AA/META/', debug=False, force_reprocess=False)

# 2017 UMLS
# umls_2017 = UmlsMappings(umls_dir='/mitchell/entity-linking/2022AA/META/', debug=False, force_reprocess=False)

Loading cached UMLS data from /mitchell/entity-linking/2022AA/META/.cached_df.feather


In [5]:
umls.umls.columns

Index(['cui', 'lang', 'scui', 'sdui', 'sab', 'alias', 'tui', 'group'], dtype='object')

In [4]:
mesh_to_alias = umls.get_aliases(ontologies_to_include=['MSH'],
                        use_umls_curies=False,
                        mapping_cols={'MSH':'sdui'},
                        prefixes={"MSH":"MESH"},
                        lowercase=True,
                        )

alias_to_mesh = umls.get_aliases(ontologies_to_include=['MSH'],
                        use_umls_curies=False,
                        mapping_cols={'MSH':'sdui'},
                        prefixes={"MSH":"MESH"},
                        reverse=True,
                        lowercase=True,
                        )


mesh_to_alias_chem_only = umls.get_aliases(ontologies_to_include=['MSH'],
                        groups_to_include=['CHEM'],
                        use_umls_curies=False,
                        mapping_cols={'MSH':'sdui'},
                        prefixes={"MSH":"MESH"},
                        lowercase=True,
                        )

In [8]:
with open('../data/mesh_to_alias.txt', 'w') as f:
    f.write('\n'.join([curie + '||' + alias for curie, alias_list in mesh_to_alias.items() for alias in alias_list]))

with open('../data/mesh_to_alias_chem_only.txt', 'w') as f:
    f.write('\n'.join([curie + '||' + alias for curie, alias_list in mesh_to_alias_chem_only.items() for alias in alias_list]))

with open('../data/alias_to_mesh.txt', 'w') as f:
    f.write('\n'.join([alias + '||' + '|'.join(curie) for alias, curie in alias_to_mesh.items()]))

## Get OMIM Aliases

In [9]:
# OMIM
omim_to_alias = umls.get_aliases(ontologies_to_include=['OMIM'],
                        use_umls_curies=False,
                        mapping_cols={'OMIM':'sdui'},
                        prefixes={"OMIM":"OMIM"},
                        lowercase=True,
                        )

alias_to_omim = umls.get_aliases(ontologies_to_include=['OMIM'],
                        use_umls_curies=False,
                        mapping_cols={'OMIM':'sdui'},
                        prefixes={"OMIM":"OMIM"},
                        reverse=True,
                        lowercase=True,
                        )

                        
with open("../data/omim_to_alias.txt", "w") as f:
    f.write(
        "\n".join(
            [
                curie + "||" + alias
                for curie, alias_list in omim_to_alias.items()
                for alias in alias_list
            ]
        )
    )

with open('../data/alias_to_omim.txt', 'w') as f:
    f.write('\n'.join([alias + '||' + '|'.join(curie) for alias, curie in alias_to_omim.items()]))

## Combine MeSH and OMIM for NCBI-Disease

In [9]:
# # Combine MeSH and OMIM for NCBI-Disease corpus
# alias_to_mesh_and_omim = defaultdict(list)
# for alias, curie_list in alias_to_mesh.items():
#     alias_to_mesh_and_omim[alias] = curie_list

# for alias, curie_list in alias_to_omim.items():
#     alias_to_mesh_and_omim[alias].extend(curie_list)

mesh_and_omim_to_alias_disease_only = umls.get_aliases(ontologies_to_include=['MSH','OMIM'],
                        # groups_to_include=['DISO'],
                        use_umls_curies=False,
                        mapping_cols={'MSH':'sdui', 'OMIM': 'sdui'},
                        prefixes={"MSH":"MESH", 'OMIM':'OMIM'},
                        lowercase=True,
                        )

# mesh_and_omim_to_alias_disease_only = umls_2017.get_aliases(ontologies_to_include=['MSH','OMIM'],
#                         # groups_to_include=['DISO'],
#                         use_umls_curies=False,
#                         mapping_cols={'MSH':'sdui', 'OMIM': 'sdui'},
#                         prefixes={"MSH":"MESH", 'OMIM':'OMIM'},
#                         lowercase=True,
#                         )

# with open('../data/mesh_and_omim_to_alias_disease_only.txt', 'w') as f:
#     f.write('\n'.join([curie + '||' + alias for curie, alias_list in mesh_and_omim_to_alias_disease_only.items() for alias in alias_list]))

with open('../data/mesh_and_omim_to_alias.txt', 'w') as f:
    f.write('\n'.join([curie + '||' + alias for curie, alias_list in mesh_and_omim_to_alias_disease_only.items() for alias in alias_list]))

## UMLS Full Aliases

In [None]:
# Full UMLS
umls_to_alias = umls.get_aliases(ontologies_to_include='all',
                        use_umls_curies=True,
                        lowercase=True,
                        )

alias_to_umls = umls.get_aliases(ontologies_to_include='all',
                        use_umls_curies=True,
                        reverse=True,
                        lowercase=True,
                        )

with open("../data/umls_to_alias.txt", "w") as f:
    f.write(
        "\n".join(
            [
                curie + "||" + alias
                for curie, alias_list in umls_to_alias.items()
                for alias in alias_list
            ]
        )
    )

with open('../data/alias_to_umls.txt', 'w') as f:
    f.write('\n'.join([alias + '||' + '|'.join(curie) for alias, curie in alias_to_umls.items()]))

## UMLS ST21PV Aliases

In [4]:
# ST21PV
st21pv_vocabs = ['MSH','CPT','FMA','GO', 'HGNC','HPO','ICD10','ICD10CM','ICD9CM','MDR','MTH','NCBI','NCI','NDDF','MED-RT','OMIM','RXNORM','SNOMEDCT_US']
unique_vocabs = umls.umls.sab.unique()
for x in st21pv_vocabs:
    if x not in unique_vocabs:
        print(x)
    assert x in unique_vocabs

st21pv_types = ujson.load(open('../data/st21pv_subtypes.json','r'))


st21pv_to_alias = umls.get_aliases(ontologies_to_include=st21pv_vocabs,
                        types_to_include=st21pv_types,
                        use_umls_curies=True,
                        lowercase=True,
                        )

alias_to_st21pv = umls.get_aliases(ontologies_to_include=st21pv_vocabs,
                        types_to_include=st21pv_types,
                        use_umls_curies=True,
                        reverse=True,
                        lowercase=True,
                        )


with open("../data/st21pv_to_alias.txt", "w") as f:
    f.write(
        "\n".join(
            [
                curie + "||" + alias
                for curie, alias_list in st21pv_to_alias.items()
                for alias in alias_list
            ]
        )
    )

with open('../data/alias_to_st21pv.txt', 'w') as f:
    f.write('\n'.join([alias + '||' + '|'.join(curie) for alias, curie in alias_to_st21pv.items()]))

## Parse NCBI Taxonomy

In [2]:
ncbi = pd.read_csv('../data/ncbi_taxonomy_names.txt', delimiter='|', na_filter=False, index_col=False, names=['tax_id','name','unique_name','name_class'])
for col in ncbi.columns:
    if ncbi[col].dtype not in [int, float]:
        ncbi[col] = ncbi[col].map(lambda x: x.strip())
        
ncbi_scientific_names = ncbi.query('name_class == "scientific name"')
tax2name = ncbi_scientific_names.set_index('tax_id')['name'].to_dict()

with open('../data/tax2name.json','w') as f:
    f.write(ujson.dumps(tax2name, indent=2))



In [41]:
# Keep only name classes that encode useful information about aliases
ncbi = ncbi[ncbi.name_class.isin(['synonym','scientific name','blast name','equivalent name','genbank common name','common name','acronym','genbank acronym'])]
ncbi['tax_id'] = ncbi['tax_id'].map(lambda x: f'NCBI:{x}')

ncbi2alias = ncbi.groupby('tax_id').agg({'name':lambda x: list(set(x))}).to_dict()['name']

with open('../data/ncbi_to_alias.json','w') as f:
    f.write(ujson.dumps(ncbi2alias))


KeyboardInterrupt: 

In [None]:
ncbi.name_class.value_counts()


scientific name        2477980
authority               651869
synonym                 237259
type material           203828
includes                 71506
equivalent name          55937
genbank common name      30240
common name              14610
acronym                   2081
in-part                    678
blast name                 230
genbank acronym             25
Name: name_class, dtype: int64

## Parsem Entrez (NCBI-Gene) Ontology

In [3]:
# taxa_used = [9606, 10090, 10116, 559292, 7227, 3702, 6239, 7955, 8355, 9940]
entrez = pd.read_csv(
    "../data/gene_info.tsv",
    delimiter="\t",
    usecols=[
        "#tax_id",
        "GeneID",
        "Symbol",
        "Synonyms",
        "Symbol_from_nomenclature_authority",
        "Full_name_from_nomenclature_authority",
        "Other_designations",
        "type_of_gene",
        "description",
        "dbXrefs"
    ],
    na_filter=False,
    low_memory=False,
).rename(
    {
        "Symbol_from_nomenclature_authority": "official_symbol",
        "Full_name_from_nomenclature_authority": "official_name",
        '#tax_id':'tax_id'
    }, axis=1,
)
entrez.columns = [x.lower() for x in entrez.columns]
# entrez = entrez[entrez.tax_id.isin(taxa_used)]

entrez


Unnamed: 0,tax_id,geneid,symbol,synonyms,dbxrefs,description,type_of_gene,official_symbol,official_name,other_designations
0,7,5692769,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
1,9,2827857,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
2,11,10823747,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
3,14,6951813,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
4,19,3758873,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
...,...,...,...,...,...,...,...,...,...,...
41456246,2998809,2935004,OrniCt014,-,-,tRNA-Ser,tRNA,-,-,-
41456247,2998809,2935005,OrniCt017,-,-,tRNA-Phe,tRNA,-,-,-
41456248,2998809,3276630,rps12,-,-,ribosomal protein S12,protein-coding,-,-,ribosomal protein S12
41456249,2998809,4036376,OrniCt037,-,-,tRNA-Ile,tRNA,-,-,-


In [4]:
# Make DB xref dictionary
xref_df = entrez.query("dbxrefs != '-'")
xref_df = xref_df[['geneid','dbxrefs']]
xref_df['xref_list'] = xref_df.dbxrefs.map(lambda x: x.split('|'))
hgnc_omim_mask = xref_df.xref_list.map(lambda x: any([('MIM' in z.split(':')) or ('HGNC' in z.split(':')) for z in x]))



filtered = xref_df[hgnc_omim_mask]
filtered['filtered_xrefs'] = filtered.xref_list.map(lambda x: [':'.join(y.split(':')[-2:]) for y in x if (y.startswith('MIM') or y.startswith('HGNC'))])
filtered.geneid = filtered.geneid.map(lambda x: f'NCBIGene:{x}')
filtered = filtered.set_index("geneid")
entrez_to_hgnc_omim = filtered.filtered_xrefs.to_dict()

hgnc_omim_to_entrez = defaultdict(set)
for key, value in entrez_to_hgnc_omim.items():
    for x in value:
        hgnc_omim_to_entrez[x].add(key)

if max([len(x) for x in hgnc_omim_to_entrez.values()]) == 1:
    hgnc_omim_to_entrez = {key: list(val)[0] for key, val in hgnc_omim_to_entrez.items()}
else:
    hgnc_omim_to_entrez = {key: list(val) for key, val in hgnc_omim_to_entrez.items()}

# with open('../data/hgnc_omim2entrez.json','w') as f:
#     f.write(ujson.dumps(hgnc_omim_to_entrez))

# with open('../data/entrez2hgnc_omim.json','w') as f:
#     f.write(ujson.dumps(entrez_to_hgnc_omim))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['filtered_xrefs'] = filtered.xref_list.map(lambda x: [':'.join(y.split(':')[-2:]) for y in x if (y.startswith('MIM') or y.startswith('HGNC'))])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered.geneid = filtered.geneid.map(lambda x: f'NCBIGene:{x}')


In [4]:
# Unique entities in NLM-Gene or GNormPlus
nlm_gene_data = conhelps.for_config_name(f"nlm_gene_bigbio_kb").load_dataset(from_hub=False)
df = dataset_to_df(nlm_gene_data)
all_nlm_gene_entities = set([])
for doc_id, normalizations in df[['document_id','db_ids']].values:
    if any([x for x in normalizations if x.endswith(':') ]):
        print(doc_id, normalizations)
    elif any([x for x in normalizations if ',' in x]):
        print(normalizations)
    all_nlm_gene_entities.update([x for x in normalizations])

# Unique entities in GNormPlus
gnormplus_data = conhelps.for_config_name(f"gnormplus_bigbio_kb").load_dataset(from_hub=False)
df = dataset_to_df(gnormplus_data)
all_gnormplus_entities = set([])
for normalizations in df['db_ids']:
    if any([x for x in normalizations if x.endswith(')') ]):
        print(normalizations)
    elif any([x for x in normalizations if ',' in x]):
        print(normalizations)
    all_gnormplus_entities.update([x for x in normalizations])


Found cached dataset nlm_gene (/home/dkartchner3/.cache/huggingface/datasets/nlm_gene/nlm_gene_bigbio_kb/1.0.0/92249f0c0c401d2d902c7a0af6d76ca2e535383a56c0c4d4099036c7c0dc9581)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset gnormplus (/home/dkartchner3/.cache/huggingface/datasets/gnormplus/gnormplus_bigbio_kb/1.0.0/97a2714b58185305591c949b067cea2febfca2447016096c3d08021d84bf7b69)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Identify updated/discontinued entities in NLM-Gene
nlm_gene_ids = set([int(x.split(':')[-1]) for x in all_nlm_gene_entities])
nlm_gene_entity_mask = entrez.geneid.isin(nlm_gene_ids)
missing_nlm_gene_ids = nlm_gene_ids - set(entrez.loc[nlm_gene_entity_mask, 'geneid'].tolist())

# Find updated/discontinued entities in GNormPlus
gnormplus_ids = set([int(x.split(':')[-1]) for x in all_gnormplus_entities])
gnormplus_entity_mask = entrez.geneid.isin(gnormplus_ids)
missing_gnormplus_ids = gnormplus_ids - set(entrez.loc[gnormplus_entity_mask, 'geneid'].tolist())


# Mappings to correct missing ids
updated_nlm_gene_ids = {
    139: -1,
    502: 498,
    1651: 4137,
    7899: 392255,
    8097: 26353,
    22529: 108394,
    26916: 110253,
    38105: -1,
    38853: -1,
    47925: -1,
    55473: -1,
    98561: 59125,
    100934: 16826,
    113718: 284167,
    128039: -1,
    130505: -1,
    155841: -1,
    267370: -1,
    723788: 8412,
    3654564: -1,
    100911597: 25745,
}

updated_gnormplus_ids = {
    2087: -1,
    4197: 2122,
    4276: 100507436,
    6686: 123606,
    7330: 283556,
    3637603: -1,
    5692362: -1,
    7683036: -1,
    17494192: -1,
}

# Mappings to correct missing ids
updated_ids = {
    139: -1,
    502: 498,
    1651: 4137,
    2087: -1,
    7899: 392255,
    8097: 26353,
    22529: 108394,
    26916: 110253,
    38105: -1,
    38853: -1,
    47925: -1,
    55473: -1,
    98561: 59125,
    100934: 16826,
    113718: 284167,
    128039: -1,
    130505: -1,
    155841: -1,
    267370: -1,
    723788: 8412,
    3654564: -1,
    100911597: 25745,
    4197: 2122,
    4276: 100507436,
    6686: 123606,
    7330: 283556,
    3637603: -1,
    5692362: -1,
    7683036: -1,
    17494192: -1,
}

In [7]:
missing_gnormplus_ids

{2087, 4197, 4276, 6686, 7330, 3637603, 5692362, 7683036, 17494192}

In [6]:
# Fix missing ids
nlm_gene_ids |= set([x for x in updated_nlm_gene_ids.values() if x > 0])
nlm_gene_ids -= set(updated_nlm_gene_ids.keys())
all_nlm_gene_entities |= set([f'NCBIGene:{x}' for x in updated_nlm_gene_ids.values() if x > 0])
all_nlm_gene_entities -= set([f'NCBIGene:{x}' for x in updated_nlm_gene_ids.keys()])

gnormplus_ids |= set([x for x in updated_gnormplus_ids.values() if x > 0])
gnormplus_ids -= set(updated_gnormplus_ids.keys())
all_gnormplus_entities |= set([f'NCBIGene:{x}' for x in updated_gnormplus_ids.values() if x > 0])
all_gnormplus_entities -= set([f'NCBIGene:{x}' for x in updated_gnormplus_ids.keys()])

In [7]:
# Get subsets of taxonomy coresponding to each data subset
nlm_gene_entity_mask = entrez.geneid.isin(nlm_gene_ids)
nlm_gene_taxa = entrez[nlm_gene_entity_mask].tax_id.unique().tolist()
with open('../data/taxonomy_subsets/nlm_gene.json', 'w') as f:
    f.write(ujson.dumps(nlm_gene_taxa))


gnormplus_entity_mask = entrez.geneid.isin(gnormplus_ids)
gnormplus_taxa = entrez[gnormplus_entity_mask].tax_id.unique().tolist()
with open('../data/taxonomy_subsets/gnormplus.json', 'w') as f:
    f.write(ujson.dumps(gnormplus_taxa))

print(len(gnormplus_taxa))

22


In [44]:
# Filter to only include genes from the taxonomies linked to by NLM-Gene and GNormPlus
# Filter out genes of type "unkonwn" and "tRNA" because they have large numbers of non-unique gene symbols
# Also filter out "NEWENTRY" entities (for adding new genes) and hypothetical proteins
nlm_gene_taxa_mask = (entrez.tax_id.isin(nlm_gene_taxa)) & (~entrez.type_of_gene.isin(['unknown','tRNA','biological-region'])) & (entrez.description != 'hypothetical protein') & (~entrez.official_name.map(lambda x: x.lower().startswith("predicted"))) #& (entrez.symbol != 'NEWENTRY')
gnormplus_taxa_mask = (entrez.tax_id.isin(gnormplus_taxa)) & (~entrez.type_of_gene.isin(['unknown','tRNA', 'biological-region'])) & (entrez.description != 'hypothetical protein') & (~entrez.official_name.map(lambda x: x.lower().startswith("predicted"))) #& (entrez.symbol != 'NEWENTRY')

print(nlm_gene_taxa_mask.sum())
print(gnormplus_taxa_mask.sum())

704575
549545


In [None]:
# for mask in [nlm_gene_entity_mask, gnormplus_entity_mask]:
mask = nlm_gene_entity_mask | gnormplus_entity_mask
subset = entrez[mask]
display(pd.DataFrame(subset.type_of_gene.value_counts()))
display(subset[subset.type_of_gene=='unknown'])



all_dbs = [y for x in subset.dbxrefs.map(lambda x: [a for b in x.split('|') for a in b.split(':')] ) for y in x]
pd.Series(all_dbs).value_counts()[:30]

Unnamed: 0,type_of_gene
protein-coding,4416
ncRNA,103
other,22
pseudo,11
unknown,4
miscRNA,1


Unnamed: 0,tax_id,geneid,symbol,synonyms,dbxrefs,description,type_of_gene,official_symbol,official_name,other_designations
5884075,7227,252643,Si,-,FLYBASE:FBgn0003404|AllianceGenome:FB:FBgn0003404,Ski,unknown,-,-,-
10180658,9606,6022,RMD1,-,MIM:600332,rippling muscle disease 1,unknown,-,-,-
11987771,10090,98285,AI255230,-,MGI:MGI:2138180,expressed sequence AI255230,unknown,AI255230,expressed sequence AI255230,-
11993122,10090,114737,Iba1,-,MGI:MGI:2149992,induction of brown adipocytes 1,unknown,Iba1,induction of brown adipocytes 1,-


HGNC              7145
AllianceGenome    4250
Ensembl           3981
MGI               3299
MIM               2350
RGD                814
SGD                302
ZFIN               114
FB                 110
FLYBASE            110
miRBase             88
TAIR                80
Araport             80
WB                  58
WormBase            58
Xenbase             57
-                   57
EnsemblRapid        48
VGNC                28
IMGT/GENE-DB        13
CGNC                 9
BGD                  9
ECOCYC               9
ASAP                 9
3282                 4
2319                 4
3647                 4
2711                 4
3573                 4
2888                 4
dtype: int64

In [70]:
# # Unique entities in NLM-Gene or GNormPlus
# nlm_gene_data = conhelps.for_config_name(f"nlm_gene_bigbio_kb").load_dataset()
# df = dataset_to_df(nlm_gene_data)
# all_nlm_gene_entities = set([])
# for normalizations in df['db_ids']:
#     # if any([x for x in normalizations if x.endswith(':') ]):
#     #     print(normalizations)
#     # elif any([x for x in normalizations if ',' in x]):
#     #     print(normalizations)
#     all_nlm_gene_entities.update([x for x in normalizations])

# # Identify updated/discontinued entities in NLM-Gene
# nlm_gene_ids = set([int(x.split(':')[-1]) for x in all_nlm_gene_entities])
# nlm_gene_entity_mask = entrez.geneid.isin(nlm_gene_ids)
# missing_nlm_gene_ids = nlm_gene_ids - set(entrez.loc[nlm_gene_entity_mask, 'geneid'].tolist())

# Filter Entrez to entities used in dataset
nlm_gene_filtered = entrez[nlm_gene_taxa_mask]

# Find duplicated symbols
nlm_gene_symbols = nlm_gene_filtered.symbol.value_counts()
nlm_gene_duplicated_symbols = nlm_gene_symbols[nlm_gene_symbols > 1].index.tolist()


# Add additional canonical symbol for symbols that are repeated across different organisms
nlm_gene_duplicated_symbol_mask = nlm_gene_filtered.symbol.isin(nlm_gene_duplicated_symbols)

nlm_gene_filtered['canonical_symbol'] = nlm_gene_filtered['symbol']
nlm_gene_filtered.loc[nlm_gene_duplicated_symbol_mask, 'canonical_symbol'] = nlm_gene_filtered.loc[nlm_gene_duplicated_symbol_mask, ['tax_id','symbol']].apply(lambda x: f"{x[1]} ({tax2name[x[0]]})", axis=1)


# Complie list of all symbols
nlm_gene_filtered['all_symbols'] = nlm_gene_filtered[['symbol','synonyms','official_symbol','official_name','other_designations', 'canonical_symbol']].progress_apply(lambda x: '|'.join(list(set([i for i in x if i.strip() != '-']))), axis=1)
nlm_gene_filtered['geneid'] = nlm_gene_filtered.geneid.map(lambda x: f"NCBIGene:{x}")
nlm_gene_geneid2synonym = nlm_gene_filtered.set_index('geneid')['all_symbols'].to_dict()

with open("../data/entrez_to_alias_nlm_gene.json",'w') as f:
    f.write(ujson.dumps(nlm_gene_geneid2synonym))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nlm_gene_filtered['canonical_symbol'] = nlm_gene_filtered['symbol']


  0%|          | 0/704575 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nlm_gene_filtered['all_symbols'] = nlm_gene_filtered[['symbol','synonyms','official_symbol','official_name','other_designations', 'canonical_symbol']].progress_apply(lambda x: '|'.join(list(set([i for i in x if i.strip() != '-']))), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nlm_gene_filtered['geneid'] = nlm_gene_filtered.geneid.map(lambda x: f"NCBIGene:{x}")


In [71]:
# Filter Entrez to entities used in dataset
gnormplus_filtered = entrez[gnormplus_taxa_mask]

# Find duplicated symbols
gnormplus_symbols = gnormplus_filtered.symbol.value_counts()
gnormplus_duplicated_symbols = gnormplus_symbols[gnormplus_symbols > 1].index.tolist()


# Add additional canonical symbol for symbols that are repeated across different organisms
gnormplus_duplicated_symbol_mask = gnormplus_filtered.symbol.isin(gnormplus_duplicated_symbols)

gnormplus_filtered['canonical_symbol'] = gnormplus_filtered['symbol']
gnormplus_filtered.loc[gnormplus_duplicated_symbol_mask, 'canonical_symbol'] = gnormplus_filtered.loc[gnormplus_duplicated_symbol_mask, ['tax_id','symbol']].apply(lambda x: f"{x[1]} ({tax2name[x[0]]})", axis=1)


# Complie list of all symbols
gnormplus_filtered['all_symbols'] = gnormplus_filtered[['symbol','synonyms','official_symbol','official_name','other_designations', 'canonical_symbol']].progress_apply(lambda x: '|'.join(list(set([i for i in x if i.strip() != '-']))), axis=1)
gnormplus_filtered['geneid'] = gnormplus_filtered.geneid.map(lambda x: f"NCBIGene:{x}")
gnormplus_geneid2synonym = gnormplus_filtered.set_index('geneid')['all_symbols'].to_dict()

with open("../data/entrez_to_alias_gnormplus.json",'w') as f:
    f.write(ujson.dumps(gnormplus_geneid2synonym))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gnormplus_filtered['canonical_symbol'] = gnormplus_filtered['symbol']


  0%|          | 0/549545 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gnormplus_filtered['all_symbols'] = gnormplus_filtered[['symbol','synonyms','official_symbol','official_name','other_designations', 'canonical_symbol']].progress_apply(lambda x: '|'.join(list(set([i for i in x if i.strip() != '-']))), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gnormplus_filtered['geneid'] = gnormplus_filtered.geneid.map(lambda x: f"NCBIGene:{x}")


In [46]:
nlm_gene_filtered[['symbol','synonyms','official_symbol','official_name','other_designations','description','all_symbols']].sample(100)

Unnamed: 0,symbol,synonyms,official_symbol,official_name,other_designations,description,all_symbols
9078218,MRPS27,-,MRPS27,mitochondrial ribosomal protein S27,"28S ribosomal protein S27, mitochondrial",mitochondrial ribosomal protein S27,MRPS27|MRPS27|mitochondrial ribosomal protein ...
9077413,PLCH1,PLCD4|PLCL3,PLCH1,phospholipase C eta 1,"1-phosphatidylinositol 4,5-bisphosphate phosph...",phospholipase C eta 1,PLCH1|PLCD4|PLCL3|PLCH1|phospholipase C eta 1|...
8422001,pnn.L,drs|drsp|mema|pinin|pnn|pnn-a|pnn-b|sdk2-a|sdk3,pnn.L,"pinin, desmosome associated protein L homeolog","pinin, desmosome associated protein L homeolog...","pinin, desmosome associated protein L homeolog",pnn.L|drs|drsp|mema|pinin|pnn|pnn-a|pnn-b|sdk2...
11679835,LOC127490866,-,-,-,-,small nucleolar RNA SNORA32,LOC127490866
21575016,LOC9314147,-,-,-,tRNA (guanine(26)-N(2))-dimethyltransferase|pr...,tRNA (guanine(26)-N(2))-dimethyltransferase,LOC9314147|tRNA (guanine(26)-N(2))-dimethyltra...
10322912,MAPK8IP2,-,MAPK8IP2,mitogen-activated protein kinase 8 interacting...,C-Jun-amino-terminal kinase-interacting protein 2,mitogen-activated protein kinase 8 interacting...,MAPK8IP2|MAPK8IP2|mitogen-activated protein ki...
11985656,C030026M15Rik,-,C030026M15Rik,RIKEN cDNA C030026M15 gene,uncharacterized protein LOC77378,RIKEN cDNA C030026M15 gene,C030026M15Rik|C030026M15Rik|RIKEN cDNA C030026...
33158216,NOP6,-,-,-,Nop6p,Nop6p,NOP6|Nop6p
12071296,Tuba1a,Tuba1,Tuba1a,"tubulin, alpha 1A",tubulin alpha-1A chain|alpha-tubulin 1|tubulin...,"tubulin, alpha 1A","Tuba1a|Tuba1|Tuba1a|tubulin, alpha 1A|tubulin ..."
1723690,AT4G07917,-,-,-,-,pseudo,AT4G07917


In [None]:
alias_to_entrez = defaultdict(set)
for geneid, synonyms in tqdm(geneid_to_synonym.items()):
    syns = set([x.strip() for x in synonyms.split('|')])
    for s in syns:
        alias_to_entrez[s].add(geneid)



  0%|          | 0/431643 [00:00<?, ?it/s]

In [None]:
len(entrez.geneid.unique())

431643

In [None]:
with open('../data/alias_to_entrez.txt', 'w') as f:
    f.write('\n'.join([alias + '||' + '|'.join(str(curie)) for alias, curie in alias_to_entrez.items()]))

In [None]:
with open('../data/alias_to_entrez.txt', 'w') as f:
    f.write('\n'.join([str + '||' + '|'.join(str(curie)) for alias, curie in alias_to_entrez.items()]))

In [None]:
entrez_to_alias = defaultdict(set)
for alias, curie_set in alias_to_entrez.items():
    for curie in curie_set:
        entrez_to_alias[curie].add(alias)

with open('../data/entrez_to_alias.txt', 'w') as f:
    f.write('\n'.join(['ncbigene:' + str(curie) + '||' + alias for alias, curie_set in alias_to_entrez.items() for curie in curie_set]))



# max_len = 0
# for curie, alias_set in entrez_to_alias.items():
#     if len(alias_set) > max_len:
#         max_len = len(alias_set)
#         print(curie, alias_set)



In [None]:
print('Dmel\\CG9438')

Dmel\CG9438
