# Mapping between concept tokens and human-readable names

## Get concept token

In [7]:
from gensim.models import KeyedVectors
import os, sys, json, requests, numpy as np

In [2]:
# Location of the BioConceptVec GloVe embedding file (.bin). It is not read
# anywhere in the visible part of this notebook.
YOUR_BIOCONCEPTVEC_PATH = '../embeddings/bioconceptvec_glove.bin'
# Location of the JSON file that is loaded into `concept_vectors` below.
YOUR_JSON_PATH = '../embeddings/concept_glove.json'

In [3]:
# Load the concept dictionary (keyed by concept token) from disk.
# JSON text is UTF-8, so pin the encoding instead of relying on the
# platform default, which differs between operating systems.
with open(YOUR_JSON_PATH, encoding="utf-8") as json_file:
    concept_vectors = json.load(json_file)
print('load', len(concept_vectors), 'concepts')

load 402712 concepts


In [4]:
list(concept_vectors.keys())[:10]

['Disease_MESH_D001845',
 'Gene_2799940',
 'Gene_3726_54751',
 'Disease_MESH_D000652',
 'ProteinMutation_p_R450H_RS_189261858',
 'ProteinMutation_p_I123V',
 'SNP_rs11944405',
 'Chemical_MESH_C511970',
 'ProteinMutation_p_Q106E',
 'Gene_56941_116255']

## What are all the different token types?

In [5]:
import re

def extract_concepts(concept_keys):
    """Return the distinct concept-type prefixes found in *concept_keys*.

    A key's type is the run of ASCII letters that precedes its first
    underscore, e.g. ``"Disease_MESH_D001845"`` -> ``"Disease"``. Keys that
    do not start with letters followed by an underscore are ignored.

    Args:
        concept_keys: Iterable of concept token strings.

    Returns:
        Alphabetically sorted list of the unique type prefixes. Sorting makes
        the result reproducible; the iteration order of a set of strings can
        change from one interpreter session to the next.
    """
    # Compile once instead of looking the pattern up for every key.
    type_prefix = re.compile(r"([a-zA-Z]+)_")
    matches = (type_prefix.match(key) for key in concept_keys)
    return sorted({found.group(1) for found in matches if found})

# concept_vectors is assumed to be a dict keyed by concept token
concept_keys = [*concept_vectors]
concept_types = extract_concepts(concept_keys)

print(concept_types)

['DomainMotif', 'CellLine', 'SNP', 'DNAMutation', 'Disease', 'ProteinMutation', 'Chemical', 'Gene', 'Species']


## What are examples of each

In [None]:
def find_examples(concept_vectors, concept_types):
    """Pick one example key for each concept type.

    Keys are visited in the dictionary's iteration order; the first key that
    starts with a given type string becomes that type's example.

    Args:
        concept_vectors: Dict whose keys are concept token strings.
        concept_types: Iterable of type prefixes such as ``"Gene"``.

    Returns:
        Dict mapping each type that has at least one matching key to the
        first such key. Types without any matching key are omitted.
    """
    examples = {}
    # Types still lacking an example (de-duplicated, original order kept).
    pending = list(dict.fromkeys(concept_types))

    for key in concept_vectors:
        # Stop as soon as every type is resolved rather than scanning the
        # remaining keys of a potentially very large dictionary.
        if not pending:
            break
        unresolved = []
        for concept in pending:
            if key.startswith(concept):
                examples[concept] = key
            else:
                unresolved.append(concept)
        pending = unresolved

    return examples

# Get examples
examples = find_examples(concept_vectors, concept_types)
print(examples)

{'Disease': 'Disease_MESH_D001845', 'Gene': 'Gene_2799940', 'ProteinMutation': 'ProteinMutation_p_R450H_RS_189261858', 'SNP': 'SNP_rs11944405', 'Chemical': 'Chemical_MESH_C511970', 'DNAMutation': 'DNAMutation_c_1324delC_CorrespondingGene_6535', 'Species': 'Species_7461', 'CellLine': 'CellLine_CVCL_K990', 'DomainMotif': 'DomainMotif_Focus_9606_7003_7004'}


In [None]:
list(examples.values())

['Disease_MESH_D001845',
 'Gene_2799940',
 'ProteinMutation_p_R450H_RS_189261858',
 'SNP_rs11944405',
 'Chemical_MESH_C511970',
 'DNAMutation_c_1324delC_CorrespondingGene_6535',
 'Species_7461',
 'CellLine_CVCL_K990',
 'DomainMotif_Focus_9606_7003_7004']

## Create and test each type of token and make sure it works with an API

In [8]:
# Change it so that it can use the following Gene_2799940
def fetch_entrez_gene(id):
    """Look up the description of an Entrez Gene record via NCBI E-utilities.

    Args:
        id: Entrez Gene identifier, e.g. ``"2799940"``. Integers are accepted
            as well. (The name shadows the builtin but is kept so existing
            keyword callers continue to work.)

    Returns:
        The record's ``description`` string, or ``None`` when the request
        does not return HTTP 200 or the payload carries no description for
        this id.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # JSON object keys are always strings, so normalise the id before it is
    # used to index the response.
    gene_id = str(id)
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        "db": "gene",
        "id": gene_id,
        "retmode": "json",
    }

    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, params=params, timeout=30)
    # Exploratory output: raw status code and payload.
    print(response.status_code)
    print(response.text)
    if response.status_code != 200:
        return None

    # Tolerate payloads without a record (or description) for this id.
    record = response.json().get("result", {}).get(gene_id, {})
    return record.get("description")

# Test
print(fetch_entrez_gene("2799940"))

200
{"header":{"type":"esummary","version":"0.3"},"result":{"uids":["2799940"],"2799940":{"uid":"2799940","name":"M2","description":"matrix protein M2-1;matrix protein M2-2","status":2,"currentid":"","chromosome":"","geneticsource":"genomic","maplocation":"","otheraliases":"HmVgp5","otherdesignations":"matrix protein M2-1;matrix protein M2-2","nomenclaturesymbol":"","nomenclaturename":"","nomenclaturestatus":"","mim":[],"genomicinfo":[{"chrloc":"","chraccver":"NC_004148.2","chrstart":4710,"chrstop":5455,"exoncount":""}],"geneweight":"","summary":"","chrsort":"~~last","chrstart":999999999,"organism":{"scientificname":"Human metapneumovirus","commonname":"","taxid":162145},"locationhist":[]}}}

matrix protein M2-1;matrix protein M2-2


In [9]:
# Change it so that it can use the following Disease_MESH_D001845
def fetch_mesh_descriptor(id):
    """Resolve a MeSH identifier to a label using the NLM MeSH lookup API.

    Args:
        id: MeSH identifier such as ``"D001845"`` or ``"C511970"``.

    Returns:
        The first label in the JSON list returned by the service, or ``None``
        when the request does not return HTTP 200 or the list is empty.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://id.nlm.nih.gov/mesh/lookup/label"
    headers = {"Accept": "application/json"}

    # Passing the id through `params` lets requests URL-encode it; the timeout
    # keeps a stalled connection from blocking forever.
    response = requests.get(
        url, params={"resource": id}, headers=headers, timeout=30
    )
    if response.status_code != 200:
        return None

    labels = response.json()
    # An unknown id can yield an empty list; indexing it would raise.
    return labels[0] if labels else None

# Test
print(fetch_mesh_descriptor("D001845"))

Bone Cysts


In [10]:
# Change it so that it can use the following Species_7461
import requests
import xml.etree.ElementTree as ET

def fetch_ncbi_species(id):
    """Return the scientific name for an NCBI Taxonomy id.

    Args:
        id: NCBI Taxonomy identifier, e.g. ``"7461"``.

    Returns:
        Text of the ``ScientificName`` item in the ESummary XML response, or
        ``None`` when the request does not return HTTP 200 or the response
        contains no such item.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    response = requests.get(
        url, params={"db": "taxonomy", "id": id}, timeout=30
    )
    if response.status_code != 200:
        return None

    root = ET.fromstring(response.content)
    name_item = root.find('.//Item[@Name="ScientificName"]')
    # find() returns None when the item is absent; guard before reading .text.
    return name_item.text if name_item is not None else None

# Test
print(fetch_ncbi_species("7461"))

Apis cerana


In [None]:
# Change it so that it can use the following CellLine_CVCL_K990
def fetch_cellosaurus(id):
    """Fetch the Cellosaurus record for a cell-line accession.

    Args:
        id: Cellosaurus accession such as ``"CVCL_K990"``.

    Returns:
        The decoded JSON payload, or ``None`` when the request does not
        return HTTP 200.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = f"https://api.cellosaurus.org/cell-line/{id}"
    # The original passed an undefined `params` name here. Request JSON
    # explicitly so that response.json() below is always applicable.
    response = requests.get(url, params={"format": "json"}, timeout=30)
    # Exploratory output: raw payload.
    print(response.text)
    if response.status_code != 200:
        return None
    return response.json()

# Test
print(fetch_cellosaurus("CVCL_K990"))

{
  "Cellosaurus": {
    "cell-line-list": [
      {
        "accession-list": [
          {
            "type": "primary",
            "value": "CVCL_K990"
          }
        ],
        "age": "Age unspecified",
        "category": "Transformed cell line",
        "child-list": [],
        "comment-list": [
          {
            "category": "Derived from sampling site",
            "value": "Peripheral blood"
          },
          {
            "category": "Transformant",
            "cv-term": {
              "accession": "10376",
              "terminology": "NCBI-Taxonomy",
              "value": "Epstein-Barr virus (EBV)"
            }
          },
          {
            "category": "Population",
            "value": "Caucasian"
          },
          {
            "category": "Part of",
            "value": "ECACC randomly selected UK Caucasian blood donors cell line selection"
          }
        ],
        "created": "2013-05-06",
        "entry-version": "10",
        "la

In [3]:
import requests
# Change it so that it can use the following ProteinMutation_p_R450H_RS_189261858
def fetch_dbsnp(rs_id):
    """Fetch a RefSNP record from the NCBI Variation Services API.

    Args:
        rs_id: RefSNP number, e.g. ``"189261858"``. A leading ``rs`` (as in
            the token ``SNP_rs11944405``) is stripped before the request.

    Returns:
        The decoded JSON payload, or ``None`` when the request does not
        return HTTP 200.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    rs_number = str(rs_id)
    # The endpoint is addressed by the bare number, so drop an "rs" prefix.
    if rs_number[:2].lower() == "rs":
        rs_number = rs_number[2:]

    url = f"https://api.ncbi.nlm.nih.gov/variation/v0/beta/refsnp/{rs_number}"
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    # Exploratory output: status code only (the payload is very large).
    print(response.status_code)
    if response.status_code != 200:
        return None
    return response.json()

# Test
print(fetch_dbsnp("189261858"))

200
{'refsnp_id': '189261858', 'create_date': '2011-09-17T04:22Z', 'last_update_date': '2022-10-16T19:52Z', 'last_update_build_id': '156', 'dbsnp1_merges': [], 'citations': [11442002, 21714469, 22876533, 25741868], 'lost_obs_movements': [], 'present_obs_movements': [{'component_ids': [{'type': 'subsnp', 'value': '282082079'}], 'observation': {'seq_id': 'NC_000014.7', 'position': 80679503, 'deleted_sequence': 'G', 'inserted_sequence': 'G'}, 'allele_in_cur_release': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'G'}, 'other_rsids_in_cur_release': [], 'previous_release': {'allele': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'G'}, 'rsids': ['189261858']}, 'last_added_to_this_rs': '151'}, {'component_ids': [{'type': 'frequency', 'value': '1000Genomes.1:64835276'}, {'type': 'frequency', 'value': 'ExAC.1:1934953'}, {'type': 'frequency', 'value': 'GnomAD_exomes.2:10127007'}, {'type': 'frequency',

In [26]:
# Turn json object from fetch_dbsnp("189261858") into a python dictionary
def parse_dbsnp(dbsnp_json):
    """Reduce a dbSNP RefSNP payload to the fields this notebook keeps.

    Args:
        dbsnp_json: Decoded RefSNP payload (a dict), or a falsy value.

    Returns:
        ``None`` for a falsy payload, otherwise a one-entry dict.
    """
    if not dbsnp_json:
        return None

    # NOTE(review): the value stored under "chromosome" is the payload's
    # "refsnp_id", not a chromosome name - confirm the intended field.
    summary = {"chromosome": dbsnp_json["refsnp_id"]}
    # Not extracted yet (kept from the original as a reminder):
    #   "snps": dbsnp_json["primary_snapshot_data"]["placements_with_allele"][0]["alleles"]
    #   "gene": dbsnp_json["primary_snapshot_data"]["allele_annotations"][0]["assembly_annotation"]["genes"][0]["name"]
    return summary

# Json object from fetch_dbsnp("189261858")
parse_dbsnp(fetch_dbsnp("189261858"))

ConnectionError: HTTPSConnectionPool(host='api.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /variation/v0/beta/refsnp/189261858 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x10640ae80>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [27]:
json_ting = fetch_dbsnp("189261858")

ConnectionError: HTTPSConnectionPool(host='api.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /variation/v0/beta/refsnp/189261858 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x10640a6d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [23]:
json_ting

'thyroid stimulating hormone receptor'

In [28]:
# Change it so that it can use the following SNP_rs11944405
fetch_dbsnp('11944405')

ConnectionError: HTTPSConnectionPool(host='api.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /variation/v0/beta/refsnp/11944405 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x1067114c0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [17]:
# Change to use DNAMutation_c_1324delC_CorrespondingGene_6535
# Reuse fetch_entrez_gene()
print(fetch_entrez_gene("6535"))

200
{"header":{"type":"esummary","version":"0.3"},"result":{"uids":["6535"],"6535":{"uid":"6535","name":"SLC6A8","description":"solute carrier family 6 member 8","status":"","currentid":"","chromosome":"X","geneticsource":"genomic","maplocation":"Xq28","otheraliases":"CCDS1, CRT, CRT-1, CRT1, CRTR, CT1, CTR5","otherdesignations":"sodium- and chloride-dependent creatine transporter 1|creatine transporter 1|solute carrier family 6 (neurotransmitter transporter), member 8|solute carrier family 6 (neurotransmitter transporter, creatine), member 8","nomenclaturesymbol":"SLC6A8","nomenclaturename":"solute carrier family 6 member 8","nomenclaturestatus":"Official","mim":["300036"],"genomicinfo":[{"chrloc":"X","chraccver":"NC_000023.11","chrstart":153687925,"chrstop":153696592,"exoncount":14}],"geneweight":8658,"summary":"The protein encoded by this gene is a plasma membrane protein whose function is to transport creatine into and out of cells. Defects in this gene can result in X-linked creat

In [20]:
# Change to use Chemical_MESH_C511970
print(fetch_mesh_descriptor("C511970"))

M58373


According to the PubTator Central documentation, a domain motif entity is defined as “a conserved part of a given protein sequence and structure that can evolve, function, and exist independently of the rest of the protein chain.”

In [34]:
def fetch_ncbi_taxonomy(taxonomy_id):
    """Return the scientific name for an NCBI Taxonomy id.

    Args:
        taxonomy_id: NCBI Taxonomy identifier, e.g. ``"9606"``.

    Returns:
        Text of the ``ScientificName`` item of the first ``DocSum`` in the
        ESummary XML response, or ``None`` when the request does not return
        HTTP 200 or the response contains no such item.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    response = requests.get(
        url, params={"db": "taxonomy", "id": taxonomy_id}, timeout=30
    )
    if response.status_code != 200:
        return None

    root = ET.fromstring(response.content)
    name_item = root.find("DocSum/Item[@Name='ScientificName']")
    # find() returns None when the item is absent; guard before reading .text.
    return name_item.text if name_item is not None else None

# Test
print(fetch_ncbi_taxonomy("9606"))

Homo sapiens


## Domain Motif
The following code is my attempt to find an API for DomainMotif_Focus_9606_7003_7004. I couldn't find one, so I'm skipping it for now.

In [35]:
def fetch_interpro(ipr_id):
    """Fetch the name metadata of an InterPro entry.

    Args:
        ipr_id: InterPro accession such as ``"IPR001849"``.

    Returns:
        The ``metadata.name`` value of the JSON payload, or ``None`` when the
        request does not return HTTP 200 or the payload lacks that field.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = f"https://www.ebi.ac.uk/interpro/api/entry/InterPro/{ipr_id}"
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)
    # Exploratory output: raw status code and payload.
    print(response.status_code)
    print(response.text)
    if response.status_code != 200:
        return None
    # Tolerate payloads that lack the expected structure.
    return response.json().get("metadata", {}).get("name")

# Test
print(fetch_interpro("IPR001849"))  # Example InterPro ID
print(fetch_interpro("IPR000719"))  # Example InterPro ID

200
{"metadata":{"accession":"IPR001849","entry_id":null,"type":"domain","go_terms":null,"source_database":"interpro","member_databases":{"pfam":{"PF16457":"Pleckstrin homology domain","PF16652":"Pleckstrin homology domain","PF00169":"PH domain"},"profile":{"PS50003":"PH domain profile"},"smart":{"SM00233":"Pleckstrin homology domain."}},"integrated":null,"hierarchy":{"accession":"IPR001849","name":"Pleckstrin homology domain","type":"Domain","children":[{"accession":"IPR001605","name":"Pleckstrin homology domain, spectrin-type","type":"Domain","children":[]},{"accession":"IPR024774","name":"Pleckstrin homology domain, Mcp5-type","type":"Domain","children":[]},{"accession":"IPR033511","name":"Rho guanine nucleotide exchange factor Cdc24/Scd1, PH domain","type":"Domain","children":[]},{"accession":"IPR035534","name":"DBS, PH domain","type":"Domain","children":[]},{"accession":"IPR035939","name":"FGD1, N-terminal PH domain","type":"Domain","children":[]},{"accession":"IPR035941","name":"

In [37]:
def process_domain_motif_token(token):
    """Split a DomainMotif token and resolve its components.

    The token is split on underscores. The third field is looked up as an
    NCBI Taxonomy id and every later field is passed to ``fetch_interpro``.

    NOTE(review): in the notebook's test run the trailing ids of
    ``DomainMotif_Focus_9606_7003_7004`` were rejected by InterPro (HTTP
    404), so they are probably not InterPro accessions - confirm what they
    denote.

    Args:
        token: Token string such as ``"DomainMotif_Focus_9606_7003_7004"``.

    Returns:
        Dict with ``"species"`` (taxonomy lookup result) and ``"domains"``
        (list with one lookup result per trailing field).
    """
    fields = token.split("_")
    taxonomy_id = fields[2]
    candidate_ids = fields[3:]

    resolved = {"species": fetch_ncbi_taxonomy(taxonomy_id)}
    resolved["domains"] = list(map(fetch_interpro, candidate_ids))
    return resolved

# Test
print(process_domain_motif_token('DomainMotif_Focus_9606_7003_7004'))

404
{"Error":"the level '7003' is not a valid interpro level"}
404
{"Error":"the level '7004' is not a valid interpro level"}
{'species': 'Homo sapiens', 'domains': [None, None]}


# Cleaned up code for going through all concepts and making the mapping

In [None]:
# Change it so that it can use the following Gene_2799940
def fetch_entrez_gene(id):
    """Look up the description of an Entrez Gene record via NCBI E-utilities.

    Args:
        id: Entrez Gene identifier, e.g. ``"2799940"``. Integers are accepted
            as well. (The name shadows the builtin but is kept so existing
            keyword callers continue to work.)

    Returns:
        The record's ``description`` string, or ``None`` when the request
        does not return HTTP 200 or the payload carries no description for
        this id.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # JSON object keys are always strings, so normalise the id before it is
    # used to index the response.
    gene_id = str(id)
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        "db": "gene",
        "id": gene_id,
        "retmode": "json",
    }

    # A timeout keeps one stalled request from blocking a long mapping run.
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        return None

    # Tolerate payloads without a record (or description) for this id.
    record = response.json().get("result", {}).get(gene_id, {})
    return record.get("description")

# Test
print(fetch_entrez_gene("2799940"))
# matrix protein M2-1;matrix protein M2-2

# Change it so that it can use the following Disease_MESH_D001845
def fetch_mesh_descriptor(id):
    """Resolve a MeSH identifier to a label using the NLM MeSH lookup API.

    Args:
        id: MeSH identifier such as ``"D001845"``.

    Returns:
        The first label in the JSON list returned by the service, or ``None``
        when the request does not return HTTP 200 or the list is empty.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://id.nlm.nih.gov/mesh/lookup/label"
    headers = {"Accept": "application/json"}

    # Passing the id through `params` lets requests URL-encode it; the timeout
    # keeps one stalled request from blocking a long mapping run.
    response = requests.get(
        url, params={"resource": id}, headers=headers, timeout=30
    )
    if response.status_code != 200:
        return None

    labels = response.json()
    # An unknown id can yield an empty list; indexing it would raise.
    return labels[0] if labels else None

# Test
print(fetch_mesh_descriptor("D001845"))
# Bone Cysts

# Change it so that it can use the following Species_7461
import requests
import xml.etree.ElementTree as ET

def fetch_ncbi_species(id):
    """Return the scientific name for an NCBI Taxonomy id.

    Args:
        id: NCBI Taxonomy identifier, e.g. ``"7461"``.

    Returns:
        Text of the ``ScientificName`` item in the ESummary XML response, or
        ``None`` when the request does not return HTTP 200 or the response
        contains no such item.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    response = requests.get(
        url, params={"db": "taxonomy", "id": id}, timeout=30
    )
    if response.status_code != 200:
        return None

    root = ET.fromstring(response.content)
    name_item = root.find('.//Item[@Name="ScientificName"]')
    # find() returns None when the item is absent; guard before reading .text.
    return name_item.text if name_item is not None else None

# Test
print(fetch_ncbi_species("7461"))
# Apis cerana

# Change it so that it can use the following CellLine_CVCL_K990
def fetch_cellosaurus(id):
    """Fetch the Cellosaurus record for a cell-line accession.

    Args:
        id: Cellosaurus accession such as ``"CVCL_K990"``.

    Returns:
        The decoded JSON payload, or ``None`` when the request does not
        return HTTP 200.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = f"https://api.cellosaurus.org/cell-line/{id}"
    # The original passed an undefined `params` name here. Request JSON
    # explicitly so that response.json() below is always applicable.
    response = requests.get(url, params={"format": "json"}, timeout=30)
    if response.status_code != 200:
        return None
    return response.json()

# Test
print(fetch_cellosaurus("CVCL_K990"))
# {'Cellosaurus': {'cell-line-list': [{'accession-list': [{'type': 'primary', 'value': 'CVCL_K990'}], 'age': 'Age unspecified', 'category': 'Transformed cell line', 'child-list': [], 'comment-list': [{'category': 'Derived from sampling site', 'value': 'Peripheral blood'}, {'category': 'Transformant', 'cv-term': {'accession': '10376', 'terminology': 'NCBI-Taxonomy', 'value': 'Epstein-Barr virus (EBV)'}}, {'category': 'Population', 'value': 'Caucasian'}, {'category': 'Part of', 'value': 'ECACC randomly selected UK Caucasian blood donors cell line selection'}], 'created': '2013-05-06', 'entry-version': '10', 'last-updated': '2023-03-21', 'name-list': [{'type': 'identifier', 'value': 'UKTS9050'}], 'sex': 'Sex unspecified', 'species-list': [{'accession': '9606', 'terminology': 'NCBI-Taxonomy', 'value': 'Homo sapiens (Human)'}], 'xref-list': [{'accession': '91030403', 'category': 'Cell line collections', 'database': 'ECACC', 'url': {'value': 'https://www.phe-culturecollections.org.uk/products/celllines/humanrandomcontrol/detail.jsp?refId=91030403&collection=ecacc_hrc'}}, {'accession': 'Q54990680', 'category': 'Encyclopedic resources', 'database': 'Wikidata', 'url': {'value': 'https://www.wikidata.org/wiki/Q54990680'}}]}]}}

# Change it so that it can use the following ProteinMutation_p_R450H_RS_189261858
def fetch_dbsnp(rs_id):
    """Fetch a RefSNP record from the NCBI Variation Services API.

    Args:
        rs_id: RefSNP number, e.g. ``"189261858"``. A leading ``rs`` (as in
            the token ``SNP_rs11944405``) is stripped before the request.

    Returns:
        The decoded JSON payload, or ``None`` when the request does not
        return HTTP 200.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    rs_number = str(rs_id)
    # The endpoint is addressed by the bare number, so drop an "rs" prefix.
    if rs_number[:2].lower() == "rs":
        rs_number = rs_number[2:]

    url = f"https://api.ncbi.nlm.nih.gov/variation/v0/beta/refsnp/{rs_number}"
    # A timeout keeps one stalled request from blocking a long mapping run.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return None
    return response.json()

# Test
print(fetch_dbsnp("189261858"))

# Change it so that it can use the following SNP_rs11944405
fetch_dbsnp('11944405')