# Mapping between tokens and human readable

## Get concept token

In [2]:
from gensim.models import KeyedVectors
import os, sys, json, requests, numpy as np

In [3]:
YOUR_BIOCONCEPTVEC_PATH = '../embeddings/bioconceptvec_glove.bin'
YOUR_JSON_PATH = '../embeddings/concept_glove.json'

In [4]:
with open(YOUR_JSON_PATH) as json_file:  
    concept_vectors = json.load(json_file)
print('load', len(concept_vectors), 'concepts')

load 402712 concepts


In [5]:
list(concept_vectors.keys())[:10]

['Disease_MESH_D001845',
 'Gene_2799940',
 'Gene_3726_54751',
 'Disease_MESH_D000652',
 'ProteinMutation_p_R450H_RS_189261858',
 'ProteinMutation_p_I123V',
 'SNP_rs11944405',
 'Chemical_MESH_C511970',
 'ProteinMutation_p_Q106E',
 'Gene_56941_116255']

In [8]:
import re

disease_concepts = [concept for concept in concept_vectors.keys() if concept.startswith('Disease')]

prefixes = set()

for concept in disease_concepts:
    match = re.match(r'^(Disease_[^_]+)_', concept)
    match = re.match(r'^(Disease_[^_]+)_', concept)
    if match:
        prefix = match.group(1)
        prefixes.add(prefix)

print('Different prefixes for disease concepts:', prefixes)


Different prefixes for disease concepts: {'Disease_JAN', 'Disease_aphthovirus', 'Disease_loss', 'Disease_renal', 'Disease_absence', 'Disease_parapsilosis', 'Disease_liver', 'Disease_NKI', 'Disease_hypovirus', 'Disease_Deficiencies', 'Disease_MDM2', 'Disease_decreased', 'Disease_Tile', 'Disease_Methanogenic', 'Disease_arthritidis', 'Disease_dysregulation', 'Disease_involvement', 'Disease_hypomorphic', 'Disease_intravascular', 'Disease_Xenorhabdus', 'Disease_FMDV', 'Disease_ALI', 'Disease_tuberin', 'Disease_invasive', 'Disease_alkaliphilus', 'Disease_nematophilus', 'Disease_glomerulonephritis', 'Disease_turgidiscabies', 'Disease_0Ni0', 'Disease_TAT', 'Disease_IMD', 'Disease_Y3', 'Disease_Tau', 'Disease_Hafnia', 'Disease_Leishmania', 'Disease_Sp1', 'Disease_Tg', 'Disease_baboon', 'Disease_HMS1', 'Disease_M12', 'Disease_Idiomarina', 'Disease_EMD', 'Disease_kidney', 'Disease_Shewanella', 'Disease_TGM', 'Disease_abnormal', 'Disease_degradation', 'Disease_A', 'Disease_reduction', 'Disease_pun

In [14]:
deficiency_disease_concepts = [concept for concept in concept_vectors.keys() if concept.startswith('Disease_Methanogenic')]

print('Number of deficiency disease concepts:', len(deficiency_disease_concepts))
print('Some deficiency disease concepts:', deficiency_disease_concepts[:10])

Number of deficiency disease concepts: 1
Some deficiency disease concepts: ['Disease_Methanogenic_archaeon_CH1270']


## What are all the different token types?

In [5]:
import re

def extract_concepts(concept_keys):
    concept_types = set()

    for key in concept_keys:
        match = re.match(r"([a-zA-Z]+)_", key)
        if match:
            concept_types.add(match.group(1))

    return list(concept_types)

# Assuming concept_vectors is a dictionary
concept_keys = list(concept_vectors.keys())
concept_types = extract_concepts(concept_keys)

print(concept_types)

['Gene', 'Species', 'DomainMotif', 'CellLine', 'Disease', 'ProteinMutation', 'SNP', 'DNAMutation', 'Chemical']


## What are examples of each

In [6]:
def find_examples(concept_vectors, concept_types):
    examples = {}

    # Iterate over each item in the dictionary
    for key in concept_vectors.keys():
        # If the concept type of the current key is in our list, add it to our examples
        for concept in concept_types:
            if key.startswith(concept) and concept not in examples:
                examples[concept] = key

    return examples

# Get examples
examples = find_examples(concept_vectors, concept_types)
print(examples)

{'Disease': 'Disease_MESH_D001845', 'Gene': 'Gene_2799940', 'ProteinMutation': 'ProteinMutation_p_R450H_RS_189261858', 'SNP': 'SNP_rs11944405', 'Chemical': 'Chemical_MESH_C511970', 'DNAMutation': 'DNAMutation_c_1324delC_CorrespondingGene_6535', 'Species': 'Species_7461', 'CellLine': 'CellLine_CVCL_K990', 'DomainMotif': 'DomainMotif_Focus_9606_7003_7004'}


In [7]:
list(examples.values())

['Disease_MESH_D001845',
 'Gene_2799940',
 'ProteinMutation_p_R450H_RS_189261858',
 'SNP_rs11944405',
 'Chemical_MESH_C511970',
 'DNAMutation_c_1324delC_CorrespondingGene_6535',
 'Species_7461',
 'CellLine_CVCL_K990',
 'DomainMotif_Focus_9606_7003_7004']

## Create and test each type of token and make sure it works with an API

In [31]:
# Change it so that it can use the following Gene_2799940
def fetch_entrez_gene(id):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        "db": "gene",
        "id": id,
        "retmode": "json"
    }

    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()["result"][id]["description"]
    else:
        return None

# Test
print(fetch_entrez_gene("107831434"))

1-aminocyclopropane-1-carboxylate synthase


In [9]:
# Change it so that it can use the following Disease_MESH_D001845
def fetch_mesh_descriptor(id):
    url = f"https://id.nlm.nih.gov/mesh/lookup/label?resource={id}"
    headers = {"Accept": "application/json"}

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json()[0]
    else:
        return None

# Test
print(fetch_mesh_descriptor("D001845"))

Bone Cysts


In [10]:
# Change it so that it can use the following Species_7461
import requests
import xml.etree.ElementTree as ET

def fetch_ncbi_species(id):
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={id}"
    response = requests.get(url)
    if response.status_code == 200:
        root = ET.fromstring(response.content)
        scientific_name = root.find('.//Item[@Name="ScientificName"]').text
        return scientific_name
    else:
        return None

# Test
print(fetch_ncbi_species("7461"))

Apis cerana


In [78]:
# Change it so that it can use the following CellLine_CVCL_K990
def fetch_cellosaurus(id):
    url = f"https://api.cellosaurus.org/cell-line/{id}"
    response = requests.get(url)
    # print(response.text)
    if response.status_code == 200:
        value_list = []
        response_json = response.json()
        if 'Cellosaurus' in response_json:
            if 'cell-line-list' in response_json['Cellosaurus']:
                for name in response_json['Cellosaurus']['cell-line-list']:
                    if 'name-list' in name:
                        for value in name['name-list']:
                            if 'value' in value:
                                value_list.append(value['value'])
        return value_list
    else:
        return None

# Test
print(fetch_cellosaurus("CVCL_K990"))

['UKTS9050']


In [6]:
import requests
# Change it so that it can use the following ProteinMutation_p_R450H_RS_189261858
def fetch_dbsnp(rs_id):
    url = f"https://api.ncbi.nlm.nih.gov/variation/v0/beta/refsnp/{rs_id}"
    response = requests.get(url)
    print(response.status_code)
    # print(response.text)
    if response.status_code == 200:
        return response.json()
    else:
        return None

# Test
print(fetch_dbsnp("11944405"))

200
{'refsnp_id': '11944405', 'create_date': '2004-02-28T21:15Z', 'last_update_date': '2022-10-13T06:32Z', 'last_update_build_id': '156', 'dbsnp1_merges': [], 'citations': [], 'lost_obs_movements': [], 'present_obs_movements': [{'component_ids': [{'type': 'subsnp', 'value': '108173738'}, {'type': 'subsnp', 'value': '110297294'}, {'type': 'subsnp', 'value': '117115213'}, {'type': 'subsnp', 'value': '160019633'}, {'type': 'subsnp', 'value': '163924626'}, {'type': 'subsnp', 'value': '166948126'}, {'type': 'subsnp', 'value': '198926715'}, {'type': 'subsnp', 'value': '277846056'}, {'type': 'subsnp', 'value': '479634392'}, {'type': 'subsnp', 'value': '1589959560'}, {'type': 'subsnp', 'value': '1712693479'}, {'type': 'subsnp', 'value': '3643452189'}], 'observation': {'seq_id': 'NC_000004.10', 'position': 101018983, 'deleted_sequence': 'T', 'inserted_sequence': 'T'}, 'allele_in_cur_release': {'seq_id': 'NC_000004.12', 'position': 99878803, 'deleted_sequence': 'T', 'inserted_sequence': 'T'}, 'o

In [10]:
# Turn json object from fetch_dbsnp("189261858") into a python dictionary
def parse_dbsnp(dbsnp_json):
    if dbsnp_json:
        gene_set = set()
        if 'primary_snapshot_data' in dbsnp_json:
            if 'allele_annotations' in dbsnp_json['primary_snapshot_data']:
                for alle_annotation in dbsnp_json["primary_snapshot_data"]["allele_annotations"]:
                    if 'assembly_annotation' in alle_annotation:
                        for assembly_annotation in alle_annotation["assembly_annotation"]:
                            if 'genes' in assembly_annotation:
                                for gene in assembly_annotation['genes']:
                                    if 'name' in gene:
                                        gene_set.add(gene['name'])
        snps_list = []
        if 'primary_snapshot_data' in dbsnp_json:
            if 'placements_with_allele' in dbsnp_json['primary_snapshot_data']:
                for placements_with_allele in dbsnp_json["primary_snapshot_data"]["placements_with_allele"]:
                    if 'alleles' in placements_with_allele:
                        snps_list.append(placements_with_allele['alleles'])
        return {
            "chromosome": dbsnp_json["refsnp_id"],
            "snps": snps_list,
            "gene": list(gene_set)
        }
    else:
        return None

# Json object from fetch_dbsnp("189261858")
fetch_dbsnp("11944405")

{'chromosome': '11944405',
 'snps': [[{'allele': {'spdi': {'seq_id': 'NC_000004.12',
      'position': 99878803,
      'deleted_sequence': 'T',
      'inserted_sequence': 'T'}},
    'hgvs': 'NC_000004.12:g.99878804='},
   {'allele': {'spdi': {'seq_id': 'NC_000004.12',
      'position': 99878803,
      'deleted_sequence': 'T',
      'inserted_sequence': 'C'}},
    'hgvs': 'NC_000004.12:g.99878804T>C'}],
  [{'allele': {'spdi': {'seq_id': 'NC_000004.11',
      'position': 100799960,
      'deleted_sequence': 'T',
      'inserted_sequence': 'T'}},
    'hgvs': 'NC_000004.11:g.100799961='},
   {'allele': {'spdi': {'seq_id': 'NC_000004.11',
      'position': 100799960,
      'deleted_sequence': 'T',
      'inserted_sequence': 'C'}},
    'hgvs': 'NC_000004.11:g.100799961T>C'}],
  [{'allele': {'spdi': {'seq_id': 'NM_021970.4',
      'position': 3694,
      'deleted_sequence': 'A',
      'inserted_sequence': 'A'}},
    'hgvs': 'NM_021970.4:c.*3190='},
   {'allele': {'spdi': {'seq_id': 'NM_021970

In [17]:
# Change to use DNAMutation_c_1324delC_CorrespondingGene_6535
# Reuse fetch_entrez_gene()
print(fetch_entrez_gene("6535"))

200
{"header":{"type":"esummary","version":"0.3"},"result":{"uids":["6535"],"6535":{"uid":"6535","name":"SLC6A8","description":"solute carrier family 6 member 8","status":"","currentid":"","chromosome":"X","geneticsource":"genomic","maplocation":"Xq28","otheraliases":"CCDS1, CRT, CRT-1, CRT1, CRTR, CT1, CTR5","otherdesignations":"sodium- and chloride-dependent creatine transporter 1|creatine transporter 1|solute carrier family 6 (neurotransmitter transporter), member 8|solute carrier family 6 (neurotransmitter transporter, creatine), member 8","nomenclaturesymbol":"SLC6A8","nomenclaturename":"solute carrier family 6 member 8","nomenclaturestatus":"Official","mim":["300036"],"genomicinfo":[{"chrloc":"X","chraccver":"NC_000023.11","chrstart":153687925,"chrstop":153696592,"exoncount":14}],"geneweight":8658,"summary":"The protein encoded by this gene is a plasma membrane protein whose function is to transport creatine into and out of cells. Defects in this gene can result in X-linked creat

In [20]:
# Change to use Chemical_MESH_C511970
print(fetch_mesh_descriptor("C511970"))

M58373


According to the PubTator Central documentation, a domain motif entity is defined as “a conserved part of a given protein sequence and structure that can evolve, function, and exist independently of the rest of the protein chain"

In [34]:
def fetch_ncbi_taxonomy(taxonomy_id):
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={taxonomy_id}"
    response = requests.get(url)
    # print(response.status_code)
    # print(response.text)
    if response.status_code == 200:
        root = ET.fromstring(response.content)
        return root.find("DocSum/Item[@Name='ScientificName']").text
    else:
        return None

# Test
print(fetch_ncbi_taxonomy("9606"))

Homo sapiens


## Domain Motif
The following code is my attempt to find an API for DomainMotif_Focus_9606_7003_7004. I can't find it, so I'm skiping it for now.

In [35]:
def fetch_interpro(ipr_id):
    url = f"https://www.ebi.ac.uk/interpro/api/entry/InterPro/{ipr_id}"
    response = requests.get(url)
    print(response.status_code)
    print(response.text)
    if response.status_code == 200:
        return response.json()['metadata']['name']
    else:
        return None

# Test
print(fetch_interpro("IPR001849"))  # Example InterPro ID
print(fetch_interpro("IPR000719"))  # Example InterPro ID

200
{"metadata":{"accession":"IPR001849","entry_id":null,"type":"domain","go_terms":null,"source_database":"interpro","member_databases":{"pfam":{"PF16457":"Pleckstrin homology domain","PF16652":"Pleckstrin homology domain","PF00169":"PH domain"},"profile":{"PS50003":"PH domain profile"},"smart":{"SM00233":"Pleckstrin homology domain."}},"integrated":null,"hierarchy":{"accession":"IPR001849","name":"Pleckstrin homology domain","type":"Domain","children":[{"accession":"IPR001605","name":"Pleckstrin homology domain, spectrin-type","type":"Domain","children":[]},{"accession":"IPR024774","name":"Pleckstrin homology domain, Mcp5-type","type":"Domain","children":[]},{"accession":"IPR033511","name":"Rho guanine nucleotide exchange factor Cdc24/Scd1, PH domain","type":"Domain","children":[]},{"accession":"IPR035534","name":"DBS, PH domain","type":"Domain","children":[]},{"accession":"IPR035939","name":"FGD1, N-terminal PH domain","type":"Domain","children":[]},{"accession":"IPR035941","name":"

In [37]:
def process_domain_motif_token(token):
    parts = token.split("_")
    species = fetch_ncbi_taxonomy(parts[2])
    interpro_ids = parts[3:]
    domains = [fetch_interpro(id) for id in interpro_ids]
    return {
        "species": species,
        "domains": domains
    }

# Test
print(process_domain_motif_token('DomainMotif_Focus_9606_7003_7004'))

404
{"Error":"the level '7003' is not a valid interpro level"}
404
{"Error":"the level '7004' is not a valid interpro level"}
{'species': 'Homo sapiens', 'domains': [None, None]}


# Cleaned up code for going through all concepts and making the mapping

In [None]:
# Change it so that it can use the following Gene_2799940
def fetch_entrez_gene(id):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        "db": "gene",
        "id": id,
        "retmode": "json"
    }

    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()["result"][id]["description"]
    else:
        return None

# Test
print(fetch_entrez_gene("2799940"))
# matrix protein M2-1;matrix protein M2-2

# Change it so that it can use the following Disease_MESH_D001845
def fetch_mesh_descriptor(id):
    url = f"https://id.nlm.nih.gov/mesh/lookup/label?resource={id}"
    headers = {"Accept": "application/json"}

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json()[0]
    else:
        return None

# Test
print(fetch_mesh_descriptor("D001845"))
# Bone Cysts

# Change it so that it can use the following Species_7461
import requests
import xml.etree.ElementTree as ET

def fetch_ncbi_species(id):
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={id}"
    response = requests.get(url)
    if response.status_code == 200:
        root = ET.fromstring(response.content)
        scientific_name = root.find('.//Item[@Name="ScientificName"]').text
        return scientific_name
    else:
        return None

# Test
print(fetch_ncbi_species("7461"))
# Apis cerana

# Change it so that it can use the following CellLine_CVCL_K990
def fetch_cellosaurus(id):
    url = f"https://api.cellosaurus.org/cell-line/{id}"
    response = requests.get(url)
    # print(response.text)
    if response.status_code == 200:
        value_list = []
        response_json = response.json()
        if 'Cellosaurus' in response_json:
            if 'cell-line-list' in response_json['Cellosaurus']:
                for name in response_json['Cellosaurus']['cell-line-list']:
                    if 'name-list' in name:
                        for value in name['name-list']:
                            if 'value' in value:
                                value_list.append(value['value'])
        return value_list
    else:
        return None

# Test
print(fetch_cellosaurus("CVCL_K990"))
# ['UKTS9050']

# Change it so that it can use the following ProteinMutation_p_R450H_RS_189261858
def fetch_dbsnp(rs_id):
    url = f"https://api.ncbi.nlm.nih.gov/variation/v0/beta/refsnp/{rs_id}"
    response = requests.get(url)
    # print(response.status_code)
    # print(response.text)
    if response.status_code == 200:
        return parse_dbsnp(response.json())
    else:
        return None

# Turn json object from fetch_dbsnp("189261858") into a python dictionary
def parse_dbsnp(dbsnp_json):
    if dbsnp_json:
        gene_set = set()
        if 'primary_snapshot_data' in dbsnp_json:
            if 'allele_annotations' in dbsnp_json['primary_snapshot_data']:
                for alle_annotation in dbsnp_json["primary_snapshot_data"]["allele_annotations"]:
                    if 'assembly_annotation' in alle_annotation:
                        for assembly_annotation in alle_annotation["assembly_annotation"]:
                            if 'genes' in assembly_annotation:
                                for gene in assembly_annotation['genes']:
                                    if 'name' in gene:
                                        gene_set.add(gene['name'])
        snps_list = []
        if 'primary_snapshot_data' in dbsnp_json:
            if 'placements_with_allele' in dbsnp_json['primary_snapshot_data']:
                for placements_with_allele in dbsnp_json["primary_snapshot_data"]["placements_with_allele"]:
                    if 'alleles' in placements_with_allele:
                        snps_list.append(placements_with_allele['alleles'])
        return {
            "chromosome": dbsnp_json["refsnp_id"],
            "snps": snps_list,
            "gene": list(gene_set)
        }
    else:
        return None

# Test
print(fetch_dbsnp("189261858"))
# {'chromosome': ..., 'snps': ..., 'gene': ...}

In [12]:
import requests
import xml.etree.ElementTree as ET

# Fetch the gene description from NCBI Entrez
# Gene_2799940
def fetch_entrez_gene(id):
    try:
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
        params = {
            "db": "gene",
            "id": id,
            "retmode": "json"
        }
        response = requests.get(url, params=params)
        return response.json()["result"][id]["description"] if response.status_code == 200 else None
    except Exception as e:
        print(f"An error occurred in fetch_entrez_gene: {e}")
        return None

# Fetch the disease name from MESH
# Disease_MESH_D001845
def fetch_mesh_descriptor(id):
    try:
        url = f"https://id.nlm.nih.gov/mesh/lookup/label?resource={id}"
        headers = {"Accept": "application/json"}
        response = requests.get(url, headers=headers)
        if response.status_code == 200 and response.json():
            return response.json()[0]
        else:
            return None
    except Exception as e:
        print(f"An error occurred in fetch_mesh_descriptor: {e}")
        return None

# Fetch the species scientific name from NCBI taxonomy
# Species_7461
def fetch_ncbi_species(id):
    try:
        url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={id}"
        response = requests.get(url)
        if response.status_code == 200:
            root = ET.fromstring(response.content)
            return root.find('.//Item[@Name="ScientificName"]').text
        else:
            return None
    except Exception as e:
        print(f"An error occurred in fetch_ncbi_species: {e}")
        return None

# Fetch the cell line from Cellosaurus
# CellLine_CVCL_K990
def fetch_cellosaurus(id):
    try:
        url = f"https://api.cellosaurus.org/cell-line/{id}"
        response = requests.get(url)
        if response.status_code == 200:
            return [value['value'] for name in response.json().get('Cellosaurus', {}).get('cell-line-list', []) for value in name.get('name-list', [])]
        else:
            return None
    except Exception as e:
        print(f"An error occurred in fetch_cellosaurus: {e}")
        return None

# Fetch the SNP from dbSNP
# ProteinMutation_p_R450H_RS_189261858
def fetch_dbsnp(rs_id):
    try:
        url = f"https://api.ncbi.nlm.nih.gov/variation/v0/beta/refsnp/{rs_id}"
        response = requests.get(url)
        if response.status_code == 200:
            return parse_dbsnp(response.json())
        else:
            return None
    except Exception as e:
        print(f"An error occurred in fetch_dbsnp: {e}")
        return None

# Parse the SNP JSON response
def parse_dbsnp(dbsnp_json):
    try:
        gene_set = set()
        for alle_annotation in dbsnp_json.get("primary_snapshot_data", {}).get("allele_annotations", []):
            for assembly_annotation in alle_annotation.get("assembly_annotation", []):
                for gene in assembly_annotation.get('genes', []):
                    gene_set.add(gene.get('name', ''))
        
        snps_list = []
        for placements_with_allele in dbsnp_json.get("primary_snapshot_data", {}).get("placements_with_allele", []):
            snps_list.append(placements_with_allele.get('alleles', []))
        
        return {
            "chromosome": dbsnp_json.get("refsnp_id", ''),
            "snps": snps_list,
            "gene": list(gene_set)
        }
    except Exception as e:
        print(f"An error occurred in parse_dbsnp: {e}")
        return None

# Fetch the concept description
def fetch_concept_description(concept_id):
    try:
        concept_type, identifier = concept_id.split('_', 1)
        if concept_type == "Disease":
            if identifier.startswith("MESH"):
                # MESH_D001845 
                # Make the identifier just D001845
                identifier = identifier[identifier.find('_')+1:]
                return fetch_mesh_descriptor(identifier)
            else:
                # Take everything after Disease and return it as the description without _
                # Remove the _ from the identifier
                return identifier[identifier.find('_')+1:].replace('_', ' ')
        elif concept_type == "Gene":
            return fetch_entrez_gene(identifier)
        elif concept_type == "Species":
            return fetch_ncbi_species(identifier)
        elif concept_type == "CellLine":
            return fetch_cellosaurus(identifier)
        elif concept_type == "ProteinMutation":
            # Take the last series of numbers as the rs_id. Ex ProteinMutation_p_R450H_RS_189261858
            # becomes 189261858
            identifier = identifier[identifier.rfind('_')+1:]
            return fetch_dbsnp(identifier)
        elif concept_type == "SNP":
            import time
            # Wait a second before making the request. This is to avoid hitting the API too hard
            time.sleep(1)
            # SNP_rs11944405 into 11944405. Remove the rs and return the description
            identifier = identifier[2:]
            return fetch_dbsnp(identifier)
        elif concept_type == "Chemical":
            # Chemical_MESH_C511970 into C511970
            identifier = identifier[identifier.find('_')+1:]
            return fetch_mesh_descriptor(identifier)
        elif concept_type == "DNAMutation":
            # DNAMutation_c_1324delC_CorrespondingGene_6535. I'm not sure what to do with this one. Just give it back
            # as the description with the underscores removed with the concept type included
            return concept_type + ' ' + identifier.replace('_', ' ')
        elif concept_type == "DomainMotif":
            # DomainMotif_Focus_9606_7003_7004. I'm not sure what to do with this one. Just give it back
            # as the description with the underscores removed with the concept type included
            return concept_type + ' ' + identifier.replace('_', ' ')
        else:
            return None
    except Exception as e:
        print(f"An error occurred in fetch_concept_description: {e}")
        return None

# Examples
examples = [
    'Disease_MESH_D001845',
    'Disease_Methanogenic_archaeon_CH1270',
    'Gene_2799940',
    'ProteinMutation_p_R450H_RS_189261858',
    'SNP_rs11944405',
    'Chemical_MESH_C511970',
    'DNAMutation_c_1324delC_CorrespondingGene_6535',
    'Species_7461',
    'CellLine_CVCL_K990',
    'DomainMotif_Focus_9606_7003_7004'
]

# Fetch the descriptions
concept_descriptions = {concept: fetch_concept_description(concept) for concept in examples}

print(concept_descriptions)

{'Disease_MESH_D001845': 'Bone Cysts', 'Disease_Methanogenic_archaeon_CH1270': 'archaeon CH1270', 'Gene_2799940': 'matrix protein M2-1;matrix protein M2-2', 'ProteinMutation_p_R450H_RS_189261858': {'chromosome': '189261858', 'snps': [[{'allele': {'spdi': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'G'}}, 'hgvs': 'NC_000014.9:g.81143407='}, {'allele': {'spdi': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'A'}}, 'hgvs': 'NC_000014.9:g.81143407G>A'}, {'allele': {'spdi': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'C'}}, 'hgvs': 'NC_000014.9:g.81143407G>C'}, {'allele': {'spdi': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'T'}}, 'hgvs': 'NC_000014.9:g.81143407G>T'}], [{'allele': {'spdi': {'seq_id': 'NC_000014.8', 'position': 81609750, 'deleted_sequence': 'G', 'inserted_sequence': 'G'}}, 'hgvs': 

In [18]:
dna_mutation_concepts = [concept for concept in concept_vectors.keys() if concept.startswith('DNAMutation') and not concept.startswith('DNAMutation_c')]

print('Number of DNA mutation concepts:', len(dna_mutation_concepts))
print('Some DNA mutation concepts:', dna_mutation_concepts[:10])

Number of DNA mutation concepts: 2875
Some DNA mutation concepts: ['DNAMutation_g_4656G_C', 'DNAMutation_rs1013151', 'DNAMutation_g_4300A_G', 'DNAMutation_g_14106505G_A', 'DNAMutation_g_19518C_T', 'DNAMutation_g_13491C_G_RS_2787094', 'DNAMutation_g_8069C_G_RS_1052133', 'DNAMutation_p_183delK', 'DNAMutation_g_4877G_A', 'DNAMutation_g_20210G_A_CorrespondingGene_5054']


# Parallelize the mapping process

In [14]:
import requests
import xml.etree.ElementTree as ET
from threading import Semaphore, Thread
import time

# The semaphore to limit the number of simultaneous API calls
sem = Semaphore(10)

# Fetch the gene description from NCBI Entrez
# Gene_2799940
def fetch_entrez_gene(id):
    try:
        with sem:
            url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
            params = {
                "db": "gene",
                "id": id,
                "retmode": "json"
            }
            response = requests.get(url, params=params)
            return response.json()["result"][id]["description"] if response.status_code == 200 else None
    except Exception as e:
        print(f"An error occurred in fetch_entrez_gene: {e}")
        return None

# Fetch the disease name from MESH
# Disease_MESH_D001845
def fetch_mesh_descriptor(id):
    try:
        with sem:
            url = f"https://id.nlm.nih.gov/mesh/lookup/label?resource={id}"
            headers = {"Accept": "application/json"}
            response = requests.get(url, headers=headers)
            if response.status_code == 200 and response.json():
                return response.json()[0]
            else:
                return None
    except Exception as e:
        print(f"An error occurred in fetch_mesh_descriptor: {e}")
        return None

# Fetch the species scientific name from NCBI taxonomy
# Species_7461
def fetch_ncbi_species(id):
    try:
        with sem:
            url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id={id}"
            response = requests.get(url)
            if response.status_code == 200:
                root = ET.fromstring(response.content)
                return root.find('.//Item[@Name="ScientificName"]').text
            else:
                return None
    except Exception as e:
        print(f"An error occurred in fetch_ncbi_species: {e}")
        return None

# Fetch the cell line from Cellosaurus
# CellLine_CVCL_K990
def fetch_cellosaurus(id):
    try:
        with sem:
            url = f"https://api.cellosaurus.org/cell-line/{id}"
            response = requests.get(url)
            if response.status_code == 200:
                return [value['value'] for name in response.json().get('Cellosaurus', {}).get('cell-line-list', []) for value in name.get('name-list', [])]
            else:
                return None
    except Exception as e:
        print(f"An error occurred in fetch_cellosaurus: {e}")
        return None

# Fetch the SNP from dbSNP
# ProteinMutation_p_R450H_RS_189261858
def fetch_dbsnp(rs_id):
    try:
        with sem:
            url = f"https://api.ncbi.nlm.nih.gov/variation/v0/beta/refsnp/{rs_id}"
            response = requests.get(url)
            if response.status_code == 200:
                return parse_dbsnp(response.json())
            else:
                return None
    except Exception as e:
        print(f"An error occurred in fetch_dbsnp: {e}")
        return None

# Fetch the concept description
def fetch_concept_description(concept_id):
    try:
        concept_type, identifier = concept_id.split('_', 1)
        if concept_type == "Disease":
            if identifier.startswith("MESH"):
                identifier = identifier[identifier.find('_')+1:]
                return fetch_mesh_descriptor(identifier)
            else:
                return identifier[identifier.find('_')+1:].replace('_', ' ')
        elif concept_type == "Gene":
            return fetch_entrez_gene(identifier)
        elif concept_type == "Species":
            return fetch_ncbi_species(identifier)
        elif concept_type == "CellLine":
            return fetch_cellosaurus(identifier)
        elif concept_type == "ProteinMutation":
            identifier = identifier[identifier.rfind('_')+1:]
            return fetch_dbsnp(identifier)
        elif concept_type == "SNP":
            time.sleep(0.1)
            identifier = identifier[2:]
            return fetch_dbsnp(identifier)
        elif concept_type == "Chemical":
            identifier = identifier[identifier.find('_')+1:]
            return fetch_mesh_descriptor(identifier)
        elif concept_type == "DNAMutation":
            return concept_type + ' ' + identifier.replace('_', ' ')
        elif concept_type == "DomainMotif":
            return concept_type + ' ' + identifier.replace('_', ' ')
        else:
            return None
    except Exception as e:
        print(f"An error occurred in fetch_concept_description: {e}")
        return None

# Examples
examples = [
    'Disease_MESH_D001845',
    'Disease_Methanogenic_archaeon_CH1270',
    'Gene_2799940',
    'ProteinMutation_p_R450H_RS_189261858',
    'SNP_rs11944405',
    'Chemical_MESH_C511970',
    'DNAMutation_c_1324delC_CorrespondingGene_6535',
    'Species_7461',
    'CellLine_CVCL_K990',
    'DomainMotif_Focus_9606_7003_7004'
]

# Fetch the descriptions
concept_descriptions = {concept: None for concept in examples}

threads = []
for concept in examples:
    t = Thread(target=lambda: concept_descriptions.update({concept: fetch_concept_description(concept)}))
    t.start()
    threads.append(t)

for t in threads:
    t.join()

print(concept_descriptions)

{'Disease_MESH_D001845': 'Bone Cysts', 'Disease_Methanogenic_archaeon_CH1270': 'archaeon CH1270', 'Gene_2799940': 'matrix protein M2-1;matrix protein M2-2', 'ProteinMutation_p_R450H_RS_189261858': {'chromosome': '189261858', 'snps': [[{'allele': {'spdi': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'G'}}, 'hgvs': 'NC_000014.9:g.81143407='}, {'allele': {'spdi': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'A'}}, 'hgvs': 'NC_000014.9:g.81143407G>A'}, {'allele': {'spdi': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'C'}}, 'hgvs': 'NC_000014.9:g.81143407G>C'}, {'allele': {'spdi': {'seq_id': 'NC_000014.9', 'position': 81143406, 'deleted_sequence': 'G', 'inserted_sequence': 'T'}}, 'hgvs': 'NC_000014.9:g.81143407G>T'}], [{'allele': {'spdi': {'seq_id': 'NC_000014.8', 'position': 81609750, 'deleted_sequence': 'G', 'inserted_sequence': 'G'}}, 'hgvs': 

In [4]:
import pickle
# Load from pickle file
with open('/Users/danielgeorge/Documents/work/ml/bioconceptvec-explorer/bioconceptvec-explorer/datasets/concept_descriptions.pkl', 'rb') as f:
    concept_descriptions = pickle.load(f)

In [5]:
concept_descriptions.keys()#['Disease_MESH_D001845']

dict_keys(['Disease_MESH_D001845', 'Gene_2799940', 'Gene_3726_54751', 'Disease_MESH_D000652', 'ProteinMutation_p_R450H_RS_189261858', 'ProteinMutation_p_I123V', 'SNP_rs11944405', 'Chemical_MESH_C511970', 'ProteinMutation_p_Q106E', 'Gene_56941_116255', 'SNP_rs12536544', 'Chemical_MESH_C558709', 'Gene_100513670', 'Gene_30244', 'Chemical_MESH_C543977', 'Chemical_MESH_C023449', 'Chemical_MESH_C087010', 'DNAMutation_c_1324delC_CorrespondingGene_6535', 'Chemical_MESH_C504005', 'Gene_3681', 'Chemical_MESH_C011727', 'Chemical_MESH_C022616', 'DNAMutation_c_IVS_1_1G_T', 'Species_7461', 'Species_1552824', 'Chemical_MESH_C006624', 'Chemical_MESH_C514851', 'Gene_34226', 'Gene_830120', 'Gene_432478', 'ProteinMutation_p_R92L', 'Gene_1487604', 'Chemical_MESH_C513512', 'Gene_4595', 'Gene_819725', 'Gene_100052326', 'DNAMutation_c_314T_A', 'Gene_101678238', 'Gene_836092', 'Gene_64157', 'Gene_100304457', 'Gene_100357727', 'Chemical_MESH_C036899', 'Chemical_MESH_C054482', 'Gene_105334726', 'Gene_172661', '

In [6]:
concept_descriptions

{'Disease_MESH_D001845': 'Bone Cysts',
 'Gene_2799940': 'matrix protein M2-1;matrix protein M2-2',
 'Gene_3726_54751': 'JunB proto-oncogene, AP-1 transcription factor subunit',
 'Disease_MESH_D000652': 'Amniotic Band Syndrome',
 'ProteinMutation_p_R450H_RS_189261858': ['thyroid stimulating hormone receptor',
  'uncharacterized LOC101928462'],
 'ProteinMutation_p_I123V': 'ProteinMutation I123V',
 'SNP_rs11944405': ['late endosomal/lysosomal adaptor, MAPK and MTOR activator 3'],
 'Chemical_MESH_C511970': 'M58373',
 'ProteinMutation_p_Q106E': 'ProteinMutation Q106E',
 'Gene_56941_116255': '5-hydroxymethylcytosine binding, ES cell specific',
 'SNP_rs12536544': ['Rac family small GTPase 1'],
 'Chemical_MESH_C558709': '16-(4-(3-(imidazol-1-yl)propoxy)-3-methoxybenzylidene)-5-androstene-3b,17b-diol',
 'Gene_100513670': 'non-SMC condensin I complex subunit G',
 'Gene_30244': None,
 'Chemical_MESH_C543977': 'wogonin-5-0-beta-D-glucuronide methyl ester',
 'Chemical_MESH_C023449': 'tizolemide',
 