# Extract all concept IDs for Pubtator Central

In [1]:
import gzip
import os
from pathlib import Path

import pandas as pd
import plydata as ply
import tqdm

In [2]:
Path("output/intermediate_files").mkdir(exist_ok=True, parents=True)

# Extract MESH IDs

In [3]:
mesh_chemical_url = (
    "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/asciimesh/c2022.bin"
)
mesh_disease_url = (
    "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/asciimesh/d2022.bin"
)

In [4]:
chemical_file = f"output/intermediate_files/{Path(mesh_chemical_url).name}"
if not Path(chemical_file).exists():
    os.system(f"wget {mesh_chemical_url} -O {chemical_file}")

disease_file = f"output/intermediate_files/{Path(mesh_disease_url).name}"
if not Path(disease_file).exists():
    os.system(f"wget {mesh_disease_url} -O {disease_file}")

In [5]:
mesh_map_file = Path("output/mesh_headings_id_mapper.tsv")
if not mesh_map_file.exists():
    with open(disease_file, "r") as disease_infile:
        with open(chemical_file, "r") as chem_infile:
            data_rows = []
            for line in tqdm.tqdm(disease_infile):
                line = line.strip()
                if "MH = " in line:
                    concept = line.replace("MH = ", "").lower()

                if "UI = " in line:
                    concept_id = line.replace("UI = ", "").lower()
                    data_rows.append({"mesh_id": concept_id, "mesh_heading": concept})

            for line in tqdm.tqdm(chem_infile):
                line = line.strip()
                if "NM = " in line:
                    concept = line.replace("NM = ", "").lower()

                if "UI = " in line:
                    concept_id = line.replace("UI = ", "").lower()
                    data_rows.append({"mesh_id": concept_id, "mesh_heading": concept})

    concept_df = pd.DataFrame.from_records(data_rows)
    concept_df >> ply.call("to_csv", str(mesh_map_file), sep="\t", index=False)
else:
    concept_df = pd.read_csv(str(mesh_map_file), sep="\t")
print(concept_df.shape)
concept_df >> ply.slice_rows(10)

841110it [00:00, 1940560.48it/s]
4974852it [00:02, 2283078.60it/s]


(348746, 2)


Unnamed: 0,mesh_id,mesh_heading
0,d000001,calcimycin
1,d000002,temefos
2,d000003,abattoirs
3,d000004,abbreviations as topic
4,d000005,abdomen
5,d000006,"abdomen, acute"
6,d000007,abdominal injuries
7,d000008,abdominal neoplasms
8,d000009,abdominal muscles
9,d000010,abducens nerve


# Extract Species IDs

In [6]:
species_id_url = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"

In [7]:
species_file = f"output/intermediate_files/{Path(species_id_url).name}"
if not Path(species_file).exists():
    os.system(f"wget {species_id_url} -O {species_file}")
    Path("output/intermediate_files/species_temp").mkdir(exist_ok=True, parents=True)
    os.system(
        f"gunzip -c {species_file} | tar xf - -C output/intermediate_files/species_temp"
    )

In [8]:
species_map_file = Path("output/species_id_map.tsv")
if not species_map_file.exists():
    with open(
        "output/intermediate_files/species_temp/names.dmp", "r"
    ) as species_infile:
        data_rows = []
        for idx, line in enumerate(species_infile):
            fields = line.strip().replace("\t", "").split("|")

            if fields[3] == "scientific name":
                data_rows.append(
                    {"species_id": int(fields[0]), "species_name": fields[1].lower()}
                )

        species_df = pd.DataFrame.from_records(data_rows)
        species_df >> ply.call("to_csv", str(species_map_file), sep="\t", index=False)
else:
    species_df = pd.read_csv(str(species_map_file), sep="\t")

print(species_df.shape)
species_df >> ply.slice_rows(10)

(2402566, 2)


Unnamed: 0,species_id,species_name
0,1,root
1,2,bacteria
2,6,azorhizobium
3,7,azorhizobium caulinodans
4,9,buchnera aphidicola
5,10,cellvibrio
6,11,cellulomonas gilvus
7,13,dictyoglomus
8,14,dictyoglomus thermophilum
9,16,methylophilus


# Extract Gene IDS

In [9]:
gene_id_url = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/All_Data.gene_info.gz"

In [10]:
gene_file = Path(f"output/intermediate_files/{Path(gene_id_url).name}")
if not gene_file.exists():
    os.system(f"wget {gene_id_url} -O {gene_file}")

In [11]:
gene_map_file = Path("output/gene_id_map.tsv")
if not gene_map_file.exists():
    with gzip.open(str(gene_file), "rt") as infile:
        data_rows = []
        for idx, line in tqdm.tqdm(enumerate(infile)):
            # Skip the first line which is the header
            if idx == 0:
                continue

            gene_row = line.split("\t")
            data_rows.append(
                {
                    "tax_id": gene_row[0],
                    "gene_id": gene_row[1],
                    "gene_symbol": gene_row[2],
                }
            )

        gene_df = pd.DataFrame.from_records(data_rows)
        gene_df >> ply.call("to_csv", str(gene_map_file), sep="\t", index=False)
else:
    gene_df = pd.read_csv(str(gene_map_file), sep="\t")
print(gene_df.shape)
gene_df >> ply.slice_rows(10)

(36559909, 3)


Unnamed: 0,tax_id,gene_id,gene_symbol
0,7,5692769,NEWENTRY
1,9,2827857,NEWENTRY
2,11,10823747,NEWENTRY
3,14,6951813,NEWENTRY
4,19,3758873,NEWENTRY
5,24,5129993,NEWENTRY
6,24,67441593,dnaA
7,24,67441594,dnaN
8,24,67441595,recF
9,24,67441596,gyrB


# Extract Cell Line IDs

In [12]:
celline_id_url = "https://ftp.expasy.org/databases/cellosaurus/cellosaurus.obo"

In [13]:
celline_file = f"output/intermediate_files/{Path(celline_id_url).name}"
if not Path(celline_file).exists():
    os.system(f"wget {celline_id_url} -O {celline_file}")

In [14]:
celline_map_file = Path("output/celline_id_map.tsv")
if not celline_map_file.exists():
    with open(celline_file, "r") as celline_infile:
        data_rows = []
        for idx, line in tqdm.tqdm(enumerate(celline_infile)):
            line = line.strip()
            if "id:" in line:
                entry = dict(celline_id=line.split(" ")[1])

            if "name:" in line:
                entry["celline_name"] = line.split(" ")[1]
                data_rows.append(entry)

        celline_df = pd.DataFrame.from_records(data_rows)
        celline_df >> ply.call("to_csv", str(celline_map_file), sep="\t", index=False)
else:
    celline_df = pd.read_csv(str(celline_map_file), sep="\t")

print(celline_df.shape)
celline_df >> ply.slice_rows(10)

(134849, 2)


Unnamed: 0,celline_id,celline_name
0,CVCL_B0T9,#132
1,CVCL_B0T8,#132
2,CVCL_E548,#15310-LN
3,CVCL_KA96,#16-15
4,CVCL_IW91,#40a
5,CVCL_B375,#490
6,CVCL_X345,#822
7,CVCL_E549,#W7079
8,CVCL_G217,(BF1)
9,CVCL_VG99,(L)PC6


# Merge all ids into one

In [16]:
total_concept_file = Path("output/all_concept_ids.tsv.xz")
if not total_concept_file.exists():
    total_concepts_df = pd.concat(
        [
            (
                concept_df
                >> ply.define(
                    concept_id="mesh_id.apply(lambda x: 'mesh_'+x.lower())",
                    concept="mesh_heading",
                )
                >> ply.select("concept_id", "concept")
            ),
            (
                species_df
                >> ply.define(
                    concept_id="species_id.apply(lambda x: 'species_'+str(x))",
                    concept="species_name",
                )
                >> ply.select("concept_id", "concept")
            ),
            (
                gene_df
                >> ply.define(
                    concept_id="gene_id.apply(lambda x: 'gene_'+str(x))",
                    concept="gene_symbol",
                )
                >> ply.select("concept_id", "concept")
            ),
            (
                celline_df
                >> ply.define(
                    concept_id="celline_id.apply(lambda x: 'cellline_'+str(x))",
                    concept="celline_name",
                )
                >> ply.select("concept_id", "concept")
            ),
        ]
    )
    total_concepts_df >> ply.call(
        "to_csv", str(total_concept_file), sep="\t", index=False, compression="xz"
    )
else:
    total_concepts_df = pd.read_csv(str(total_concept_file), sep="\t")
total_concepts_df >> ply.sample_n(20, random_state=100)

Unnamed: 0,concept_id,concept
543819,gene_57972401,DK182_RS04135
25840386,gene_114818499,LOC114818499
23705729,gene_105632890,LOC105632890
29203991,gene_8742979,HTUR_RS11695
27300370,gene_5878640,EDI_322610
1237435,species_1537359,arthrobacter sp. thg-dn3.21
11382192,gene_113742134,LOC113742134
11350479,gene_113710421,LOC113710421
26443279,gene_41795762,psaJ
35659526,gene_59344705,MIND_00540600
