#### Sources:
- SciSpacy repo: https://github.com/allenai/scispacy/blob/4ade4ec897fa48c2ecf3187caa08a949920d126d/scripts/export_umls_json.py
- umls sources: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/index.html

In [1]:
import json
from typing import List, Dict, NamedTuple, Optional, Set

class Entity(NamedTuple):

    concept_id: str
    canonical_name: str
    aliases: List[str]
    types: List[str] = []
    definition: Optional[str] = None

    def __repr__(self):

        rep = ""
        num_aliases = len(self.aliases)
        rep = rep + f"CUI: {self.concept_id}, Name: {self.canonical_name}\n"
        rep = rep + f"Definition: {self.definition}\n"
        rep = rep + f"TUI(s): {', '.join(self.types)}\n"
        if num_aliases > 10:
            rep = (
                rep
                + f"Aliases (abbreviated, total: {num_aliases}): \n\t {', '.join(self.aliases[:10])}"
            )
        else:
            rep = (
                rep + f"Aliases: (total: {num_aliases}): \n\t {', '.join(self.aliases)}"
            )
        return rep


DEFAULT_UMLS_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2020-10-09/umls_2020_aa_cat0129.jsonl"  # noqa
DEFAULT_UMLS_TYPES_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv"


class KnowledgeBase:
    """
    A class representing two commonly needed views of a Knowledge Base:
    1. A mapping from concept_id to an Entity NamedTuple with more information.
    2. A mapping from aliases to the sets of concept ids for which they are aliases.
    Parameters
    ----------
    file_path: str, required.
        The file path to the json/jsonl representation of the KB to load.
    """

    def __init__(
        self,
        file_path: str = None,
    ):
        if file_path is None:
            raise ValueError(
                "Do not use the default arguments to KnowledgeBase. "
                "Instead, use a subclass (e.g UmlsKnowledgeBase) or pass a path to a kb."
            )
        if file_path.endswith("jsonl"):
            raw = (json.loads(line) for line in open(cached_path(file_path)))
        else:
            raw = json.load(open(cached_path(file_path)))

        alias_to_cuis: Dict[str, Set[str]] = defaultdict(set)
        self.cui_to_entity: Dict[str, Entity] = {}

        for concept in raw:
            unique_aliases = set(concept["aliases"])
            unique_aliases.add(concept["canonical_name"])
            for alias in unique_aliases:
                alias_to_cuis[alias].add(concept["concept_id"])
            self.cui_to_entity[concept["concept_id"]] = Entity(**concept)

        self.alias_to_cuis: Dict[str, Set[str]] = {**alias_to_cuis}


class UmlsKnowledgeBase(KnowledgeBase):
    def __init__(
        self,
        file_path: str = DEFAULT_UMLS_PATH,
        types_file_path: str = DEFAULT_UMLS_TYPES_PATH,
    ):

        super().__init__(file_path)

        self.semantic_type_tree: UmlsSemanticTypeTree = construct_umls_tree_from_tsv(
            types_file_path
        )

In [2]:
# preferred definition sources (from S2)
DEF_SOURCES_PREFERRED = {"NCI_BRIDG", "NCI_NCI-GLOSS", "NCI", "GO", "MSH", "NCI_FDA"}


def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
    """
    Read the file descriptor MRFILES.RRF from a UMLS release and get column headers (names)
    for the given file
    MRFILES.RRF file format: a pipe-separated values
    Useful columns:
        column 0: name of one of the files in the META directory
        column 2: column names of that file
    Args:
        meta_path: path to the META directory of an UMLS release
        filename: name of the file to get its column headers
    Returns:
        a list of column names
    """
    file_descriptors = f"{meta_path}/MRFILES.RRF"  # to get column names
    with open(file_descriptors, encoding="utf-8") as fin:
        for line in fin:
            splits = line.split("|")
            found_filename = splits[0]
            column_names = (splits[2] + ",").split(
                ","
            )  # ugly hack because all files end with an empty column
            if found_filename in filename:
                return column_names
    assert False, f"Couldn't find column names for file {filename}"
    return None


def read_umls_concepts(meta_path: str, concept_details: Dict, source: str = None):
    """
    Read the concepts file MRCONSO.RRF from a UMLS release and store it in
    concept_details dictionary. Each concept is represented with
    - concept_id
    - canonical_name
    - aliases
    - types
    - definition
    This function fills the first three. If a canonical name is not found, it is left empty.
    MRFILES.RRF file format: a pipe-separated values
    Useful columns: CUI, LAT, SUPPRESS, STR, ISPREF, TS, STT
    Args:
        meta_path: path to the META directory of an UMLS release
        concept_details: a dictionary to be filled with concept informations
        source: An optional source identifier, used as a filter to extract only a
                specific source from UMLS.
    """
    concepts_filename = "MRCONSO.RRF"
    headers = read_umls_file_headers(meta_path, concepts_filename)
    with open(f"{meta_path}/{concepts_filename}", encoding="utf-8") as fin:
        for line in fin:
            splits = line.strip().split("|")
            assert len(headers) == len(splits), (headers, splits)
            concept = dict(zip(headers, splits))
            if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N":
                continue  # Keep English non-suppressed concepts only

            if source is not None:
                if concept["SAB"] != source:
                    continue

            concept_id = concept["CUI"]
            if concept_id not in concept_details:  # a new concept
                # add it to the dictionary with an empty list of aliases and types
                concept_details[concept_id] = {
                    "concept_id": concept_id,
                    "aliases": [],
                    "types": [],
                }

            concept_name = concept["STR"]
            # this condition is copied from S2. It checks if the concept name is canonical or not
            is_canonical = (
                concept["ISPREF"] == "Y"
                and concept["TS"] == "P"
                and concept["STT"] == "PF"
            )

            if not is_canonical or "canonical_name" in concept_details[concept_id]:
                # not a canonical name or a canonical name already found
                concept_details[concept_id]["aliases"].append(
                    concept_name
                )  # add it as an alias
            else:
                concept_details[concept_id][
                    "canonical_name"
                ] = concept_name  # set as canonical name


def read_umls_types(meta_path: str, concept_details: Dict):
    """
    Read the types file MRSTY.RRF from a UMLS release and store it in
    concept_details dictionary. This function adds the `types` field
    to the information of each concept
    MRSTY.RRF file format: a pipe-separated values
    Useful columns: CUI, TUI
    Args:
        meta_path: path to the META directory of an UMLS release
        concept_details: a dictionary to be filled with concept informations
    """
    types_filename = "MRSTY.RRF"
    headers = read_umls_file_headers(meta_path, types_filename)
    with open(f"{meta_path}/{types_filename}", encoding="utf-8") as fin:
        for line in fin:
            splits = line.strip().split("|")
            assert len(headers) == len(splits)
            concept_type = dict(zip(headers, splits))

            concept = concept_details.get(concept_type["CUI"])
            if (
                concept is not None
            ):  # a small number of types are for concepts that don't exist
                concept["types"].append(concept_type["TUI"])


def read_umls_definitions(meta_path: str, concept_details: Dict):
    """
    Read the types file MRDEF.RRF from a UMLS release and store it in
    concept_details dictionary. This function adds the `definition` field
    to the information of each concept
    MRDEF.RRF file format: a pipe-separated values
    Useful columns: CUI, SAB, SUPPRESS, DEF
    Args:
        meta_path: path to the META directory of an UMLS release
        concept_details: a dictionary to be filled with concept informations
    """
    definitions_filename = "MRDEF.RRF"
    headers = read_umls_file_headers(meta_path, definitions_filename)
    with open(f"{meta_path}/{definitions_filename}", encoding="utf-8") as fin:
        headers = read_umls_file_headers(meta_path, definitions_filename)
        for line in fin:
            splits = line.strip().split("|")
            assert len(headers) == len(splits)
            definition = dict(zip(headers, splits))

            if definition["SUPPRESS"] != "N":
                continue
            is_from_preferred_source = definition["SAB"] in DEF_SOURCES_PREFERRED
            concept = concept_details.get(definition["CUI"])
            if (
                concept is None
            ):  # a small number of definitions are for concepts that don't exist
                continue

            if (
                "definition" not in concept
                or is_from_preferred_source
                and concept["is_from_preferred_source"] == "N"
            ):
                concept["definition"] = definition["DEF"]
                concept["is_from_preferred_source"] = (
                    "Y" if is_from_preferred_source else "N"
                )

In [3]:
"""
Convert a umls release to a jsonl file of concepts.
"""

def main(meta_path: str, output_path: str, source: str = None):

    concept_details = {}  # dictionary of concept_id -> {
                          #                 'concept_id': str,
                          #                 'canonical_name': str
                          #                 'aliases': List[str]
                          #                 'types': List[str]
                          #                 'definition': str
                          # }

    print('Reading concepts ... ')
    read_umls_concepts(meta_path, concept_details, source)

    print('Reading types ... ')
    read_umls_types(meta_path, concept_details)

    print('Reading definitions ... ')
    read_umls_definitions(meta_path, concept_details)

    without_canonical_name_count = 0
    without_aliases_count = 0
    with_one_alias_count = 0
    with_more_than_one_alias_count = 0
    without_type_count = 0
    with_one_type_count = 0
    with_more_than_one_type_count = 0
    without_definition_count = 0
    with_definition_pref_source_count = 0
    with_definition_other_sources_count = 0
    for concept in concept_details.values():
        without_canonical_name_count += 1 if 'canonical_name' not in concept else 0
        without_aliases_count += 1 if len(concept['aliases']) == 0 else 0
        with_one_alias_count += 1 if len(concept['aliases']) == 1 else 0
        with_more_than_one_alias_count += 1 if len(concept['aliases']) > 1 else 0
        without_type_count += 1 if len(concept['types']) == 0 else 0
        with_one_type_count += 1 if len(concept['types']) == 1 else 0
        with_more_than_one_type_count += 1 if len(concept['types']) >= 1 else 0
        without_definition_count += 1 if 'definition' not in concept else 0
        with_definition_pref_source_count += 1 if concept.get('is_from_preferred_source') == 'Y' else 0
        with_definition_other_sources_count += 1 if concept.get('is_from_preferred_source') == 'N' else 0

    print(f'Number of concepts: {len(concept_details)}')
    print(f'Number of concepts without canonical name (one of the aliases will be used instead): '
          f'{without_canonical_name_count}')
    print(f'Number of concepts with no aliases: {without_aliases_count}')
    print(f'Number of concepts with 1 alias: {with_one_alias_count}')
    print(f'Number of concepts with > 1 alias: {with_more_than_one_alias_count}')
    print(f'Number of concepts with no type: {without_type_count}')
    print(f'Number of concepts with 1 type: {with_one_type_count}')
    print(f'Number of concepts with > 1 type: {with_more_than_one_type_count}')
    print(f'Number of concepts with no definition: {without_definition_count}')
    print(f'Number of concepts with definition from preferred sources: {with_definition_pref_source_count}')
    print(f'Number of concepts with definition from other sources: {with_definition_other_sources_count}')

    print('Deleting unused fields and choosing a canonical name from aliases ... ')
    for concept in concept_details.values():

        # Some concepts have many duplicate aliases. Here we remove them.
        concept["aliases"] = list(set(concept["aliases"]))

        # if a concept doesn't have a canonical name, use the first alias instead
        if 'canonical_name' not in concept:
            aliases = concept['aliases']
            concept['canonical_name'] = aliases[0]
            del aliases[0]

        # deleting `is_from_preferred_source`
        if 'is_from_preferred_source' in concept:
            del concept['is_from_preferred_source']

    print('Exporting to the a jsonl file {} ...'.format(output_path))
    with open(output_path, 'w') as fout:

        for value in concept_details.values():
            fout.write(json.dumps(value) + "\n")
    print('DONE.')

In [4]:
main('2022AB/META/', '2022AB/umls_2022_ab.jsonl', None)

Reading concepts ... 


KeyboardInterrupt: 

In [38]:
main('2022AB/META/', '2022AB/go_umls_2022_ab.jsonl', 'GO')

Reading concepts ... 
Reading types ... 
Reading definitions ... 
Number of concepts: 66563
Number of concepts without canonical name (one of the aliases will be used instead): 4769
Number of concepts with no aliases: 34127
Number of concepts with 1 alias: 13004
Number of concepts with > 1 alias: 19432
Number of concepts with no type: 0
Number of concepts with 1 type: 66555
Number of concepts with > 1 type: 66563
Number of concepts with no definition: 22375
Number of concepts with definition from preferred sources: 44177
Number of concepts with definition from other sources: 11
Deleting unused fields and choosing a canonical name from aliases ... 
Exporting to the a jsonl file 2022AB/go_umls_2022_ab.jsonl ...
DONE.


In [77]:
main('2022AB/META/', '2022AB/ncbi_umls_2022_ab.jsonl', 'NCBI')

Reading concepts ... 
Reading types ... 
Reading definitions ... 
Number of concepts: 2062313
Number of concepts without canonical name (one of the aliases will be used instead): 98126
Number of concepts with no aliases: 1831588
Number of concepts with 1 alias: 168002
Number of concepts with > 1 alias: 62723
Number of concepts with no type: 0
Number of concepts with 1 type: 2061982
Number of concepts with > 1 type: 2062313
Number of concepts with no definition: 2057407
Number of concepts with definition from preferred sources: 4695
Number of concepts with definition from other sources: 211
Deleting unused fields and choosing a canonical name from aliases ... 
Exporting to the a jsonl file 2022AB/ncbi_umls_2022_ab.jsonl ...
DONE.


In [78]:
main('2022AB/META/', '2022AB/snomed_umls_2022_ab.jsonl', 'SNOMEDCT_US')

Reading concepts ... 
Reading types ... 
Reading definitions ... 
Number of concepts: 357071
Number of concepts without canonical name (one of the aliases will be used instead): 87816
Number of concepts with no aliases: 466
Number of concepts with 1 alias: 164590
Number of concepts with > 1 alias: 192015
Number of concepts with no type: 0
Number of concepts with 1 type: 335941
Number of concepts with > 1 type: 357071
Number of concepts with no definition: 312384
Number of concepts with definition from preferred sources: 31291
Number of concepts with definition from other sources: 13396
Deleting unused fields and choosing a canonical name from aliases ... 
Exporting to the a jsonl file 2022AB/snomed_umls_2022_ab.jsonl ...
DONE.


In [5]:
main('2022AB/META/', '2022AB/hpo_umls_2022_ab.jsonl', 'HPO')

Reading concepts ... 
Reading types ... 
Reading definitions ... 
Number of concepts: 18233
Number of concepts without canonical name (one of the aliases will be used instead): 5684
Number of concepts with no aliases: 6150
Number of concepts with 1 alias: 6011
Number of concepts with > 1 alias: 6072
Number of concepts with no type: 0
Number of concepts with 1 type: 18222
Number of concepts with > 1 type: 18233
Number of concepts with no definition: 3819
Number of concepts with definition from preferred sources: 2984
Number of concepts with definition from other sources: 11430
Deleting unused fields and choosing a canonical name from aliases ... 
Exporting to the a jsonl file 2022AB/hpo_umls_2022_ab.jsonl ...
DONE.


In [47]:
main('2022AB/META/', '2022AB/mesh_umls_2022_ab.jsonl', 'MSH')

Reading concepts ... 
Reading types ... 
Reading definitions ... 
Number of concepts: 457204
Number of concepts without canonical name (one of the aliases will be used instead): 45079
Number of concepts with no aliases: 212385
Number of concepts with 1 alias: 121006
Number of concepts with > 1 alias: 123813
Number of concepts with no type: 0
Number of concepts with 1 type: 238950
Number of concepts with > 1 type: 457204
Number of concepts with no definition: 409602
Number of concepts with definition from preferred sources: 43998
Number of concepts with definition from other sources: 3604
Deleting unused fields and choosing a canonical name from aliases ... 
Exporting to the a jsonl file 2022AB/mesh_umls_2022_ab.jsonl ...
DONE.


In [67]:
main('2022AB/META/', '2022AB/rxnorm_umls_2022_ab.jsonl', 'RXNORM')

Reading concepts ... 
Reading types ... 
Reading definitions ... 
Number of concepts: 107127
Number of concepts without canonical name (one of the aliases will be used instead): 15392
Number of concepts with no aliases: 64403
Number of concepts with 1 alias: 18032
Number of concepts with > 1 alias: 24692
Number of concepts with no type: 0
Number of concepts with 1 type: 89449
Number of concepts with > 1 type: 107127
Number of concepts with no definition: 102510
Number of concepts with definition from preferred sources: 4430
Number of concepts with definition from other sources: 187
Deleting unused fields and choosing a canonical name from aliases ... 
Exporting to the a jsonl file 2022AB/rxnorm_umls_2022_ab.jsonl ...
DONE.


In [68]:
main('2022AB/META/', '2022AB/drugbank_umls_2022_ab.jsonl', 'DRUGBANK')

Reading concepts ... 
Reading types ... 
Reading definitions ... 
Number of concepts: 10227
Number of concepts without canonical name (one of the aliases will be used instead): 7351
Number of concepts with no aliases: 2407
Number of concepts with 1 alias: 3464
Number of concepts with > 1 alias: 4356
Number of concepts with no type: 0
Number of concepts with 1 type: 1021
Number of concepts with > 1 type: 10227
Number of concepts with no definition: 6046
Number of concepts with definition from preferred sources: 4034
Number of concepts with definition from other sources: 147
Deleting unused fields and choosing a canonical name from aliases ... 
Exporting to the a jsonl file 2022AB/drugbank_umls_2022_ab.jsonl ...
DONE.


In [69]:
main('2022AB/META/', '2022AB/gs_umls_2022_ab.jsonl', 'GS')

Reading concepts ... 
Reading types ... 
Reading definitions ... 
Number of concepts: 26225
Number of concepts without canonical name (one of the aliases will be used instead): 19296
Number of concepts with no aliases: 6558
Number of concepts with 1 alias: 17415
Number of concepts with > 1 alias: 2252
Number of concepts with no type: 0
Number of concepts with 1 type: 21063
Number of concepts with > 1 type: 26225
Number of concepts with no definition: 23308
Number of concepts with definition from preferred sources: 2874
Number of concepts with definition from other sources: 43
Deleting unused fields and choosing a canonical name from aliases ... 
Exporting to the a jsonl file 2022AB/gs_umls_2022_ab.jsonl ...
DONE.


In [21]:
#with open('umls_2020_aa_cat0129.jsonl', 'r') as json_file:
with open('2022AB/umls_2022_ab.jsonl', 'r') as json_file:
    json_list = list(json_file)

all_cuis = []
for json_str in json_list:
    result = json.loads(json_str)
    all_cuis.append(result['concept_id'])

In [26]:
with open("2022AB/umls_cui_list.json", "w") as outfile:
    json.dump(all_cuis, outfile)