In [1]:
# import nltk
# nltk.download('stopwords')

from nltk.corpus import stopwords

# NLTK's English stopword list (needs the 'stopwords' corpus to be available locally).
english_stopwords = stopwords.words('english')

# Single characters carry no meaning on their own: 'a'..'z' and '0'..'9'.
single_letter_stopwords = [chr(code) for code in range(ord('a'), ord('z') + 1)]
single_digit_stopwords = [str(digit) for digit in range(10)]

# One set holding all three sources, for fast membership tests.
all_stopwords = set(english_stopwords)
all_stopwords.update(single_letter_stopwords, single_digit_stopwords)

In [23]:
from tqdm.auto import tqdm

def filter_entities(entities):
    """Remove entities whose character spans overlap other spans.

    The rule, applied to the distinct ``offset`` spans of ``entities``
    (ordered by start, longer span first on equal starts):

    * a span is dropped if any span ordered after it starts before it ends
      (so a span that contains another span is dropped and the nested one
      may survive);
    * when two spans only partially overlap, the later one is dropped too.

    Args:
        entities: list of dicts, each with an ``'offset'`` entry holding a
            ``[start, end]`` pair.

    Returns:
        A new list with the entities (same dict objects, original order)
        whose span survived. Entities sharing an identical span are kept or
        dropped together.
    """

    def filter_spans(spans):
        """Return the spans from ``spans`` that survive the overlap rule."""
        # De-duplicate, then order by start; for equal starts the longer span comes first.
        ordered = sorted(set(spans), key=lambda s: (s[0], -s[1]))
        kept = []
        discarded = set()
        total = len(ordered)
        for idx, span in enumerate(ordered):
            if span in discarded:
                continue
            overlaps = False
            # Index loop instead of slicing: avoids copying the tail of the list each time.
            for nxt in range(idx + 1, total):
                other = ordered[nxt]
                if span[1] <= other[0]:
                    # Starts are non-decreasing, so no later span can overlap either.
                    break
                if span[1] < other[1]:
                    # Partial overlap: the other span is discarded as well.
                    discarded.add(other)
                overlaps = True
            if not overlaps:
                kept.append(span)
        return kept

    kept_spans = set(filter_spans([tuple(ent['offset']) for ent in entities]))
    return [ent for ent in entities if tuple(ent['offset']) in kept_spans]


class TrieNode:
    """A single trie node: outgoing character edges plus an end-of-word flag."""

    def __init__(self):
        # Set to True once an inserted word ends exactly at this node.
        self.is_end_of_word = False
        # Maps one character to the child node reached through it.
        self.children = {}

class Trie:
    """Character-level prefix tree supporting insertion and exact lookup."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie, creating any missing nodes along its path."""
        current = self.root
        for symbol in word:
            child = current.children.get(symbol)
            if child is None:
                child = TrieNode()
                current.children[symbol] = child
            current = child
        current.is_end_of_word = True

    def search(self, word):
        """Return True only if ``word`` itself was inserted (a mere prefix does not count)."""
        current = self.root
        for symbol in word:
            current = current.children.get(symbol)
            if current is None:
                return False
        return current.is_end_of_word

class EntityExtractor:
    """Dictionary-based mention finder backed by a character trie.

    ``entity_alias_map`` maps an alias string to a dict with at least the
    keys ``'cui'`` and ``'tui'``; every alias is inserted into a ``Trie`` at
    construction time.
    """

    def __init__(self, entity_alias_map):
        """Store the alias map and index all of its keys in a trie."""
        self.trie = Trie()
        self.entity_alias_map = entity_alias_map
        print("Building trie...")
        # A plain loop avoids materialising a throw-away list holding one
        # None per alias, which list(map(...)) would do.
        for mention in tqdm(entity_alias_map.keys()):
            self.trie.insert(mention)

    def extract_entities(self, text):
        """Find every alias occurring in ``text`` at a word start.

        A match may only begin at position 0 or directly after a whitespace
        character, and is rejected when the character right after it is
        alphanumeric. All matches from a given start are reported, so
        overlapping/nested mentions can appear in the result.

        Returns:
            list of dicts with keys ``cui``, ``tui``, ``mention`` and
            ``offset`` (``[start, end]``, end exclusive), ordered by start
            and then by length.
        """
        entities = []
        text_length = len(text)
        # Candidate starts: the beginning of the text and each position after whitespace.
        word_indices = [0] + [i + 1 for i, char in enumerate(text) if char.isspace()]
        for i in word_indices:
            node = self.trie.root
            for j in range(i, text_length):
                node = node.children.get(text[j])
                if node is None:
                    # No alias continues with this character from start i.
                    break
                if not node.is_end_of_word:
                    continue
                end = j + 1
                # Reject matches that stop in the middle of an alphanumeric run.
                if end < text_length and text[end].isalnum():
                    continue
                entity = text[i:end]
                try:
                    ids = self.entity_alias_map[entity]
                    cui, tui = ids['cui'], ids['tui']
                except KeyError:
                    # Alias (or one of its id fields) missing from the map: skip it.
                    continue
                entities.append({
                    "cui": cui,
                    "tui": tui,
                    "mention": entity,
                    "offset": [i, end],
                })
        return entities

    def get_context(self, entities, text, window_size):
        """Attach a surrounding-token context string to each entity.

        Up to ``window_size // 2`` whitespace-separated tokens are taken from
        each side of the mention, looked up inside a character window of
        ``9 * (window_size // 2)`` characters per side (tokens cut by the
        window edge are kept as-is).

        The entity dicts are modified in place: a ``'mention_w_context'`` key
        is added holding ``left + " " + mention + " " + right``.

        Returns:
            A new list containing the same (mutated) entity dicts.
        """
        half_window = window_size // 2
        k = 9 * half_window  # 9 is 1.25 times the average word length (original author's estimate)
        text_length = len(text)
        context_entities = []
        for entity in entities:
            i, j = entity['offset']
            left_tokens = text[max(0, i - k):i].split()
            right_tokens = text[j:min(text_length, j + k)].split()
            # The original slice was written as `[-window_size//2:]`, which
            # Python parses as `(-window_size)//2` and therefore took one extra
            # left token for odd window sizes. Use the same half-window on
            # both sides; guard 0 because `[-0:]` would select everything.
            left_context = " ".join(left_tokens[-half_window:]) if half_window else ""
            right_context = " ".join(right_tokens[:half_window])
            entity['mention_w_context'] = left_context + " " + entity['mention'] + " " + right_context
            context_entities.append(entity)
        return context_entities

In [3]:
# Sample review-style article text (title, abstract and titled sections) used
# as the input document for the entity-extraction demo cell.
pubmed_example = '''Title: Advances in Hypertension Management: A Comprehensive Review

Abstract:
Hypertension, characterized by elevated blood pressure levels, remains a significant public health concern worldwide due to its association with increased cardiovascular morbidity and mortality. In recent years, there have been notable advancements in the understanding and management of hypertension. This comprehensive review aims to highlight key developments in hypertension management, encompassing pharmacological and non-pharmacological interventions, as well as emerging therapeutic targets and personalized treatment strategies.

Introduction:
Hypertension, defined as systolic blood pressure (SBP) ≥ 140 mmHg and/or diastolic blood pressure (DBP) ≥ 90 mmHg, affects a substantial proportion of the global population. Despite extensive research and therapeutic innovations, hypertension remains inadequately controlled in many individuals, contributing to the burden of cardiovascular diseases (CVDs) such as stroke, coronary artery disease, and heart failure. However, recent advancements in hypertension management offer promising avenues for improving patient outcomes and reducing cardiovascular risk.

Pharmacological Interventions:
Pharmacotherapy remains the cornerstone of hypertension management, with a diverse array of antihypertensive agents available for clinical use. Among the most commonly prescribed medications are angiotensin-converting enzyme (ACE) inhibitors, angiotensin II receptor blockers (ARBs), calcium channel blockers (CCBs), beta-blockers, and diuretics. These drugs exert their antihypertensive effects through various mechanisms, including vasodilation, inhibition of the renin-angiotensin-aldosterone system (RAAS), and reduction of sympathetic activity.

Non-Pharmacological Approaches:
In addition to pharmacotherapy, lifestyle modifications play a crucial role in hypertension management. Dietary interventions, such as the Dietary Approaches to Stop Hypertension (DASH) diet, emphasize the consumption of fruits, vegetables, whole grains, and lean proteins, while limiting sodium intake and processed foods. Regular physical activity, smoking cessation, and moderation of alcohol consumption are also recommended as part of a comprehensive approach to blood pressure control.

Emerging Therapeutic Targets:
Recent research has identified novel therapeutic targets for hypertension treatment, including endothelin receptor antagonists, mineralocorticoid receptor antagonists, and sodium-glucose cotransporter 2 (SGLT2) inhibitors. These agents offer potential benefits in terms of blood pressure lowering, cardiovascular protection, and renal outcomes, particularly in patients with resistant hypertension or comorbidities such as diabetes mellitus and chronic kidney disease.

Personalized Treatment Strategies:
Advances in precision medicine have paved the way for personalized approaches to hypertension management, taking into account individual patient characteristics, genetic factors, and biomarkers. Genetic testing and pharmacogenomic profiling may help identify patients who are more likely to respond to specific antihypertensive agents or experience adverse drug reactions. Furthermore, risk stratification tools such as the American College of Cardiology/American Heart Association (ACC/AHA) cardiovascular risk calculator enable clinicians to tailor treatment decisions based on an individual's overall cardiovascular risk profile.

Conclusion:
In conclusion, hypertension management has witnessed significant progress in recent years, driven by ongoing research efforts and therapeutic innovations. A multifaceted approach incorporating pharmacological and non-pharmacological interventions, alongside emerging therapeutic targets and personalized treatment strategies, holds promise for optimizing blood pressure control and reducing the burden of cardiovascular disease. Continued collaboration between clinicians, researchers, and policymakers is essential to further advance the field of hypertension management and improve patient outcomes.'''

# Hand-written list of mention strings for the sample article; only the
# commented-out demo code in this cell refers to it.
entity_mentions = [
    "Hypertension",
    "blood pressure",
    "systolic blood pressure",
    "diastolic blood pressure",
    "cardiovascular morbidity",
    "cardiovascular mortality",
    "pharmacotherapy",
    "antihypertensive agents",
    "angiotensin-converting enzyme inhibitors",
    "angiotensin II receptor blockers",
    "calcium channel blockers",
    "beta-blockers",
    "diuretics",
    "renin-angiotensin-aldosterone system",
    "lifestyle modifications",
    "Dietary Approaches to Stop Hypertension (DASH) diet",
    "physical activity",
    "smoking cessation",
    "alcohol consumption",
    "endothelin receptor antagonists",
    "mineralocorticoid receptor antagonists",
    "sodium-glucose cotransporter 2 (SGLT2) inhibitors",
    "precision medicine",
    "genetic testing",
    "pharmacogenomic profiling",
    "American College of Cardiology/American Heart Association (ACC/AHA)",
]

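# NOTE: stale example kept for reference only. EntityExtractor has no
# `extract_entities_w_context` method (use `extract_entities`, which returns
# dicts rather than tuples), and alias-map values must be dicts with 'cui' and
# 'tui' keys rather than None.
# entity_alias_map = {mention: None for mention in entity_mentions}

# extractor = EntityExtractor(entity_alias_map)
# text = pubmed_example
# entities = extractor.extract_entities_w_context(text)

# for cui, entity, context, *offset in entities:
#     print("CUI:", cui)
#     print("Entity:", entity)
#     print("Context:", context)
#     print("Char Offset:", offset)
#     print('-'*50)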

In [4]:
import os
import sys
sys.path.append('../')

from umls_utils import UmlsMappings

# SECURITY: the UMLS API key was previously hard-coded in this cell. It is now
# read from the environment (None when unset); the key that was committed
# should be treated as leaked and rotated.
UMLS_API_KEY = os.environ.get('UMLS_API_KEY')
# Location of the UMLS META directory; override with the UMLS_DIR environment
# variable, otherwise the original path is used.
UMLS_DIR = os.environ.get('UMLS_DIR', '/mitchell/entity-linking/2017AA/META')

# Load the UMLS table and drop rows without an alias string.
umls = UmlsMappings(umls_dir=UMLS_DIR, umls_api_key=UMLS_API_KEY).umls.dropna(subset=['alias'])
selected_ontologies = {'HPO', 'GO', 'MDR', 'NCBI', 'OMIM', 'SNOMEDCT_US', 'NCI', 'HGNC', 'NDDF', 'RXNORM', 'ICD10', 'ICD10CM', 'MTH', 'NDFRT', 'MSH', 'FMA', 'ICD9CM', 'CPT'}
selected_types = {'T005', 'T007', 'T017', 'T022', 'T031', 'T033', 'T037', 'T038', 'T058', 'T062', 'T074', 'T082', 'T091', 'T092', 'T097', 'T098', 'T103', 'T168', 'T170', 'T201', 'T204', 'T018', 'T021', 'T023', 'T024', 'T025', 'T026', 'T028', 'T190', 'T019', 'T020', 'T034', 'T184', 'T039', 'T040', 'T041', 'T042', 'T043', 'T044', 'T045', 'T046', 'T047', 'T048', 'T191', 'T049', 'T050', 'T059', 'T060', 'T061', 'T063', 'T203', 'T029', 'T030', 'T083', 'T085', 'T086', 'T087', 'T088', 'T093', 'T094', 'T095', 'T104', 'T109', 'T114', 'T116', 'T196', 'T197', 'T120', 'T121', 'T195', 'T122', 'T123', 'T125', 'T126', 'T127', 'T129', 'T192', 'T130', 'T131', 'T089', 'T185', 'T002', 'T004', 'T008', 'T010', 'T011', 'T012', 'T013', 'T014', 'T015', 'T016'}

# Keep rows from the selected source vocabularies ...
umls = umls[umls['sab'].isin(selected_ontologies)]
# ... that carry at least one of the selected semantic type ids ...
umls = umls[umls.tui.apply(lambda x: not selected_types.isdisjoint(x))]
# ... are flagged as preferred ...
umls = umls[umls.ispref == 'Y']
# ... and whose lower-cased alias is not a stopword / single character.
umls = umls[~umls.alias.str.lower().isin(all_stopwords)]

# Collect the cui/tui values per alias and keep only aliases with exactly one row,
# so every alias in the map resolves to a single concept.
grouped = umls.groupby('alias')[['cui', 'tui']].agg(list).reset_index()
# .copy() so the column assignments below write to an independent frame
# instead of a slice of `grouped` (avoids SettingWithCopyWarning).
filtered = grouped[grouped['cui'].apply(len) == 1].copy()
filtered['cui'] = filtered['cui'].apply(lambda x: x[0])
filtered['tui'] = filtered['tui'].apply(lambda x: list(x[0]))
# alias -> {'cui': ..., 'tui': [...]}
entity_alias_map = filtered.set_index('alias').to_dict('index')

# Free the intermediate frames; `umls` itself is kept.
del grouped, filtered
# del umls

Loading cached UMLS data from /mitchell/entity-linking/2017AA/META/.cached_df.feather


In [24]:
# Build the extractor: the constructor inserts every alias key of
# entity_alias_map into a trie and shows a tqdm progress bar while doing so.
extractor = EntityExtractor(entity_alias_map)

Building trie...


  0%|          | 0/4587440 [00:00<?, ?it/s]

In [28]:
# Use the public `pickle` module rather than the private `_pickle` accelerator;
# `pickle` already picks the C implementation when it is available.
import pickle
import sys  # imported here so this cell does not rely on an earlier cell's import

# Raise the recursion limit before dumping; presumably needed because the
# nested trie nodes exceed the default limit while being pickled.
sys.setrecursionlimit(50000)

# Persist the fully built extractor (trie + alias map) next to the notebook.
with open('umls_entity_extractor.pkl', 'wb') as f:
    pickle.dump(extractor, f)

In [5]:
# Use the public `pickle` module rather than the private `_pickle` accelerator.
import pickle

# NOTE: unpickling can execute arbitrary code, so only load a file you created
# yourself. The TrieNode, Trie and EntityExtractor classes must already be
# defined in this kernel, because pickle stores classes by reference.
with open('umls_entity_extractor.pkl', 'rb') as f:
    extractor = pickle.load(f)

In [25]:
# Run the extractor on the sample article, drop overlapping mentions, then
# print each surviving entity dict followed by a 50-dash separator line.
text = pubmed_example
entities = filter_entities(extractor.extract_entities(text))
for entity in entities:
    print(entity, '-'*50, sep='\n')

{'cui': 'C1705823', 'tui': ['T170'], 'mention': 'Title', 'offset': [0, 5]}
--------------------------------------------------
{'cui': 'C0020538', 'tui': ['T047'], 'mention': 'Hypertension', 'offset': [19, 31]}
--------------------------------------------------
{'cui': 'C0282443', 'tui': ['T170'], 'mention': 'Review', 'offset': [60, 66]}
--------------------------------------------------
{'cui': 'C0600678', 'tui': ['T170'], 'mention': 'Abstract', 'offset': [68, 76]}
--------------------------------------------------
{'cui': 'C0020538', 'tui': ['T047'], 'mention': 'Hypertension', 'offset': [78, 90]}
--------------------------------------------------
{'cui': 'C0005768', 'tui': ['T031'], 'mention': 'blood', 'offset': [118, 123]}
--------------------------------------------------
{'cui': 'C0220880', 'tui': ['T170'], 'mention': 'morbidity', 'offset': [248, 257]}
--------------------------------------------------
{'cui': 'C0020538', 'tui': ['T047'], 'mention': 'Hypertension', 'offset': [631, 