# Step 1. Loading Public Clinical Notes Data from Hugging Face


In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Read the first 300 records of the JSON Lines clinical-notes dataset hosted on Hugging Face
df = pd.read_json(
    "hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl",
    lines=True,
    nrows=300,
)

In [None]:
df

# Step 2. Installation of scispaCy Package and Model for NEL

In [None]:
!pip install scispacy


In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz

# Step 3. Importing scispaCy package and creation of model for Named Entity Linking

In [None]:
import spacy
import scispacy
from scispacy.linking import EntityLinker

In [None]:
# Load the en_core_sci_sm pipeline installed in Step 2; `nlp` is the callable
# used in the following steps to process the clinical note text
nlp = spacy.load("en_core_sci_sm")

In [None]:
# Add an entity linker backed by the UMLS knowledge base to the pipeline.
# "umls" (rather than "mesh") is needed because the later steps join the linked ids
# against UMLS CUIs (MRCONSO crosswalks) and UMLS semantic type ids (TUIs); the MeSH
# knowledge base uses its own identifiers, so those joins would find no matches.
# Expect a sizeable one-time download of the linker index on first use.
# NOTE(review): resolve_abbreviations presumably only has an effect when an
# abbreviation detector is also in the pipeline -- confirm against the scispaCy docs.
# The guard keeps the cell re-runnable: adding a second pipe with the same name raises.
if "scispacy_linker" not in nlp.pipe_names:
  nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

In [None]:
# Keep a handle on the linker component so its knowledge base (linker.kb)
# can be queried for concept names, definitions and types
linker = nlp.get_pipe("scispacy_linker")

# Step 4. Definition of Function for Extracting Named Entities from the Clinical Notes

In [None]:
def get_linked_entities_for_doc(text, nlp, linker, top_k=3):
  """Process one note and tabulate the leading linked concepts of every recognized entity.

  Args:
    text: Raw note text to process.
    nlp: Callable pipeline; `nlp(text)` must return a document whose `.ents` each
      expose `.text` and `._.kb_ents` (a sequence of `(concept_id, score)` entries).
    linker: Linker whose `kb.cui_to_entity` maps a concept id to an object with
      `canonical_name`, `definition` and `types` attributes.
    top_k: Maximum number of candidate concepts kept per entity, taken from the
      front of `kb_ents` (default 3).

  Returns:
    A de-duplicated DataFrame with the columns `entity_name`, `cui`, `score`,
    `name`, `definition` and `type_ids` (type ids joined with commas). The columns
    are present even when no entity is found, so callers can always filter on
    `score`.
  """
  columns = ['entity_name', 'cui', 'score', 'name', 'definition', 'type_ids']
  doc = nlp(text)
  linked_entities = []
  for ent in doc.ents:
    for entry in ent._.kb_ents[:top_k]:
      cui, score = entry[0], entry[1]
      # look the concept up once instead of once per field
      concept = linker.kb.cui_to_entity[cui]
      linked_entities.append({
          'entity_name': ent.text,
          'cui': cui,
          'score': score,
          'name': concept.canonical_name,
          'definition': concept.definition,
          'type_ids': ','.join(concept.types),
      })
  # passing the column list keeps the schema stable for notes without any entities
  return pd.DataFrame(linked_entities, columns=columns).drop_duplicates()


In [None]:
# Preview the confident links (score >= 0.9) found in the note at index label 1
get_linked_entities_for_doc(
    df.loc[1, 'full_note'],
    nlp,
    linker,
).query('score >= 0.9')

In [None]:
import pprint

In [None]:
# Pretty-print the raw text of the same note (index label 1) so it can be
# compared against the entities extracted from it above
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(df['full_note'][1])

# Step 5. Extracting All Named Entities from The Clinical Notes

In [None]:
from tqdm import tqdm

In [None]:
# Build one entity table per note, keeping only confident links (score >= 0.9)
linked_entity_dfs = []
for note_id, note_text in tqdm(zip(df['idx'], df['full_note']), total=len(df)):
  confident_links = get_linked_entities_for_doc(note_text, nlp, linker).query('score >= 0.9')
  # split the comma-joined type ids and emit one row per semantic type
  per_type_rows = confident_links.assign(
      type_ids_lst=confident_links['type_ids'].str.split(',')
  ).explode('type_ids_lst')
  # tag every row with the id of the note it came from
  linked_entity_dfs.append(per_type_rows.assign(note_id=note_id))

In [None]:
# Stack the per-note entity tables into one long DataFrame
# (one row per note / entity / linked candidate / semantic type id)
linked_entities_all = pd.concat(linked_entity_dfs)

In [None]:
linked_entities_all

In [None]:
# inspect the 50 most frequently occurring entity mention texts across all notes
linked_entities_all['entity_name'].value_counts().head(50)

# Step 6. Labeling Semantic Types for All Linked Entities

In [None]:
# Load the lookup file that labels each semantic type id (tui)
semantic_type_labels = pd.read_csv(
    'https://github.com/btwooton/arch_workshop_scispacy_entity_linking_ws11/raw/refs/heads/main/umls_terms.txt'
)

In [None]:
semantic_type_labels

In [None]:
# Attach the semantic type labels to every (entity, type id) row;
# the left join keeps rows whose type id has no label
linked_entities_final = pd.merge(
    linked_entities_all,
    semantic_type_labels,
    how='left',
    left_on='type_ids_lst',
    right_on='tui',
)

In [None]:
linked_entities_final

## Mini Exercise: Show the Top 20 Most Frequently Occurring Semantic Types Among Linked Entities

In [None]:
# Your solution below...

# Step 7. Utilizing Publicly Available Crosswalk File to Link Entities to MeSH Terms and SNOMED Terms

In [None]:
# Load the crosswalk from UMLS concept unique ids (CUIs) to Medical Subject Heading (MeSH) terms
mrconso_mesh_mappings = pd.read_parquet(
    'https://github.com/btwooton/arch_workshop_scispacy_entity_linking_ws11/raw/refs/heads/main/mrconso_mesh.parquet'
)

In [None]:
mrconso_mesh_mappings

In [None]:
# Join the linked entities to the MeSH crosswalk on the concept id;
# the left join keeps entities that have no MeSH mapping
linked_entities_mesh = pd.merge(
    linked_entities_final,
    mrconso_mesh_mappings,
    how='left',
    left_on='cui',
    right_on='CUI',
)

In [None]:
linked_entities_mesh

In [None]:
# Load the crosswalk from CUIs to Systematized Nomenclature of Medicine - Clinical Terms (SNOMED CT)
mrconso_snomed_mappings = pd.read_parquet(
    'https://github.com/btwooton/arch_workshop_scispacy_entity_linking_ws11/raw/refs/heads/main/mrconso_snomed.parquet'
)

In [None]:
# Join the linked entities to the SNOMED crosswalk on the concept id;
# the left join keeps entities that have no SNOMED mapping
linked_entities_snomed = pd.merge(
    linked_entities_final,
    mrconso_snomed_mappings,
    how='left',
    left_on='cui',
    right_on='CUI',
)

In [None]:
linked_entities_snomed

In [None]:
# Report the share of rows in the SNOMED-merged table that carry a non-null CODE.
# NOTE(review): this is a row-level ratio -- rows were exploded per semantic type in
# Step 5 and the left merge may add one row per matching code -- so it is not a count
# of distinct entities; confirm the intended unit.
print(f"{len(linked_entities_snomed.dropna(subset=['CODE'])) / len(linked_entities_snomed) * 100}% of the entities have a SNOMED code")

In [None]:
# Report the share of rows in the MeSH-merged table that carry a non-null CODE.
# NOTE(review): this is a row-level ratio -- rows were exploded per semantic type in
# Step 5 and the left merge may add one row per matching code -- so it is not a count
# of distinct entities; confirm the intended unit.
print(f"{len(linked_entities_mesh.dropna(subset=['CODE'])) / len(linked_entities_mesh) * 100}% of the entities have a MeSH code")

# Step 8. Utilize MeSH Hierarchy to Semantically Group Linked Entities
The MeSH hierarchy file loaded below was originally downloaded from the NIH National Library of Medicine website at [the following link](https://www.nlm.nih.gov/databases/download/mesh.html). The original file is in XML format; I processed it and converted it into a CSV file for ease of loading and reduced disk storage.

In [None]:
# Load the MeSH hierarchy (CSV conversion of the NLM download described above)
mesh_hierarchy = pd.read_csv(
    'https://github.com/btwooton/arch_workshop_scispacy_entity_linking_ws11/raw/refs/heads/main/mesh_hierarchy.csv'
)

In [None]:
mesh_hierarchy

In [None]:
# Attach MeSH tree numbers to the MeSH-mapped entities; the inner join drops
# entities whose CODE has no matching UI in the hierarchy
linked_entities_mesh_hierarchy = pd.merge(
    linked_entities_mesh,
    mesh_hierarchy[['UI', 'tree_number']],
    how='inner',
    left_on='CODE',
    right_on='UI',
)

In [None]:
linked_entities_mesh_hierarchy

In [None]:
set(linked_entities_mesh_hierarchy['tree_number'])

In [None]:
# Format the MeSH hierarchy as a tree-number -> name lookup dictionary.
# Built straight from the two columns instead of a row-by-row iterrows() loop;
# as before, a later row overwrites an earlier one that has the same tree number.
mesh_dictionary = dict(zip(mesh_hierarchy['tree_number'], mesh_hierarchy['name']))

In [None]:
# now we specify a function to walk up the MeSH tree for each entity
def walk_mesh_hierarchy(entities_df, mesh_hierarchy):
  """Add the ancestors of every MeSH tree number as `level_N_*` columns.

  For each distinct tree number (e.g. `C08.381.495`) the dot-separated prefixes are
  enumerated from the shortest one (`C08`, then `C08.381`, ...). Every prefix that is
  shorter than the tree number itself yields the columns `level_N_tree_number` and
  `level_N_parent_name`, where N is the number of segments in the prefix.

  Args:
    entities_df: DataFrame with a `tree_number` column. Missing values are allowed;
      those rows simply receive no ancestor information.
    mesh_hierarchy: Mapping from tree number to name (a lookup dictionary, not the
      hierarchy DataFrame).

  Returns:
    A copy of `entities_df` with the level columns left-merged in. Cells are empty
    where a tree number has no ancestor at that level or where the ancestor is
    absent from `mesh_hierarchy`.
  """
  result = entities_df.copy()
  # missing tree numbers cannot be split into segments, so leave them out of the walk
  tree_nums = set(entities_df['tree_number'].dropna())
  # start at the top level
  level = 1
  while tree_nums:
    print(f"Processing level {level}")
    level_mappings = []
    # tree numbers whose ancestors have all been enumerated once this level is done
    exhausted = set()
    for tree_num in tree_nums:
      prefix = '.'.join(tree_num.split('.')[:level])
      if prefix == tree_num:
        # the prefix has grown to the full tree number: nothing left above it
        exhausted.add(tree_num)
        continue
      level_mappings.append({
          'tree_number': tree_num,
          f'level_{level}_tree_number': prefix,
          # .get keeps the walk going when an ancestor is missing from the lookup
          f'level_{level}_parent_name': mesh_hierarchy.get(prefix),
      })
    # merge in the mappings for the current level if we have any
    if level_mappings:
      result = result.merge(
          pd.DataFrame(level_mappings),
          on='tree_number',
          how='left'
      )
    # move one level down the tree
    level += 1
    tree_nums -= exhausted
  return result


In [None]:
# Add the level_N ancestor columns to the MeSH-mapped entities. Note that the second
# argument is the tree-number -> name dictionary, not the mesh_hierarchy DataFrame.
linked_entities_mesh_final = walk_mesh_hierarchy(
    linked_entities_mesh_hierarchy,
    mesh_dictionary
)

In [None]:
linked_entities_mesh_final[['note_id', 'entity_name', 'name', 'definition', 'label', 'level_1_parent_name', 'level_2_parent_name', 'tree_number']]

# Exercises

In [None]:
# Exercise 1: Count the number of patient notes that mention respiratory tract diseases

In [None]:
# Exercise 2: For entities with a tree number prefixed by 'C' (Diseases) Rank them by number of notes mentioning each kind of disease
# Use the level 1 parent name


In [None]:
# Exercise 3: What are the 10 most frequent anatomical parts mentioned in notes tagged with Neoplasms?
# Note: MeSH terms categorized as anatomical have a tree number prefixed by 'A'

