# Step 1. Loading Public Clinical Notes Data from Hugging Face


In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# load in the clinical notes dataset from huggingface
df = pd.read_json("hf://datasets/AGBonnet/augmented-clinical-notes/augmented_notes_30K.jsonl", lines=True, nrows=300)



In [3]:
df

Unnamed: 0,note,conversation,idx,summary,full_note
0,"A a sixteen year-old girl, presented to our Ou...","Doctor: Good morning, what brings you to the O...",155216,"{\n""visit motivation"": ""Discomfort in the neck...","A a sixteen year-old girl, presented to our Ou..."
1,This is the case of a 56-year-old man that was...,"Doctor: Hi, how are you feeling today?\nPatien...",77465,"{\n""visit motivation"": ""Complaints of a dull p...",This is the case of a 56-year-old man that was...
2,A 36-year old female patient visited our hospi...,"Doctor: Hello, what brings you to the hospital...",133948,"{\n""visit motivation"": ""Pain and restricted ra...",A 36-year old female patient visited our hospi...
3,A 49-year-old male presented with a complaint ...,"Doctor: Good morning, Mr. [Patient's Name]. I'...",80176,"{\n""visit motivation"": ""Pain in the left proxi...",A 49-year-old male presented with a complaint ...
4,A 47-year-old male patient was referred to the...,"Doctor: Good morning, how are you feeling toda...",72232,"{\n""visit motivation"": ""Recurrent attacks of p...",A 47-year-old male patient was referred to the...
...,...,...,...,...,...
295,"A 36-year-old man, originally from Latin Ameri...","Doctor: Hello, what brings you in today?\nPati...",174877,"{\n""visit motivation"": ""Complaints of abdomina...","A 36-year-old man, originally from Latin Ameri..."
296,An otherwise healthy 22-year-old caucasian wom...,"Doctor: Good morning, how can I help you today...",41761,"{\n""visit motivation"": ""Accelerated growth of ...",An otherwise healthy 22-year-old caucasian wom...
297,"A 36-year-old man, originally from Latin Ameri...","Doctor: Good afternoon, sir. I understand that...",7876,"{\n""visit motivation"": ""Complaints of abdomina...","A 36-year-old man, originally from Latin Ameri..."
298,Our patient is a 38-year-old male who presente...,"Doctor: Hi there, how are you feeling today?\n...",182286,"{\n""visit motivation"": ""Right chest wall and s...",Our patient is a 38-year-old male who presente...


# Step 2. Importing scispaCy package and creation of model for Named Entity Linking

In [4]:
import spacy
import scispacy
from scispacy.linking import EntityLinker

In [6]:
# now we create our model instance which can be used to process biomedical text
nlp = spacy.load("en_core_sci_sm")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [7]:
# now we add a linker to the UMLS knowledgebase to our model pipeline
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "mesh"})

<scispacy.linking.EntityLinker at 0x31a9616d0>

In [8]:
linker = nlp.get_pipe("scispacy_linker")

# Step 3. Definition of Function for Extracting Named Entities from the Clinical Notes

In [9]:
def get_linked_entities_for_doc(text, nlp, linker):
  # get the document
  doc = nlp(text)
  # get the linked entities
  linked_entities = []
  for ent in doc.ents: # get all recognized entities
    for entry in ent._.kb_ents[:3]: # get the top 3 linked vocabulary terms for each entity
      linked_entities.append({
          'entity_name': ent.text,
          'cui': entry[0],
          'score': entry[1],
          'name': linker.kb.cui_to_entity[entry[0]].canonical_name,
          'definition': linker.kb.cui_to_entity[entry[0]].definition,
          'type_ids': ','.join(linker.kb.cui_to_entity[entry[0]].types),
      })
  return pd.DataFrame(linked_entities).drop_duplicates()


In [10]:
get_linked_entities_for_doc(df['full_note'][1], nlp, linker).query('score >= 0.9')

Unnamed: 0,entity_name,cui,score,name,definition,type_ids
0,patient,C0030705,0.986736,Patients,Individuals participating in the health care s...,T101
10,X-ray,C0034571,0.997294,X-ray image,"Used with organs, regions, and diseases for x-...",T169
11,X-ray,C0043309,0.997294,"Ray, X",Penetrating electromagnetic radiation emitted ...,T070
14,thorax,C0817096,0.978844,Thorace,The upper part of the trunk between the NECK a...,T029
16,tumor,C0027651,0.992931,Tumor,New abnormal growth of tissue. Malignant neopl...,T191
19,thoracic wall,C0205076,0.975917,Chest Wall,The total system of structures outside the lun...,T023
24,rib,C0035561,0.992131,Rib,A set of twelve curved bones which connect to ...,T023
25,lying,C0600261,0.949976,Lying,telling untruths,T055
28,lung,C0024109,0.983171,Lung,Either of the pair of organs occupying the cav...,T023
33,lungs,C0024109,0.998977,Lung,Either of the pair of organs occupying the cav...,T023


In [11]:
import pprint

In [12]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(df['full_note'][1])

('This is the case of a 56-year-old man that was complaining of a dump pain on '
 'the right back and a swelling right in this place for several weeks. The '
 'patient was in good state and very active. There was not any health problem '
 'in the past except a thoracic trauma at work one year ago. In that time the '
 'patient was diagnosed with a simple fracture of the 9th right rib without '
 'any other consequences.\\nOn the X-ray was seen a shadow in the lower part '
 'of the right hemithorax. After that, it was decided to perform a CT-scan of '
 'the thorax that revealed a tumor of the thoracic wall in the right '
 'hemithorax that measured 8 × 4 cm and had a heterogeneous density inside of '
 'it. The tumor had involved and destructed the 9th rib and was lying even in '
 'two adjacent intercostal spaces, but without involving the lung and muscular '
 'layers. On lung window of the CT-scan were seen micronodular infiltrations '
 'of both lungs with diameters up to 5 mm and only one

# Step 4. Extracting All Named Entities from The Clinical Notes

In [13]:
from tqdm import tqdm

In [14]:
linked_entity_dfs = []
for _, row in tqdm(df.iterrows(), total=len(df)):
  linked_entities = get_linked_entities_for_doc(row['full_note'], nlp, linker).query('score >= 0.9')
  linked_entity_dfs.append(
      linked_entities.assign(type_ids_lst=lambda x: x['type_ids'].str.split(','))
      .explode('type_ids_lst').assign(note_id=row['idx'])
    )

100%|█████████████████████████████████████| 300/300 [00:46<00:00,  6.49it/s]


In [15]:
linked_entities_all = pd.concat(linked_entity_dfs)

In [16]:
linked_entities_all

Unnamed: 0,entity_name,cui,score,name,definition,type_ids,type_ids_lst,note_id
0,girl,C0043210,0.992564,Woman,"Human females as cultural, psychological, soci...",T098,T098,155216
2,neck,C0027530,0.986622,Neck,The part of a human or animal body connecting ...,T029,T029,155216
9,posture,C1262869,0.990923,Postures,The position or physical attitude of the body.,T033,T033,155216
12,sitting position,C0277814,0.976172,Sitting Position,The state or act of one who sits; the posture ...,T033,T033,155216
14,head,C0018670,0.988843,Head,"The upper part of the human body, or the front...",T029,T029,155216
...,...,...,...,...,...,...,...,...
106,cardiopulmonary bypass,C0007202,0.977226,"Bypass, Cardiopulmonary",Diversion of the flow of blood from the entran...,T061,T061,90099
117,patient,C0030705,0.986736,Patients,Individuals participating in the health care s...,T101,T101,90099
121,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T073,90099
121,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T093,90099


In [17]:
# inspecting the top 50 occuring linked entity names
linked_entities_all['entity_name'].value_counts().head(50)

entity_name
patient                       281
surgery                       226
diagnosis                     180
treatment                     156
history                       154
hospital                      148
male                           98
emergency department           98
pain                           98
female                         80
procedure                      76
findings                       73
symptoms                       71
abdomen                        63
trauma                         62
time                           56
blood                          56
woman                          53
operating room                 52
physical examination           49
pathology                      46
complications                  45
intensive care unit            44
tumor                          43
hemoglobin                     42
skin                           42
period                         41
recurrence                     40
chemotherapy                   38
bi

# Step 5. Labeling Semantic Types for All Linked Entities

In [None]:
# first load in the file containing the labels for the semantic types
semantic_type_labels = pd.read_csv('https://github.com/expmed/arch_workshop_scispacy_entity_linking_ws11/raw/refs/heads/main/umls_terms.txt')

In [19]:
semantic_type_labels

Unnamed: 0,tui,label
0,T001,Organism
1,T002,Plant
2,T004,Fungus
3,T005,Virus
4,T007,Bacterium
...,...,...
122,T197,Inorganic Chemical
123,T200,Clinical Drug
124,T201,Clinical Attribute
125,T203,Drug Delivery Device


In [20]:
# now we add in these semantic type labels
linked_entities_final = linked_entities_all.merge(
    semantic_type_labels,
    left_on='type_ids_lst',
    right_on='tui',
    how='left'
)

In [21]:
linked_entities_final

Unnamed: 0,entity_name,cui,score,name,definition,type_ids,type_ids_lst,note_id,tui,label
0,girl,C0043210,0.992564,Woman,"Human females as cultural, psychological, soci...",T098,T098,155216,T098,Population Group
1,neck,C0027530,0.986622,Neck,The part of a human or animal body connecting ...,T029,T029,155216,T029,Body Location or Region
2,posture,C1262869,0.990923,Postures,The position or physical attitude of the body.,T033,T033,155216,T033,Finding
3,sitting position,C0277814,0.976172,Sitting Position,The state or act of one who sits; the posture ...,T033,T033,155216,T033,Finding
4,head,C0018670,0.988843,Head,"The upper part of the human body, or the front...",T029,T029,155216,T029,Body Location or Region
...,...,...,...,...,...,...,...,...,...,...
11537,cardiopulmonary bypass,C0007202,0.977226,"Bypass, Cardiopulmonary",Diversion of the flow of blood from the entran...,T061,T061,90099,T061,Therapeutic or Preventive Procedure
11538,patient,C0030705,0.986736,Patients,Individuals participating in the health care s...,T101,T101,90099,T101,Patient or Disabled Group
11539,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T073,90099,T073,Manufactured Object
11540,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T093,90099,T093,Health Care Related Organization


## Mini Exercise: Show the Top 20 Most Frequently Occuring Semantic Types Among Linked Entities

In [22]:
# Your solution below...

# Step 6. Utilizing Publicly Available Crosswalk File to Link Entities to MeSH Terms and SNOMED Terms

In [None]:
# first we load in mappings from UMLS concept unique ids to Medical Subject Heading (MeSH) terms
mrconso_mesh_mappings = pd.read_parquet('https://github.com/expmed/arch_workshop_scispacy_entity_linking_ws11/raw/refs/heads/main/mrconso_mesh.parquet')

In [25]:
mrconso_mesh_mappings

Unnamed: 0,CUI,string_description,ISPREF,CODE,abbreviated_source
0,C0000039,"1,2-Dipalmitoylphosphatidylcholine",Y,D015060,MSH
1,C0000052,"1,4-alpha-Glucan Branching Enzyme",N,D015061,MSH
2,C0000084,1-Carboxyglutamic Acid,N,D015055,MSH
3,C0000096,1-Methyl-3-isobutylxanthine,Y,D015056,MSH
4,C0000097,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",Y,D015632,MSH
...,...,...,...,...,...
30902,C5940653,Network Meta-Analysis,Y,D000099094,MSH
30903,C5940654,Intelligent Systems,Y,D000098403,MSH
30904,C5942421,Suicidal Ideation,Y,D059020,MSH
30905,C5942429,Humidifiers,Y,D000068998,MSH


In [26]:
# now we can merge the linked entities with the MeSH mappings
linked_entities_mesh = linked_entities_final.merge(
    mrconso_mesh_mappings,
    left_on='cui',
    right_on='CUI',
    how='left'
)

In [27]:
linked_entities_mesh

Unnamed: 0,entity_name,cui,score,name,definition,type_ids,type_ids_lst,note_id,tui,label,CUI,string_description,ISPREF,CODE,abbreviated_source
0,girl,C0043210,0.992564,Woman,"Human females as cultural, psychological, soci...",T098,T098,155216,T098,Population Group,C0043210,Women,Y,D014930,MSH
1,neck,C0027530,0.986622,Neck,The part of a human or animal body connecting ...,T029,T029,155216,T029,Body Location or Region,C0027530,Neck,N,D009333,MSH
2,posture,C1262869,0.990923,Postures,The position or physical attitude of the body.,T033,T033,155216,T033,Finding,,,,,
3,sitting position,C0277814,0.976172,Sitting Position,The state or act of one who sits; the posture ...,T033,T033,155216,T033,Finding,C0277814,Sitting Position,Y,D000077708,MSH
4,head,C0018670,0.988843,Head,"The upper part of the human body, or the front...",T029,T029,155216,T029,Body Location or Region,C0018670,Head,N,D006257,MSH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11537,cardiopulmonary bypass,C0007202,0.977226,"Bypass, Cardiopulmonary",Diversion of the flow of blood from the entran...,T061,T061,90099,T061,Therapeutic or Preventive Procedure,C0007202,Cardiopulmonary Bypass,N,D002315,MSH
11538,patient,C0030705,0.986736,Patients,Individuals participating in the health care s...,T101,T101,90099,T101,Patient or Disabled Group,C0030705,Patients,N,D010361,MSH
11539,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T073,90099,T073,Manufactured Object,C0021708,Intensive Care Units,Y,D007362,MSH
11540,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T093,90099,T093,Health Care Related Organization,C0021708,Intensive Care Units,Y,D007362,MSH


In [None]:
# now we load in the mappings from CUIs to Systematized Nomenclature of Medicine - Clinical Terms
mrconso_snomed_mappings = pd.read_parquet('https://github.com/expmed/arch_workshop_scispacy_entity_linking_ws11/raw/refs/heads/main/mrconso_snomed.parquet')

In [29]:
linked_entities_snomed = linked_entities_final.merge(
    mrconso_snomed_mappings,
    left_on='cui',
    right_on='CUI',
    how='left'
)

In [30]:
linked_entities_snomed

Unnamed: 0,entity_name,cui,score,name,definition,type_ids,type_ids_lst,note_id,tui,label,CUI,string_description,ISPREF,CODE,abbreviated_source
0,girl,C0043210,0.992564,Woman,"Human females as cultural, psychological, soci...",T098,T098,155216,T098,Population Group,C0043210,Woman,N,224526002,SNOMEDCT_US
1,neck,C0027530,0.986622,Neck,The part of a human or animal body connecting ...,T029,T029,155216,T029,Body Location or Region,C0027530,Neck structure,Y,45048000,SNOMEDCT_US
2,posture,C1262869,0.990923,Postures,The position or physical attitude of the body.,T033,T033,155216,T033,Finding,C1262869,Body position finding,Y,9851009,SNOMEDCT_US
3,sitting position,C0277814,0.976172,Sitting Position,The state or act of one who sits; the posture ...,T033,T033,155216,T033,Finding,C0277814,Sitting position,N,33586001,SNOMEDCT_US
4,head,C0018670,0.988843,Head,"The upper part of the human body, or the front...",T029,T029,155216,T029,Body Location or Region,C0018670,Head structure,Y,69536005,SNOMEDCT_US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14291,cardiopulmonary bypass,C0007202,0.977226,"Bypass, Cardiopulmonary",Diversion of the flow of blood from the entran...,T061,T061,90099,T061,Therapeutic or Preventive Procedure,C0007202,Cardiopulmonary bypass operation,Y,63697000,SNOMEDCT_US
14292,patient,C0030705,0.986736,Patients,Individuals participating in the health care s...,T101,T101,90099,T101,Patient or Disabled Group,C0030705,Patient,Y,116154003,SNOMEDCT_US
14293,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T073,90099,T073,Manufactured Object,C0021708,Intensive care unit,Y,309904001,SNOMEDCT_US
14294,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T093,90099,T093,Health Care Related Organization,C0021708,Intensive care unit,Y,309904001,SNOMEDCT_US


In [31]:
print(f"{len(linked_entities_snomed.dropna(subset=['CODE'])) / len(linked_entities_snomed) * 100}% of the entities have a SNOMED code")

79.3788472299944% of the entities have a SNOMED code


In [32]:
print(f"{len(linked_entities_mesh.dropna(subset=['CODE'])) / len(linked_entities_mesh) * 100}% of the entities have a MeSH code")

78.96378443943857% of the entities have a MeSH code


# Step 7. Utilize MeSH Hierarchy to Semantically Group Linked Entities
This file was originally downloaded from the NIH National Library of Medicine Website at [The Following Link](https://www.nlm.nih.gov/databases/download/mesh.html). The original file is in XML format, which I then processed and converted into a CSV file for ease of loading and reduced disk storage.

In [None]:
# now load in the MeSH Hierarchy file
mesh_hierarchy = pd.read_csv('https://github.com/expmed/arch_workshop_scispacy_entity_linking_ws11/raw/refs/heads/main/mesh_hierarchy.csv')

In [34]:
mesh_hierarchy

Unnamed: 0,UI,name,tree_number
0,D000001,Calcimycin,D02.355.291.933.125
1,D000001,Calcimycin,D02.540.576.625.125
2,D000001,Calcimycin,D03.633.100.221.173
3,D000001,Calcimycin,D04.345.241.654.125
4,D000001,Calcimycin,D04.345.674.625.125
...,...,...,...
64878,D000099090,"Surgical Procedures, Colorectal",E04.210.896
64879,D000099091,Medical Interpreting,L01.143.600
64880,D000099092,Nightclubs,J03.635
64881,D000099093,Aging in Place,I03.050.500.001


In [35]:
# now we link in the tree numbers to the MeSH mapped entities
linked_entities_mesh_hierarchy = linked_entities_mesh.merge(
    mesh_hierarchy[['UI', 'tree_number']],
    left_on='CODE',
    right_on='UI',
    how='inner'
)

In [36]:
linked_entities_mesh_hierarchy

Unnamed: 0,entity_name,cui,score,name,definition,type_ids,type_ids_lst,note_id,tui,label,CUI,string_description,ISPREF,CODE,abbreviated_source,UI,tree_number
0,girl,C0043210,0.992564,Woman,"Human females as cultural, psychological, soci...",T098,T098,155216,T098,Population Group,C0043210,Women,Y,D014930,MSH,D014930,M01.975
1,neck,C0027530,0.986622,Neck,The part of a human or animal body connecting ...,T029,T029,155216,T029,Body Location or Region,C0027530,Neck,N,D009333,MSH,D009333,A01.598
2,sitting position,C0277814,0.976172,Sitting Position,The state or act of one who sits; the posture ...,T033,T033,155216,T033,Finding,C0277814,Sitting Position,Y,D000077708,MSH,D000077708,G11.427.695.575
3,head,C0018670,0.988843,Head,"The upper part of the human body, or the front...",T029,T029,155216,T029,Body Location or Region,C0018670,Head,N,D006257,MSH,D006257,A01.456
4,neck muscles,C0027532,0.987768,"Muscles, Neck","The neck muscles consist of the platysma, sple...",T023,T023,155216,T023,"Body Part, Organ, or Organ Component",C0027532,Neck Muscles,Y,D009334,MSH,D009334,A02.633.567.650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15490,cardiopulmonary bypass,C0007202,0.977226,"Bypass, Cardiopulmonary",Diversion of the flow of blood from the entran...,T061,T061,90099,T061,Therapeutic or Preventive Procedure,C0007202,Cardiopulmonary Bypass,N,D002315,MSH,D002315,E04.292.413
15491,patient,C0030705,0.986736,Patients,Individuals participating in the health care s...,T101,T101,90099,T101,Patient or Disabled Group,C0030705,Patients,N,D010361,MSH,D010361,M01.643
15492,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T073,90099,T073,Manufactured Object,C0021708,Intensive Care Units,Y,D007362,MSH,D007362,N02.278.388.493
15493,Intensive Care Unit,C0021708,0.989781,Intensive Care Unit,Hospital units providing continuous surveillan...,"T073,T093",T093,90099,T093,Health Care Related Organization,C0021708,Intensive Care Units,Y,D007362,MSH,D007362,N02.278.388.493


In [37]:
set(linked_entities_mesh_hierarchy['tree_number'])

{'D12.776.210.500.910.900',
 'E05.242.251',
 'C26.558.276',
 'D04.210.500.247.222.159.478.387.300',
 'E01.370.225.998.329.810',
 'A02.513.514.287',
 'G01.750.770.578',
 'C17.800.428.260',
 'C04.557.470.200.400',
 'A03.556.875.875.440',
 'Z01.107.567.176',
 'C14.907.253.855.200.200.450',
 'E05.723.402',
 'C26.404.875.500',
 'C01.221.812.640.400.040',
 'A09.371.784',
 'B03',
 'A02.835.583.979',
 'C19.246.099.875',
 'E05.978.155',
 'D08.811.277.450.066',
 'C12.950.967.500',
 'E02.926.500.074',
 'A01.456.505.631',
 'E01.370.600.620',
 'C26.558.276.425.500.500',
 'A01.923.761.800.650',
 'M01.526.485.810.699.500',
 'A02.633.567.050.750',
 'D03.383.725.203.065',
 'C14.280.123.500.230',
 'G01.374.715',
 'D04.615.638.721.374.689',
 'C19.344.894.400',
 'C14.280.720',
 'I01.800',
 'G01.750.250.650.782',
 'B01.650.915',
 'H02.403.763.750',
 'F04.754.137.350',
 'A02.633.567.825',
 'C12.200.777.419.307',
 'A02.835.232.781.750',
 'D27.720.013',
 'A07.541.795.500',
 'D12.776.377.715.548.705.750',
 'D1

In [38]:
# format the mesh hierarchy as a lookup table/dictionary
mesh_dictionary = {
    row['tree_number']: row['name']
    for _, row in tqdm(mesh_hierarchy.iterrows(), total=len(mesh_hierarchy))
}

100%|██████████████████████████████| 64883/64883 [00:00<00:00, 67424.41it/s]


In [39]:
# now we specify a function to walk up the MeSH tree for each entity
def walk_mesh_hierarchy(entities_df, mesh_hierarchy):
  result = entities_df.copy()
  # get the set of distinct tree numbers in the dataset
  tree_nums = set(entities_df['tree_number'].tolist())
  # start at the top level
  level = 1
  # while we still have tree numbers to process
  while len(tree_nums) > 0:
    print(f"Processing level {level}")
    # save the mappings for the current level in a list
    level_mappings = []
    # keep track of tree numbers to remove after processing this level
    to_remove = set()
    # loop over the tree nums
    for tree_num in tree_nums:
      # get the prefix for the current tree level
      prefix = '.'.join(tree_num.split(".")[:level])
      # if the prefix is different from the tree number, save a mapping for the current level
      if prefix != tree_num:
        level_mappings.append({
            'tree_number': tree_num,
            f'level_{level}_tree_number': prefix,
            f'level_{level}_parent_name': mesh_hierarchy[prefix]
        })
      else:
        # we have already enumerated all ancestors if the prefix matches, so remove the tree number
        to_remove.add(tree_num)
    # merge in the mappings for the current level if we have any
    if len(level_mappings) > 0:
      result = result.merge(
          pd.DataFrame(level_mappings),
          on='tree_number',
          how='left'
      )
    # move one level down the tree
    level += 1
    # update the set of tree_nums
    tree_nums = tree_nums - to_remove
  # return the result dataframe
  return result


In [40]:
linked_entities_mesh_final = walk_mesh_hierarchy(
    linked_entities_mesh_hierarchy,
    mesh_dictionary
)

Processing level 1
Processing level 2
Processing level 3
Processing level 4
Processing level 5
Processing level 6
Processing level 7
Processing level 8
Processing level 9
Processing level 10
Processing level 11


In [41]:
linked_entities_mesh_final[['note_id', 'entity_name', 'name', 'definition', 'label', 'level_1_parent_name', 'level_2_parent_name', 'tree_number']]

Unnamed: 0,note_id,entity_name,name,definition,label,level_1_parent_name,level_2_parent_name,tree_number
0,155216,girl,Woman,"Human females as cultural, psychological, soci...",Population Group,Persons,,M01.975
1,155216,neck,Neck,The part of a human or animal body connecting ...,Body Location or Region,Body Regions,,A01.598
2,155216,sitting position,Sitting Position,The state or act of one who sits; the posture ...,Finding,Musculoskeletal and Neural Physiological Pheno...,Musculoskeletal Physiological Phenomena,G11.427.695.575
3,155216,head,Head,"The upper part of the human body, or the front...",Body Location or Region,Body Regions,,A01.456
4,155216,neck muscles,"Muscles, Neck","The neck muscles consist of the platysma, sple...","Body Part, Organ, or Organ Component",Musculoskeletal System,Muscles,A02.633.567.650
...,...,...,...,...,...,...,...,...
15490,90099,cardiopulmonary bypass,"Bypass, Cardiopulmonary",Diversion of the flow of blood from the entran...,Therapeutic or Preventive Procedure,"Surgical Procedures, Operative",Extracorporeal Circulation,E04.292.413
15491,90099,patient,Patients,Individuals participating in the health care s...,Patient or Disabled Group,Persons,,M01.643
15492,90099,Intensive Care Unit,Intensive Care Unit,Hospital units providing continuous surveillan...,Manufactured Object,Health Care Facilities Workforce and Services,Health Facilities,N02.278.388.493
15493,90099,Intensive Care Unit,Intensive Care Unit,Hospital units providing continuous surveillan...,Health Care Related Organization,Health Care Facilities Workforce and Services,Health Facilities,N02.278.388.493


# Exercises

In [None]:
# Exercise 1: Count the number of patient notes that mention respiratory tract diseases

In [None]:
# Exercise 2: For entities with a tree number prefixed by 'C' (Diseases) Rank them by number of notes mentioning each kind of disease
# Use the level 1 parent name


In [None]:
# Exercise 3: What are the 10 most frequent anatomical parts mentioned in notes tagged with Neoplasms?
# Note: MeSH terms categorized as anatomical have a tree number prefixed by 'A'

