# Named Entity Recognition

## Librerie Utili

In [1]:
import pandas as pd

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT

from spacy import displacy

## Lettura del NOTEEVENTS preprocessato

In [2]:
# Load the preprocessed NOTEEVENTS table: semicolon-separated, with the
# column names on the first row.
NOTEEVENTS_PATH = "preprocessed_noteevents.csv"
data = pd.read_csv(NOTEEVENTS_PATH, header=0, sep=";")

# Preview the first five notes.
data.head(n=5)

Unnamed: 0,ROW_ID,SUBJECT_ID,CHARTDATE,TEXT,GENDER,DOB,DOD,EXPIRE_FLAG,LEMMATIZED TEXT,TOKEN_COUNT_COMP
0,722,27431,2148-03-07,Admission Date: [**2148-3-2**] D...,M,2070-10-04 00:00:00,2148-04-08 00:00:00,1,Chief Complaint Hypotension/hypoxia Major Surg...,1005.25
1,723,27431,2148-03-29,Admission Date: [**2148-3-20**] ...,M,2070-10-04 00:00:00,2148-04-08 00:00:00,1,Chief Complaint Hypoxia Major Surgical Invasiv...,1676.5
2,724,27431,2148-04-08,Admission Date: [**2148-4-8**] D...,M,2070-10-04 00:00:00,2148-04-08 00:00:00,1,Chief Complaint hypoxia s/p PEA arrest Major S...,379.75
3,1907,21323,2139-05-04,Admission Date: [**2139-4-26**] ...,M,2082-12-21 00:00:00,,0,Chief Complaint black stool Major Surgical Inv...,504.0
4,1905,21323,2135-08-06,Admission Date: [**2135-7-30**] Discharge...,M,2082-12-21 00:00:00,,0,HISTORY PRESENT ILLNESS Patient 52-year-old ma...,561.25


## Modelli NER Hugging Face

### Modello TaughtNet

In [3]:
# TaughtNet checkpoint; the outputs below show it tagging 'disease' and
# 'chem' entity groups.
TAUGHTNET_CHECKPOINT = "marcopost-it/TaughtNet-disease-chem-gene"

tokenizer = AutoTokenizer.from_pretrained(TAUGHTNET_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(TAUGHTNET_CHECKPOINT)

# aggregation_strategy='first' makes the pipeline return grouped entities
# ('entity_group' records) rather than one record per token.
pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy='first',
)

In [4]:
# Cap the TaughtNet tokenizer at 512 tokens.
# NOTE(review): presumably the checkpoint does not declare a usable maximum
# length and the pipeline relies on this value to truncate long notes — confirm.
TAUGHTNET_MAX_TOKENS = 512
tokenizer.model_max_length = TAUGHTNET_MAX_TOKENS

In [5]:
# Run TaughtNet on the raw text of the first note and list every entity found.
ner = pipe(data.loc[0, 'TEXT'])

for found in ner:
    print(found)

{'entity_group': 'disease', 'score': 0.2921802, 'word': ' schizophrenia,', 'start': 446, 'end': 460}
{'entity_group': 'disease', 'score': 0.26068684, 'word': ' CAD,', 'start': 461, 'end': 465}
{'entity_group': 'disease', 'score': 0.25280395, 'word': ' HTN,', 'start': 466, 'end': 470}
{'entity_group': 'disease', 'score': 0.296749, 'word': ' dementia', 'start': 471, 'end': 479}
{'entity_group': 'disease', 'score': 0.30921772, 'word': ' FTT', 'start': 496, 'end': 499}
{'entity_group': 'chem', 'score': 0.30997872, 'word': ' CTX.\nSubsequently,', 'start': 849, 'end': 867}
{'entity_group': 'disease', 'score': 0.23614524, 'word': ' diarrhea.', 'start': 1119, 'end': 1128}
{'entity_group': 'chem', 'score': 0.31843758, 'word': ' Levaquin', 'start': 1427, 'end': 1435}
{'entity_group': 'chem', 'score': 0.30741656, 'word': ' FLagyl', 'start': 1440, 'end': 1446}
{'entity_group': 'disease', 'score': 0.24316074, 'word': ' aspiration PNA\nalthough', 'start': 1460, 'end': 1483}




In [6]:
# Same TaughtNet pipeline, this time on the lemmatized text of the first note,
# so the two outputs can be compared.
ner = pipe(data.loc[0, 'LEMMATIZED TEXT'])

for found in ner:
    print(found)

{'entity_group': 'disease', 'score': 0.29073653, 'word': ' Hypotension/hypoxia', 'start': 16, 'end': 35}
{'entity_group': 'disease', 'score': 0.30189118, 'word': ' schizophrenia', 'start': 163, 'end': 176}
{'entity_group': 'disease', 'score': 0.28751686, 'word': ' CAD', 'start': 177, 'end': 180}
{'entity_group': 'disease', 'score': 0.25589857, 'word': ' HTN dementia', 'start': 181, 'end': 193}
{'entity_group': 'disease', 'score': 0.25982344, 'word': ' hypoxia', 'start': 198, 'end': 205}
{'entity_group': 'disease', 'score': 0.23587082, 'word': ' FTT', 'start': 206, 'end': 209}
{'entity_group': 'chem', 'score': 0.31625417, 'word': ' levaquin', 'start': 405, 'end': 413}
{'entity_group': 'chem', 'score': 0.29079407, 'word': ' CTX', 'start': 447, 'end': 450}
{'entity_group': 'disease', 'score': 0.23271823, 'word': ' cough', 'start': 628, 'end': 633}
{'entity_group': 'disease', 'score': 0.2406837, 'word': ' diarrhea', 'start': 634, 'end': 642}
{'entity_group': 'disease', 'score': 0.22855523,

### Modello BERT Clinical NER

In [7]:
# Clinical NER checkpoint; the outputs below show 'problem' and 'treatment'
# entity groups.
CLINICAL_NER_CHECKPOINT = "samrawal/bert-base-uncased_clinical-ner"

tokenizer2 = AutoTokenizer.from_pretrained(CLINICAL_NER_CHECKPOINT)
model2 = AutoModelForTokenClassification.from_pretrained(CLINICAL_NER_CHECKPOINT)

# aggregation_strategy='first' makes the pipeline return grouped entities
# rather than one record per token.
pipe2 = pipeline(
    "ner",
    model=model2,
    tokenizer=tokenizer2,
    aggregation_strategy='first',
)

In [8]:
# Run the clinical NER pipeline on the raw text of the first note and list
# every entity found.
ner2 = pipe2(data.loc[0, 'TEXT'])

for found in ner2:
    print(found)

{'entity_group': 'problem', 'score': 0.9958555, 'word': 'known allergies', 'start': 193, 'end': 208}
{'entity_group': 'problem', 'score': 0.99867237, 'word': 'hypotension', 'start': 281, 'end': 292}
{'entity_group': 'problem', 'score': 0.9961487, 'word': 'hypoxia', 'start': 293, 'end': 300}
{'entity_group': 'treatment', 'score': 0.9305995, 'word': 'dobhoff tube', 'start': 353, 'end': 365}
{'entity_group': 'treatment', 'score': 0.76258314, 'word': 'arterial line', 'start': 379, 'end': 392}
{'entity_group': 'problem', 'score': 0.9971476, 'word': 'schizophrenia', 'start': 446, 'end': 459}
{'entity_group': 'problem', 'score': 0.9967463, 'word': 'cad', 'start': 461, 'end': 464}
{'entity_group': 'problem', 'score': 0.99339914, 'word': 'htn', 'start': 466, 'end': 469}
{'entity_group': 'problem', 'score': 0.9948183, 'word': 'dementia', 'start': 471, 'end': 479}
{'entity_group': 'problem', 'score': 0.9960675, 'word': 'hypoxia', 'start': 484, 'end': 491}
{'entity_group': 'problem', 'score': 0.97

In [9]:
# Same clinical NER pipeline, this time on the lemmatized text of the first
# note, so the two outputs can be compared.
ner2 = pipe2(data.loc[0, 'LEMMATIZED TEXT'])

for found in ner2:
    print(found)

{'entity_group': 'problem', 'score': 0.99713194, 'word': 'hypotension', 'start': 16, 'end': 27}
{'entity_group': 'problem', 'score': 0.9933426, 'word': 'hypoxia', 'start': 28, 'end': 35}
{'entity_group': 'treatment', 'score': 0.95294684, 'word': 'major surgical invasive procedure placement', 'start': 36, 'end': 79}
{'entity_group': 'treatment', 'score': 0.8965652, 'word': 'dobhoff tube placement', 'start': 80, 'end': 102}
{'entity_group': 'treatment', 'score': 0.8915086, 'word': 'line', 'start': 112, 'end': 116}
{'entity_group': 'problem', 'score': 0.9921606, 'word': 'schizophrenia', 'start': 163, 'end': 176}
{'entity_group': 'problem', 'score': 0.92668873, 'word': 'cad', 'start': 177, 'end': 180}
{'entity_group': 'problem', 'score': 0.9257811, 'word': 'htn', 'start': 181, 'end': 184}
{'entity_group': 'problem', 'score': 0.9828106, 'word': 'dementia', 'start': 185, 'end': 193}
{'entity_group': 'problem', 'score': 0.99694186, 'word': 'hypoxia', 'start': 198, 'end': 205}
{'entity_group':

### Modello Biomedical NER

In [10]:
# Biomedical NER checkpoint; the outputs below show groups such as
# 'Sign_symptom', 'Disease_disorder' and 'Therapeutic_procedure'.
BIOMEDICAL_NER_CHECKPOINT = "d4data/biomedical-ner-all"

tokenizer3 = AutoTokenizer.from_pretrained(BIOMEDICAL_NER_CHECKPOINT)
model3 = AutoModelForTokenClassification.from_pretrained(BIOMEDICAL_NER_CHECKPOINT)

# aggregation_strategy='first' makes the pipeline return grouped entities
# rather than one record per token.
pipe3 = pipeline(
    "ner",
    model=model3,
    tokenizer=tokenizer3,
    aggregation_strategy='first',
)

In [11]:
# Run the biomedical NER pipeline on the raw text of the first note and list
# every entity found.
ner3 = pipe3(data.loc[0, 'TEXT'])

for found in ner3:
    print(found)

{'entity_group': 'Sign_symptom', 'score': 0.60468596, 'word': 'hypoxia', 'start': 293, 'end': 300}
{'entity_group': 'Therapeutic_procedure', 'score': 0.9680628, 'word': 'dobhoff tube', 'start': 353, 'end': 365}
{'entity_group': 'Therapeutic_procedure', 'score': 0.7064111, 'word': 'arterial line', 'start': 379, 'end': 392}
{'entity_group': 'Lab_value', 'score': 0.9593899, 'word': '77', 'start': 422, 'end': 424}
{'entity_group': 'Sign_symptom', 'score': 0.73298556, 'word': 'dementia p / w', 'start': 471, 'end': 483}
{'entity_group': 'Sign_symptom', 'score': 0.9201439, 'word': 'hypoxia', 'start': 484, 'end': 491}
{'entity_group': 'Disease_disorder', 'score': 0.94587755, 'word': 'ftt', 'start': 496, 'end': 499}
{'entity_group': 'Disease_disorder', 'score': 0.99389136, 'word': 'pt', 'start': 538, 'end': 540}
{'entity_group': 'Sign_symptom', 'score': 0.99994147, 'word': 'desaturation', 'start': 559, 'end': 571}
{'entity_group': 'Lab_value', 'score': 0.8939878, 'word': "mid 80 ' s", 'start': 

In [12]:
# Same biomedical NER pipeline, this time on the lemmatized text of the first
# note, so the two outputs can be compared.
ner3 = pipe3(data.loc[0, 'LEMMATIZED TEXT'])

for found in ner3:
    print(found)

{'entity_group': 'Sign_symptom', 'score': 0.99497956, 'word': 'hypotension', 'start': 16, 'end': 27}
{'entity_group': 'Sign_symptom', 'score': 0.9903528, 'word': 'hypoxia', 'start': 28, 'end': 35}
{'entity_group': 'Therapeutic_procedure', 'score': 0.932977, 'word': 'surgical invasive procedure', 'start': 42, 'end': 69}
{'entity_group': 'Therapeutic_procedure', 'score': 0.84485596, 'word': 'dobhoff tube placement', 'start': 80, 'end': 102}
{'entity_group': 'Diagnostic_procedure', 'score': 0.67839265, 'word': 'arterial line', 'start': 103, 'end': 116}
{'entity_group': 'Disease_disorder', 'score': 0.81617785, 'word': 'present', 'start': 125, 'end': 132}
{'entity_group': 'Disease_disorder', 'score': 0.7580083, 'word': 'illness', 'start': 133, 'end': 140}
{'entity_group': 'Lab_value', 'score': 0.6949835, 'word': '77 yo', 'start': 141, 'end': 146}
{'entity_group': 'Other_event', 'score': 0.2011465, 'word': 'nh', 'start': 147, 'end': 149}
{'entity_group': 'Detailed_description', 'score': 0.38

## Modello MedCAT

### Modelli clinici richiesti da MedCAT

In [13]:
# Folder holding the MedCAT model files (trailing slash is required because
# the paths below are built by plain string formatting).
DATA_DIR = "./data/"

# Vocabulary and concept database files loaded in the next cell.
vocab_path = f"{DATA_DIR}vocab.dat"
cdb_path = f"{DATA_DIR}cdb-medmen.dat"

### Caricamento dei modelli

In [14]:
# Load the MedCAT Concept Database (CDB) from the configured path.
cdb = CDB.load(cdb_path)

# Load the MedCAT vocabulary from the configured path.
vocab = Vocab.load(vocab_path)

# Load the "Status" meta-annotation model (optional for MedCAT; later cells use
# its 'Affirmed' value to filter annotations). The folder is resolved against
# DATA_DIR like the other model files, so relocating the data directory only
# requires changing DATA_DIR.
mc_status = MetaCAT.load(DATA_DIR + "Status/")

### Creazione del Medical Concept Annotation Tool (CAT)

In [15]:
# Assemble the MedCAT annotator (CAT) from the loaded concept database, the
# configuration stored on that database, the vocabulary, and the Status
# meta-annotation model.
meta_models = [mc_status]
cat = CAT(
    cdb=cdb,
    vocab=vocab,
    config=cdb.config,
    meta_cats=meta_models,
)

### Seleziono solo le entità di interesse

In [16]:
# Restrict entity linking to the following semantic types
# (abbreviation | type id | name):
#   bpoc | T023 | Body Part, Organ, or Organ Component
#   lbtr | T034 | Laboratory or Test Result
#   patf | T046 | Pathologic Function
#   dsyn | T047 | Disease or Syndrome
#   mobd | T048 | Mental or Behavioral Dysfunction
#   lbpr | T059 | Laboratory Procedure
#   diap | T060 | Diagnostic Procedure
#   topp | T061 | Therapeutic or Preventive Procedure
#   phsu | T121 | Pharmacologic Substance
#                 (NOTE: including it also brings in other substances such as
#                 vitamins, enzymes, etc.)
#   sosy | T184 | Sign or Symptom
type_ids_filter = ['T023', 'T034', 'T046', 'T047', 'T048', 'T059', 'T060', 'T061', 'T121', 'T184']

# Collect the CUIs the concept database registers under each selected type id
# and install their union as the linking filter.
type_id_to_cuis = cat.cdb.addl_info['type_id2cuis']
cui_filters = set().union(*(type_id_to_cuis[type_id] for type_id in type_ids_filter))
cat.cdb.config.linking['filters']['cuis'] = cui_filters

In [17]:
# Highlight colour (hex) for each semantic type name; used as the displaCy
# colour map when rendering the annotated text.
# NOTE(review): 'Disease or Syndrome' and 'Pathologic Function' share the same
# colour (#B5EAD7), so they look identical in the rendering — confirm this is
# intended.
color_dict = {
    'Body Part, Organ, or Organ Component': '#FFCE80',  # Pastel orange
    'Laboratory or Test Result': '#FFF9C4',             # Light yellow
    'Disease or Syndrome': '#B5EAD7',                   # Light aqua green
    'Mental or Behavioral Dysfunction': '#F0B2FF',      # Pastel purple
    'Laboratory Procedure': '#D0D9FF',                  # Light blue
    'Diagnostic Procedure': '#FFD9EC',                  # Light pink
    'Therapeutic or Preventive Procedure': '#C4FFFF',   # Light sky blue
    'Pharmacologic Substance': '#FFDAB9',               # Pastel peach
    'Sign or Symptom': '#FFC4F3',                       # Light magenta
    'Pathologic Function': '#B5EAD7'                    # Light aqua green
}

### Annotazione su testo NON preprocessato

In [18]:
# Annotate the raw text of the first note with MedCAT.
entities = cat.get_entities(data.loc[0, 'TEXT'])

# Keep only annotations whose Status meta-annotation is 'Affirmed'. Each kept
# annotation becomes a (name, primary type) pair: drugs are reported with the
# text as written in the note ('source_value'), everything else with the
# concept's 'pretty_name'.
extracted_data = [
    (
        ann['source_value'] if ann['types'][0] == 'Pharmacologic Substance' else ann['pretty_name'],
        ann['types'][0],
    )
    for ann in entities['entities'].values()
    if ann['meta_anns']['Status']['value'] == 'Affirmed'
]

# Tabulate the pairs and show them.
df = pd.DataFrame(extracted_data, columns=['Nome', 'Entità'])
print(df)

                          Nome                               Entità
0                     MEDICINE              Pharmacologic Substance
1                        DRUGS  Therapeutic or Preventive Procedure
2                      Hypoxia                  Pathologic Function
3         procedures treatment  Therapeutic or Preventive Procedure
4            placement of tube  Therapeutic or Preventive Procedure
..                         ...                                  ...
126       Hypertensive disease                  Disease or Syndrome
127  Coronary Arteriosclerosis                  Disease or Syndrome
128       Hypertensive disease                  Disease or Syndrome
129       Hypertensive disease                  Disease or Syndrome
130   Cerebrovascular accident                  Disease or Syndrome

[131 rows x 2 columns]


### Annotazione su testo preprocessato

In [19]:
# Annotate the lemmatized text of the first note with MedCAT.
entities_lemma = cat.get_entities(data.loc[0, 'LEMMATIZED TEXT'])

# Keep only annotations whose Status meta-annotation is 'Affirmed'. Each kept
# annotation becomes a (name, primary type) pair: drugs are reported with the
# text as written in the note ('source_value'), everything else with the
# concept's 'pretty_name'.
extracted_data = [
    (
        ann['source_value'] if ann['types'][0] == 'Pharmacologic Substance' else ann['pretty_name'],
        ann['types'][0],
    )
    for ann in entities_lemma['entities'].values()
    if ann['meta_anns']['Status']['value'] == 'Affirmed'
]

# Tabulate the pairs and show them.
df = pd.DataFrame(extracted_data, columns=['Nome', 'Entità'])
print(df)

                                     Nome  \
0                                 Hypoxia   
1                    procedures treatment   
2                       placement of tube   
3                           Schizophrenia   
4               Coronary Arteriosclerosis   
5                    Hypertensive disease   
6                                Dementia   
7                                 Hypoxia   
8                  Fertilization in Vitro   
9                                 Hypoxia   
10                              Dizziness   
11                    Electrocardiography   
12                    Electrocardiography   
13                          Schizophrenia   
14                      Mental Depression   
15                   Hypertensive disease   
16                               Dementia   
17              Coronary Arteriosclerosis   
18                 Sternotomy (procedure)   
19         Coronary Artery Bypass Surgery   
20                                Hypoxia   
21        

### Visualizzazione testo annotato

In [20]:
# Render the lemmatized text of the first note with its affirmed MedCAT
# entities highlighted. displaCy is used in "manual" mode, i.e. it is given a
# plain dict with the text and the entity spans instead of a spaCy Doc.
# (`displacy` is imported in the imports cell at the top of the notebook.)
doc = {
    "text": data['LEMMATIZED TEXT'][0],
    # One span per affirmed annotation, labelled with its primary semantic type.
    "ents": [
        {
            'start': value['start'],
            'end': value['end'],
            'label': value['types'][0],
        }
        for value in entities_lemma['entities'].values()
        if value['meta_anns']['Status']['value'] == 'Affirmed'
    ],
    "title": None,
}

# Show only the labels that have a colour assigned, using that colour map.
options = {"ents": list(color_dict.keys()), "colors": color_dict}

displacy.render(doc, style='ent', options=options, jupyter=True, manual=True)

## Salvataggio delle entità estratte dal Preprocessed NOTEEVENTS in un file CSV

In [21]:
# Annotate every preprocessed note with MedCAT and save one row per note:
# one column per semantic type (holding the comma-separated, de-duplicated
# entity names found for that type) plus the note identifiers.
risultati_finali = []  # one dict (future CSV row) per note

for text, subject_id, note_id in zip(
    data['LEMMATIZED TEXT'], data['SUBJECT_ID'], data['ROW_ID']
):
    entities_tot = cat.get_entities(text)

    # Semantic type name -> set of distinct entity names found in this note.
    categorie_dict = {}
    for value in entities_tot['entities'].values():
        # Skip annotations the Status meta-model does not mark as affirmed.
        if value['meta_anns']['Status']['value'] != 'Affirmed':
            continue
        # Skip annotations whose primary type is not one of the selected types.
        if value['type_ids'][0] not in type_ids_filter:
            continue

        categoria = value['types'][0]
        # Drugs are reported with the text as written in the note; everything
        # else with the concept's pretty name.
        if categoria == 'Pharmacologic Substance':
            nome = value['source_value']
        else:
            nome = value['pretty_name']
        categorie_dict.setdefault(categoria, set()).add(nome)

    # Categories are emitted alphabetically and names are sorted, so the CSV
    # content is identical from one run to the next (iterating a set of
    # strings directly has no stable order across interpreter sessions).
    record = {
        categoria: ', '.join(sorted(nomi))
        for categoria, nomi in sorted(categorie_dict.items())
    }
    # A note with no matching entity still yields a row with its identifiers.
    record['Subject ID'] = subject_id
    record['Note ID'] = note_id
    risultati_finali.append(record)

# Build the table once from all records; categories missing from a note are
# left empty. Unlike pd.concat, this also works when `data` has no rows.
df_completo = pd.DataFrame(risultati_finali)

# Save the result as a semicolon-separated CSV file.
df_completo.to_csv("ner_noteevents.csv", sep=";", index=False)