In [1]:
%cd ..

/home/ivan/Documents/rel_ext/pymedext_eds


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import json

In [4]:
from tqdm import tqdm

In [5]:
import os

from glob import glob
import pandas as pd
import re
from pprint import pprint
import pkg_resources

from pymedextcore.document import Document
from pymedext_eds.annotators import Endlines, SentenceTokenizer, Hypothesis, \
                                    ATCDFamille, SyntagmeTokenizer, Negation, RegexMatcher, \
                                    QuickUMLSAnnotator, Pipeline, SectionSplitter, Pipeline

from pymedext_eds.utils import rawtext_loader
from pymedext_eds.ner import NERAnnotator
from pymedext_eds.norm import NERNormalizer, FeedDictionnary, NormPhenoLev, NormPhenoDTW, NormPhenoCat

## 1. Init previous elements of pipeline

In [6]:
endlines = Endlines(['raw_text'], 'clean_text', 'endlines:v1')
sentences = SentenceTokenizer(['clean_text'], 'sentence', 'sentenceTokenizer:v1')
sections = SectionSplitter(['clean_text'], "section", ID= 'sections')
hypothesis = Hypothesis(['sentence'], 'hypothesis', 'hypothesis:v1')
family = ATCDFamille(['sentence'], 'context', 'ATCDfamily:v1')
syntagmes = SyntagmeTokenizer(['sentence'], 'syntagme', 'SyntagmeTokenizer:v1')
negation = Negation(['syntagme'], 'negation', 'Negation:v1')
regex = RegexMatcher(['clean_text','syntagme'], 'regex', 'RegexMatcher:v1', 'list_regexp.json')

sosy|T184|Sign or Symptom  


dsyn|T047|Disease or Syndrome  
neop|T191|Neoplastic Process  
comd|T049|Cell or Molecular Dysfunction  
mobd|T048|Mental or Behavioral Dysfunction   
patf|T046|Pathologic Function  
anab|T190|Anatomical Abnormality  
cgab|T019|Congenital Abnormality  
acab|T020|Acquired Abnormality  
inpo|T037|Injury or Poisoning  

diap|T060|Diagnostic Procedure  
lbpr|T059|Laboratory Procedure  
lbtr|T034|Laboratory or Test Result  

topp|T061|Therapeutic or Preventive Procedure  


In [7]:
#                                    accepted_semtypes = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037','T060',  'T059', 'T034', 'T061'},


In [8]:
quick_umls_th = 0.9
quick_umls_dist = "score"

umls_syntagme = QuickUMLSAnnotator(['syntagme'], 'umls_syntagme', 'QuickUMLS:2020AA', 
                                   quickumls_fp='data/umls2_UL/',
                                   overlapping_criteria=quick_umls_dist,
                                   threshold=quick_umls_th,
                                   similarity_name='jaccard',
                                   accepted_semtypes = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037'},
                                   window=5)

umls_signs = QuickUMLSAnnotator(['ENT/SIGNS'], 'umls_signs', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T184'},
                                window=5)

umls_diag_proc = QuickUMLSAnnotator(['ENT/DIAG_PROC'], 'umls_diag_proc', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T060', 'T059', 'T034'},
                                window=5)

umls_diag_name = QuickUMLSAnnotator(['ENT/DIAG_NAME'], 'umls_diag_name', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037'},
                                window=5)

umls_therap = QuickUMLSAnnotator(['ENT/THERAP_PROC'], 'umls_therap', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T061'},
                                window=5)


In [9]:
models_param = [{'tagger_path':'data/models/apcner_deid/entities_7/best-model.pt' ,
                  'store_embedding':True,
                'tag_name': 'pheno_pred' }]

ner = NERAnnotator(['sentence'], 'ner', ID='med:v2', models_param=models_param,  device='cuda:1', reduce_embedding='concat', mini_batch_size=128)

2021-06-10 13:55:24,579 loading file data/models/apcner_deid/entities_7/best-model.pt


## 2. Init feeddictionnay to build embedding dictionnary

In [None]:
dico_umls_exact_match = FeedDictionnary(['ENT/SIGNS','ENT/DIAG_NAME'],
                                        'feed_dictionnary_quick_umls',
                                        ID = 'fd_concat:v1',
                                        path_dict='data/emb_dict_concat_v4.h5py',
                                        threshold=1)

In [None]:
feed_dic_pipeline = [endlines, sections, sentences, ner, dico_umls_exact_match]

In [10]:
from glob import glob
import random

### 2.A Create embedding dictionnary on large corpus of texts

In [None]:
count = 0
subdirs = glob("../../../../marc/Data/CiliTALexport/*")
for subdir in random.sample(subdirs, len(subdirs)):
    paths = glob(subdir + "/*.json")
    
    for json_path in tqdm(random.sample(paths, len(paths))):
        try:

            with open(json_path) as h:
                doc = json.load(h)

            if doc['DOCUMENT_ORIGIN_CODE'] == 'ORBIS':
                continue

            doc = Document(
                    raw_text = doc['DISPLAYED_TEXT'],
                    ID = doc['DOCUMENT_NUM'],
                    attributes = {'person_id': doc['PATIENT_NUM']}
                )
            
            try:
                doc.annotate(feed_dic_pipeline)
                count += 1
            except:
                print("Annotation error", doc.source_ID)
        except:
            print("Load error", json_path)



print(count)

In [14]:
print(count)

2102722


with open("../data/export_mincil_pheno_loc.v1.json") as h:
    docs = json.load(h)

docs = [Document(
        raw_text = x['ano_text'],
        ID = x['document_num'],
        attributes = {'person_id': x['patient_num']}
    ) for x in docs]

for doc in tqdm(docs[:]):
    try:
        doc.annotate(feed_dic_pipeline)
    except:
        print(doc.source_ID)
    del doc

### 2.B Add UMLS synonyms to dictionnary

In [11]:
from flair.data import Sentence
import numpy as np
import h5py

In [60]:
target_tui = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037'}

conso_head = pd.read_csv('data/umls2_UL/mrconso_headers.csv', header=None)
conso_head = conso_head.iloc[:,0].tolist()
mrconso = pd.read_csv('data/umls2_UL/MRCONSO.RRF', header=None, names=conso_head+[''], sep="|")

sty_head = pd.read_csv('data/umls2_UL/mrsty_headers.csv', header=None)
sty_head = sty_head.iloc[:,0].tolist()
sty = pd.read_csv('data/umls2_UL/MRSTY.RRF', header=None, names=sty_head+[''],  sep="|")

mrconso = mrconso.loc[mrconso.LAT=="FRE"]

tagger = ner.model_zoo[0][1]

In [62]:
mrconso.loc[mrconso.CUI.isin(sty.loc[sty.TUI.isin(target_tui), "CUI"].tolist())].shape

(97092, 19)

In [33]:
mode = "cat"

hf = h5py.File('data/emb_dict_concat_v4.h5py', 'a')

for i, row in tqdm(mrconso.loc[mrconso.CUI.isin(sty.loc[sty.TUI.isin(target_tui), "CUI"].tolist())].iterrows()):
    tmp = Sentence(row.STR)
    tagger.embeddings.embed(tmp)
    
    

    if mode == "mean":
        emb = np.concatenate([tok._embeddings['0-transformer-word-data/embeddings/bert-base-medical-huge-cased/checkpoint-800000/'].cpu().numpy().reshape(1, -1) \
                              for tok in tmp], 0).mean(axis=0).reshape(1, -1)

    elif mode == "cat":
        emb = np.concatenate([tok._embeddings['0-transformer-word-data/embeddings/bert-base-medical-huge-cased/checkpoint-800000/'].cpu().numpy().reshape(1, -1) \
                              for tok in tmp], 0)
        
    if emb.shape[1] != 3072:
        continue
    groupname = row.CUI+"/" + "_".join(row.STR.split('/'))

    if groupname in hf:
        group = hf.get(groupname)
    else:
        group = hf.create_group(groupname)

    group.create_dataset(row.AUI, data = emb)


hf.close()

del mrconso
del sty
del hf

97092it [20:29, 78.98it/s]


### 3. Load embedding dictionnary normalizer

In [10]:
%%time
norm_pheno_lev = NormPhenoLev(['ENT/SIGNS','ENT/DIAG_NAME'],
                              'lev_norm',
                              ID='pheno_dic:v4',
                              path_dict='data/emb_dict_concat_v4.h5py',
                              make_code = False,
                              pca_dim= 32,
                              code_base = 8,
                              best_code_size= 4,
                              max_editdistance = 0.5)

CPU times: user 564 ms, sys: 40 ms, total: 604 ms
Wall time: 603 ms


In [11]:
len(norm_pheno_lev.dict_label)

280661

In [12]:
%%time
norm_pheno_lev.get_closest_match(norm_pheno_lev.dict_label[0]['seq'])['label']

CPU times: user 1.79 s, sys: 8.08 ms, total: 1.8 s
Wall time: 1.79 s


'ABDOMEN AIGU'

In [13]:
%%time
norm_pheno_lev_nu = NormPhenoLev(['ENT/SIGNS','ENT/DIAG_NAME'],
                              'lev_norm_nu',
                              ID='pheno_dic:v5',
                              path_dict='data/emb_dict_concat_v4_noumls.h5py',
                              make_code = False,
                              pca_dim= 32,
                              code_base = 8,
                              best_code_size= 4,
                              max_editdistance = 0.5)

CPU times: user 130 ms, sys: 24.3 ms, total: 154 ms
Wall time: 154 ms


In [14]:
%%time
norm_pheno_dtw = NormPhenoDTW(['ENT/SIGNS','ENT/DIAG_NAME'],
                              'dtw_norm',
                              ID='pheno_dic:v6',
                              path_dict='data/emb_dict_concat_v4.h5py',
                              make_code = False,
                              pca_dim= 256,
                              dtw_window= 5,
                              max_editdistance = 100)

CPU times: user 737 ms, sys: 361 ms, total: 1.1 s
Wall time: 1.1 s


In [15]:
%%time
norm_pheno_dtw.get_closest_match(norm_pheno_dtw.dict_label[0]['seq'])['label']

CPU times: user 3.46 s, sys: 163 ms, total: 3.62 s
Wall time: 3.61 s


'ABDOMEN AIGU'

In [16]:
len(norm_pheno_dtw.dict_label)

420068

In [17]:
%%time
norm_pheno_dtw_nu = NormPhenoDTW(['ENT/SIGNS','ENT/DIAG_NAME'],
                              'dtw_norm_nu',
                              ID='pheno_dic:v7',
                              path_dict='data/emb_dict_concat_v4_noumls.h5py',
                              make_code = False,
                              pca_dim= 256,
                              dtw_window= 5,
                              max_editdistance = 100)

CPU times: user 489 ms, sys: 312 ms, total: 801 ms
Wall time: 800 ms


In [18]:
%%time
norm_pheno_dtw_nu.get_closest_match(norm_pheno_dtw.dict_label[0]['seq'])['label']

CPU times: user 2.61 s, sys: 84.1 ms, total: 2.69 s
Wall time: 2.69 s


'diverticulose'

In [19]:
len(norm_pheno_dtw_nu.dict_label)

322976

In [139]:
%%time
norm_pheno_cat= NormPhenoCat(['ENT/SIGNS','ENT/DIAG_NAME'],
                              'cat_norm',
                              ID='pheno_dic:v8',
                              path_dict='data/emb_dict_concat_v4.h5py',
                              make_code = True,
                              pca_dim = 256,
                              k_neighboors = 1,
                              max_distance = 0.5,
                              emb_dim = 3072)

100%|██████████| 43203/43203 [01:49<00:00, 393.17it/s] 


Fit PCA
Get rid of duplicates
Old size:420068
New size:365692
CPU times: user 8min 46s, sys: 18.1 s, total: 9min 5s
Wall time: 6min 59s


In [140]:
%%timeit -n 2
norm_pheno_cat.get_closest_match(norm_pheno_cat.dict_label[0]['seq'].reshape(1, -1))['label']

58.1 ms ± 114 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)


In [141]:
len(norm_pheno_cat.dict_label)

365692

In [125]:
%%time
norm_pheno_cat_nu= NormPhenoCat(['ENT/SIGNS','ENT/DIAG_NAME'],
                              'cat_norm_nu',
                              ID='pheno_dic:v9',
                              path_dict='data/emb_dict_concat_v4.h5py',
                              make_code = True,
                              pca_dim = 1024,
                              k_neighboors = 1,
                              max_distance = 0.5,
                              emb_dim = 3072)

100%|██████████| 43203/43203 [01:49<00:00, 393.19it/s] 


Fit PCA
Get rid of duplicates
Old size:420068
New size:365692
CPU times: user 9min 41s, sys: 23.9 s, total: 10min 5s
Wall time: 7min 38s


In [126]:
%%timeit -n 2
norm_pheno_cat_nu.get_closest_match(norm_pheno_cat_nu.dict_label[0]['seq'].reshape(1, -1))['label']

234 ms ± 613 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)


In [127]:
len(norm_pheno_cat_nu.dict_label)

365692

## 4. Compare methods

In [26]:
from pymedext_eds.extract.corpus_generation_tools import brat_norm_to_pdf

In [27]:
import numpy as np

In [43]:
def pymedext_to_pdf(doc, key):
    """Transform pymedext annotations to pandas dataframe"""
    columns = ["doc_id", "mention", "start", "stop", "cui", "label"]
    lines = [(doc.source_ID, t.value, int(t.span[0]), int(t.span[1]), t.attributes["cui"], t.attributes["label"]) \
             for t in doc.get_annotations(key)]
    
    df = pd.DataFrame(lines if lines else {k: [] for k in columns}, 
                      columns = columns
                     )  
    df['start'] = df['start'].astype('int')
    df['stop'] = df['stop'].astype('int')
    return df

In [48]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score

In [49]:
def compute_metrics(gold, pred, label = ""):
    tp, fn, fp, tn = confusion_matrix(gold, pred).ravel()
    acc = accuracy_score(gold, pred)
    f1= f1_score(gold, pred, average="binary")
    r = recall_score(gold, pred, average="binary")
    p = precision_score(gold, pred, average="binary")
    
    #reformat metrics
    metrics = pd.DataFrame([[tp + fn, tp, fn, fp, r, p, f1, acc]],
                           columns=['total_pos', "TP", "FN", "FP", "Recall", "Precision", "F1", "Accuracy"], index = [label])
    return metrics

In [28]:
pipeline_common = [endlines, sections, sentences, hypothesis, family, syntagmes, negation, regex, ner]
pipe_compare = [
            umls_syntagme,
            umls_signs, umls_diag_proc, umls_diag_name, umls_therap
]
pipe_compare2 = [ norm_pheno_lev, norm_pheno_dtw, norm_pheno_cat, norm_pheno_lev_nu, norm_pheno_dtw_nu, norm_pheno_cat_nu]

### 4.A NeckerNorm

In [29]:
# data_path = pkg_resources.resource_filename('pymedext_eds', 'data/demo')
file_list = glob("../data/NeckerNorm/CiliTalConfirm/*.txt")

In [30]:
docs = [rawtext_loader(x) for x in file_list]

In [31]:
file_list[:3]

['../data/NeckerNorm/CiliTalConfirm/0063_COMMUN_ORLTROUSSEAU_13624692484.txt',
 '../data/NeckerNorm/CiliTalConfirm/0056_COMMUN_NEPHRO_OLD_13736523315.txt',
 '../data/NeckerNorm/CiliTalConfirm/0028_COMMUN_CARDIO_113885141.txt']

In [32]:
source_ids = [doc.source_ID for doc in docs]

In [33]:
assert len(source_ids) == len(np.unique(source_ids))

In [34]:
%%time
for doc in docs:
    doc.annotate(pipeline_common)

Ignore 2 sentence(s) with no tokens.
Ignore 1 sentence(s) with no tokens.
Ignore 1 sentence(s) with no tokens.
Ignore 1 sentence(s) with no tokens.
Ignore 1 sentence(s) with no tokens.
Ignore 1 sentence(s) with no tokens.
Ignore 1 sentence(s) with no tokens.
Ignore 1 sentence(s) with no tokens.
Ignore 1 sentence(s) with no tokens.
Ignore 1 sentence(s) with no tokens.


CPU times: user 16.9 s, sys: 616 ms, total: 17.5 s
Wall time: 17.5 s


In [35]:
%%time
for doc in docs:
    doc.annotate(pipe_compare)

CPU times: user 52.5 s, sys: 115 ms, total: 52.6 s
Wall time: 52.6 s


In [36]:
%%time
for doc in tqdm(docs):
    doc.annotate([norm_pheno_cat, norm_pheno_cat_nu])

100%|██████████| 112/112 [04:11<00:00,  2.25s/it]

CPU times: user 1h 6min 2s, sys: 1min 8s, total: 1h 7min 11s
Wall time: 4min 11s





In [37]:
all_ann = brat_norm_to_pdf("../data/NeckerNorm/CiliTalConfirm/")

In [38]:
all_ann.head()

Unnamed: 0,ent_type,start,stop,mention_ner,doc_id,ann_id,ent_id,termino,cui,mention_norm
0,phenotype,1313,1327,deshydratation,13735351509,N1,T1,UMLS_ALL,C0011175,DESHYDRATATION
0,phenotype,1165,1214,Bruits du coeur reguliers sans souffle surajoute,13736496216,N1,T1,UMLS_FR,C0577821,Bruits du coeur normaux
1,phenotype,775,791,Bon etat general,13736496216,N2,T2,UMLS_ALL,C2939150,General wellbeing (observable entity)
2,phenotype,1217,1236,Lesions au visage,13736496216,N3,T3,UMLS_ALL,C3263723,injury
3,phenotype,1291,1323,auscultation pulmonaire normale,13736496216,N4,T4,UMLS_ALL,C0231855,Bruits respiratoires normaux


In [39]:
all_ann = all_ann.loc[lambda x:x.ent_type.isin(["phenotype"])]

In [40]:
(all_ann
 .assign(has_norm = lambda x:x.cui.notnull())
 .groupby(['doc_id'])['has_norm']
 .aggregate([('is_annotated', any), ('n_CUI', sum)])
 .groupby(['is_annotated'])['n_CUI']
 .aggregate([('n_annotated', len), ('n_CUI', sum)])

)

Unnamed: 0_level_0,n_annotated,n_CUI
is_annotated,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1,0
True,59,408


In [41]:
(all_ann.assign(has_norm = lambda x:x.cui.notnull())
 .groupby(['ent_type'])['has_norm']
 .aggregate([('With CUI', sum), ('Total', len), ('freq', lambda x: round(sum(x)/len(x) ,2))])
)

Unnamed: 0_level_0,With CUI,Total,freq
ent_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
phenotype,408,459,0.89


In [42]:
all_ann = all_ann.loc[lambda x:x.cui.notnull()].assign(mod = "manual").assign(start = lambda x:x.start.astype('int')).assign(stop = lambda x:x.stop.astype('int'))

In [45]:
ner_acc = []
syn_acc = []
lev_acc = []
dtw_acc = []
cat_acc = []
catnu_acc = []

for doc in docs[:10]:
    syn_acc.append(pymedext_to_pdf(doc, 'umls_syntagme'))
    ner_acc.append(pymedext_to_pdf(doc, 'umls_diag_name'))
    ner_acc.append(pymedext_to_pdf(doc, 'umls_signs'))
#    lev_acc.append(pymedext_to_pdf(doc, 'lev_norm'))
#    dtw_acc.append(pymedext_to_pdf(doc, 'dtw_norm'))
    cat_acc.append(pymedext_to_pdf(doc, 'cat_norm'))
    catnu_acc.append(pymedext_to_pdf(doc, 'cat_norm_nu'))

ner_acc = pd.concat(ner_acc).assign(mod = "ner")
syn_acc = pd.concat(syn_acc).assign(mod = "syn")

#dtw_acc = pd.concat(dtw_acc).assign(mod = "dtw")
#lev_acc = pd.concat(lev_acc).assign(mod = "lev")
cat_acc = pd.concat(cat_acc).assign(mod = "cat")
catnu_acc = pd.concat(catnu_acc).assign(mod = "catnu")


In [46]:
ner_acc.head()

Unnamed: 0,doc_id,mention,start,stop,cui,label,mod
0,13624692484,stenose,699,706,C1261287,stenose,ner
1,13624692484,collapsus,780,789,C0036974,collapsus,ner
2,13624692484,enterite a Rotavirus,1081,1101,C0347854,enterite a rotavirus,ner
3,13624692484,pneumopathie,1142,1154,C0024115,pneumopathie,ner
4,13624692484,erytheme,1589,1597,C0041834,erytheme,ner


In [47]:
all_ann.head()

Unnamed: 0,ent_type,start,stop,mention_ner,doc_id,ann_id,ent_id,termino,cui,mention_norm,mod
0,phenotype,1313,1327,deshydratation,13735351509,N1,T1,UMLS_ALL,C0011175,DESHYDRATATION,manual
0,phenotype,1165,1214,Bruits du coeur reguliers sans souffle surajoute,13736496216,N1,T1,UMLS_FR,C0577821,Bruits du coeur normaux,manual
1,phenotype,775,791,Bon etat general,13736496216,N2,T2,UMLS_ALL,C2939150,General wellbeing (observable entity),manual
2,phenotype,1217,1236,Lesions au visage,13736496216,N3,T3,UMLS_ALL,C3263723,injury,manual
3,phenotype,1291,1323,auscultation pulmonaire normale,13736496216,N4,T4,UMLS_ALL,C0231855,Bruits respiratoires normaux,manual


In [50]:
all_ann.head()

Unnamed: 0,ent_type,start,stop,mention_ner,doc_id,ann_id,ent_id,termino,cui,mention_norm,mod
0,phenotype,1313,1327,deshydratation,13735351509,N1,T1,UMLS_ALL,C0011175,DESHYDRATATION,manual
0,phenotype,1165,1214,Bruits du coeur reguliers sans souffle surajoute,13736496216,N1,T1,UMLS_FR,C0577821,Bruits du coeur normaux,manual
1,phenotype,775,791,Bon etat general,13736496216,N2,T2,UMLS_ALL,C2939150,General wellbeing (observable entity),manual
2,phenotype,1217,1236,Lesions au visage,13736496216,N3,T3,UMLS_ALL,C3263723,injury,manual
3,phenotype,1291,1323,auscultation pulmonaire normale,13736496216,N4,T4,UMLS_ALL,C0231855,Bruits respiratoires normaux,manual


In [63]:
all_merge = (all_ann.loc[:, ["doc_id", "cui", "mod"]]
             .merge(ner_acc.loc[:, ["doc_id", "cui", "mod"]], how = "outer", on =["doc_id", "cui"], suffixes=('_man', '_ner'))
             .merge(syn_acc.loc[:, ["doc_id", "cui", "mod"]], how = "outer", on =["doc_id", "cui"])
             .merge(cat_acc.loc[:, ["doc_id", "cui", "mod"]], how = "outer", on =["doc_id", "cui"], suffixes=('_syn', '_cat'))
             .merge(catnu_acc.loc[:, ["doc_id", "cui", "mod"]].assign(mod_catnu = lambda x:x['mod']).drop('mod',1), how = "outer", on =["doc_id", "cui"])

#             .merge(dtw_acc.loc[:, ["doc_id", "cui", "mod"]], how = "outer", on =["doc_id", "cui"], suffixes=('_syn', '_dtw'))
#             .merge(lev_acc.loc[:, ["doc_id", "cui", "mod"]].assign(mod_lev = lambda x:x.mod).drop('mod',1), how = "outer", on =["doc_id", "cui"])
             .assign(true = lambda x:x.mod_man.notnull())
             .assign(pred_ner = lambda x:x['mod_ner'].notnull())
             .assign(pred_syn = lambda x:x['mod_syn'].notnull())
             .assign(pred_cat = lambda x:x['mod_cat'].notnull())
             .assign(pred_catnu = lambda x:x['mod_catnu'].notnull())

#             .assign(pred_dtw = lambda x:x['mod_dtw'].notnull())
#             .assign(pred_lev = lambda x:x['mod_lev'].notnull())

            )

In [71]:
all_merge = all_merge.loc[lambda x:x.mod_man.notnull()]

In [72]:
all_merge.head()

Unnamed: 0,doc_id,cui,mod_man,mod_ner,mod_syn,mod_cat,mod_catnu,true,pred_ner,pred_syn,pred_cat,pred_catnu
0,13735351509,C0011175,manual,ner,syn,cat,catnu,True,True,True,True,True
1,13736496216,C0577821,manual,,,,,True,False,False,False,False
2,13736496216,C2939150,manual,,,,,True,False,False,False,False
3,13736496216,C3263723,manual,,,,,True,False,False,False,False
4,13736496216,C0231855,manual,,,,,True,False,False,False,False


In [73]:
pd.concat([compute_metrics(all_merge.true, all_merge.pred_ner, label = "NER + QuickUMLS (ent level)"), 
           compute_metrics(all_merge.true, all_merge.pred_syn, label = "QuickUMLS (ent level)"),
           compute_metrics(all_merge.true, all_merge.pred_cat, label = "NER + cat e (ent level)"),
           compute_metrics(all_merge.true, all_merge.pred_catnu, label = "NER + cat no umls e (ent level)")
          ]

         ).round(2)

Unnamed: 0,total_pos,TP,FN,FP,Recall,Precision,F1,Accuracy
NER + QuickUMLS (ent level),0,0,0,403,0.01,1.0,0.02,0.01
QuickUMLS (ent level),0,0,0,401,0.02,1.0,0.03,0.02
NER + cat e (ent level),0,0,0,398,0.02,1.0,0.05,0.02
NER + cat no umls e (ent level),0,0,0,400,0.02,1.0,0.04,0.02


In [78]:
all_merge.loc[:,["true", "pred_syn", "pred_ner", "pred_cat"]].sum(0)

true        408
pred_syn      7
pred_ner      5
pred_cat     10
dtype: int64

### 4.B Quaero norm

In [90]:
# data_path = pkg_resources.resource_filename('pymedext_eds', 'data/demo')
file_list = glob("../data/QUAERO_FrenchMed/corpus/dev/MEDLINE/*.txt")

In [91]:
docs = [rawtext_loader(x) for x in file_list]

In [92]:
file_list[:3]

['../data/QUAERO_FrenchMed/corpus/dev/MEDLINE/8502882.txt',
 '../data/QUAERO_FrenchMed/corpus/dev/MEDLINE/15053153.txt',
 '../data/QUAERO_FrenchMed/corpus/dev/MEDLINE/371062.txt']

In [93]:
source_ids = [doc.source_ID for doc in docs]

In [94]:
assert len(source_ids) == len(np.unique(source_ids))

In [95]:
%%time
for doc in docs:
    doc.annotate(pipeline_common)

Ignore 1 sentence(s) with no tokens.


CPU times: user 16.8 s, sys: 63.9 ms, total: 16.9 s
Wall time: 16.9 s


In [96]:
%%time
for doc in docs:
    doc.annotate(pipe_compare)

CPU times: user 12.9 s, sys: 51.1 ms, total: 12.9 s
Wall time: 13.1 s


In [97]:
%%time
for doc in tqdm(docs):
    doc.annotate([norm_pheno_cat, norm_pheno_cat_nu])

100%|██████████| 832/832 [01:29<00:00,  9.32it/s]

CPU times: user 23min 23s, sys: 24.7 s, total: 23min 47s
Wall time: 1min 29s





In [98]:
all_ann = brat_norm_to_pdf("../data/QUAERO_FrenchMed/corpus/dev/MEDLINE/")

In [99]:
all_ann.head()

Unnamed: 0,doc_id,ent_id,ent_type,start,stop,mention_ner,ann_id,termino,cui,mention_norm
0,6772762,T1,PROC,0,5,Etude,,,,
1,6772762,T2,PHYS,29,49,fonction gonadotrope,,,,
2,6772762,T3,LIVB,58,61,rat,,,,
3,6772762,T5,PROC,85,95,castration,,,,
4,6772762,T4,LIVB,62,66,mâle,,,,


In [None]:
    #    standardized_data = StandardScaler().fit_transform(X_embed)
  #      self.umap_trans = umap.UMAP(n_neighbors = umap_n_neighbors, 
   #                                 n_components = umap_n_components, 
   #                                 min_dist = 0, 
   #                                 random_state=42).fit(standardized_data, y=labels)
   #     umap_embedding = self.umap_trans.transform(standardized_data)


In [100]:
all_ann.ent_type.unique()

array(['PROC', 'PHYS', 'LIVB', 'DISO', 'ANAT', 'PHEN', 'CHEM', 'DEVI',
       'OBJC', 'GEOG'], dtype=object)

In [101]:
all_ann = all_ann.loc[lambda x:x.ent_type.isin(["DISO"])]

In [102]:
(all_ann
 .assign(has_norm = lambda x:x.cui.notnull())
 .groupby(['doc_id'])['has_norm']
 .aggregate([('is_annotated', any), ('n_CUI', sum)])
 .groupby(['is_annotated'])['n_CUI']
 .aggregate([('n_annotated', len), ('n_CUI', sum)])

)

Unnamed: 0_level_0,n_annotated,n_CUI
is_annotated,Unnamed: 1_level_1,Unnamed: 2_level_1
False,475,0


In [103]:
(all_ann.assign(has_norm = lambda x:x.cui.notnull())
 .groupby(['ent_type'])['has_norm']
 .aggregate([('With CUI', sum), ('Total', len), ('freq', lambda x: round(sum(x)/len(x) ,2))])
)

Unnamed: 0_level_0,With CUI,Total,freq
ent_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DISO,0,892,False


In [42]:
all_ann = all_ann.loc[lambda x:x.cui.notnull()].assign(mod = "manual").assign(start = lambda x:x.start.astype('int')).assign(stop = lambda x:x.stop.astype('int'))

In [45]:
ner_acc = []
syn_acc = []
lev_acc = []
dtw_acc = []
cat_acc = []
catnu_acc = []

for doc in docs[:10]:
    syn_acc.append(pymedext_to_pdf(doc, 'umls_syntagme'))
    ner_acc.append(pymedext_to_pdf(doc, 'umls_diag_name'))
    ner_acc.append(pymedext_to_pdf(doc, 'umls_signs'))
#    lev_acc.append(pymedext_to_pdf(doc, 'lev_norm'))
#    dtw_acc.append(pymedext_to_pdf(doc, 'dtw_norm'))
    cat_acc.append(pymedext_to_pdf(doc, 'cat_norm'))
    catnu_acc.append(pymedext_to_pdf(doc, 'cat_norm_nu'))

ner_acc = pd.concat(ner_acc).assign(mod = "ner")
syn_acc = pd.concat(syn_acc).assign(mod = "syn")

#dtw_acc = pd.concat(dtw_acc).assign(mod = "dtw")
#lev_acc = pd.concat(lev_acc).assign(mod = "lev")
cat_acc = pd.concat(cat_acc).assign(mod = "cat")
catnu_acc = pd.concat(catnu_acc).assign(mod = "catnu")


In [46]:
ner_acc.head()

Unnamed: 0,doc_id,mention,start,stop,cui,label,mod
0,13624692484,stenose,699,706,C1261287,stenose,ner
1,13624692484,collapsus,780,789,C0036974,collapsus,ner
2,13624692484,enterite a Rotavirus,1081,1101,C0347854,enterite a rotavirus,ner
3,13624692484,pneumopathie,1142,1154,C0024115,pneumopathie,ner
4,13624692484,erytheme,1589,1597,C0041834,erytheme,ner


In [47]:
all_ann.head()

Unnamed: 0,ent_type,start,stop,mention_ner,doc_id,ann_id,ent_id,termino,cui,mention_norm,mod
0,phenotype,1313,1327,deshydratation,13735351509,N1,T1,UMLS_ALL,C0011175,DESHYDRATATION,manual
0,phenotype,1165,1214,Bruits du coeur reguliers sans souffle surajoute,13736496216,N1,T1,UMLS_FR,C0577821,Bruits du coeur normaux,manual
1,phenotype,775,791,Bon etat general,13736496216,N2,T2,UMLS_ALL,C2939150,General wellbeing (observable entity),manual
2,phenotype,1217,1236,Lesions au visage,13736496216,N3,T3,UMLS_ALL,C3263723,injury,manual
3,phenotype,1291,1323,auscultation pulmonaire normale,13736496216,N4,T4,UMLS_ALL,C0231855,Bruits respiratoires normaux,manual


In [50]:
all_ann.head()

Unnamed: 0,ent_type,start,stop,mention_ner,doc_id,ann_id,ent_id,termino,cui,mention_norm,mod
0,phenotype,1313,1327,deshydratation,13735351509,N1,T1,UMLS_ALL,C0011175,DESHYDRATATION,manual
0,phenotype,1165,1214,Bruits du coeur reguliers sans souffle surajoute,13736496216,N1,T1,UMLS_FR,C0577821,Bruits du coeur normaux,manual
1,phenotype,775,791,Bon etat general,13736496216,N2,T2,UMLS_ALL,C2939150,General wellbeing (observable entity),manual
2,phenotype,1217,1236,Lesions au visage,13736496216,N3,T3,UMLS_ALL,C3263723,injury,manual
3,phenotype,1291,1323,auscultation pulmonaire normale,13736496216,N4,T4,UMLS_ALL,C0231855,Bruits respiratoires normaux,manual


In [63]:
all_merge = (all_ann.loc[:, ["doc_id", "cui", "mod"]]
             .merge(ner_acc.loc[:, ["doc_id", "cui", "mod"]], how = "outer", on =["doc_id", "cui"], suffixes=('_man', '_ner'))
             .merge(syn_acc.loc[:, ["doc_id", "cui", "mod"]], how = "outer", on =["doc_id", "cui"])
             .merge(cat_acc.loc[:, ["doc_id", "cui", "mod"]], how = "outer", on =["doc_id", "cui"], suffixes=('_syn', '_cat'))
             .merge(catnu_acc.loc[:, ["doc_id", "cui", "mod"]].assign(mod_catnu = lambda x:x['mod']).drop('mod',1), how = "outer", on =["doc_id", "cui"])

#             .merge(dtw_acc.loc[:, ["doc_id", "cui", "mod"]], how = "outer", on =["doc_id", "cui"], suffixes=('_syn', '_dtw'))
#             .merge(lev_acc.loc[:, ["doc_id", "cui", "mod"]].assign(mod_lev = lambda x:x.mod).drop('mod',1), how = "outer", on =["doc_id", "cui"])
             .assign(true = lambda x:x.mod_man.notnull())
             .assign(pred_ner = lambda x:x['mod_ner'].notnull())
             .assign(pred_syn = lambda x:x['mod_syn'].notnull())
             .assign(pred_cat = lambda x:x['mod_cat'].notnull())
             .assign(pred_catnu = lambda x:x['mod_catnu'].notnull())

#             .assign(pred_dtw = lambda x:x['mod_dtw'].notnull())
#             .assign(pred_lev = lambda x:x['mod_lev'].notnull())

            )

In [71]:
all_merge = all_merge.loc[lambda x:x.mod_man.notnull()]

In [72]:
all_merge.head()

Unnamed: 0,doc_id,cui,mod_man,mod_ner,mod_syn,mod_cat,mod_catnu,true,pred_ner,pred_syn,pred_cat,pred_catnu
0,13735351509,C0011175,manual,ner,syn,cat,catnu,True,True,True,True,True
1,13736496216,C0577821,manual,,,,,True,False,False,False,False
2,13736496216,C2939150,manual,,,,,True,False,False,False,False
3,13736496216,C3263723,manual,,,,,True,False,False,False,False
4,13736496216,C0231855,manual,,,,,True,False,False,False,False


In [73]:
pd.concat([compute_metrics(all_merge.true, all_merge.pred_ner, label = "NER + QuickUMLS (ent level)"), 
           compute_metrics(all_merge.true, all_merge.pred_syn, label = "QuickUMLS (ent level)"),
           compute_metrics(all_merge.true, all_merge.pred_cat, label = "NER + cat e (ent level)"),
           compute_metrics(all_merge.true, all_merge.pred_catnu, label = "NER + cat no umls e (ent level)")
          ]

         ).round(2)

Unnamed: 0,total_pos,TP,FN,FP,Recall,Precision,F1,Accuracy
NER + QuickUMLS (ent level),0,0,0,403,0.01,1.0,0.02,0.01
QuickUMLS (ent level),0,0,0,401,0.02,1.0,0.03,0.02
NER + cat e (ent level),0,0,0,398,0.02,1.0,0.05,0.02
NER + cat no umls e (ent level),0,0,0,400,0.02,1.0,0.04,0.02


In [78]:
all_merge.loc[:,["true", "pred_syn", "pred_ner", "pred_cat"]].sum(0)

true        408
pred_syn      7
pred_ner      5
pred_cat     10
dtype: int64

In [32]:
# python -m quickumls.install -E FR data/umls data/quickumls

### 5. Qualitative comparison

In [128]:
from pymedext_eds.viz import display_annotations

In [129]:
doc_demo = rawtext_loader("data/test_data/pheno_norm.txt")

In [130]:
%%time
doc_demo.annotate(pipeline_common)

CPU times: user 660 ms, sys: 16.5 ms, total: 676 ms
Wall time: 115 ms


In [131]:
%%time
doc_demo.annotate(pipe_compare)

CPU times: user 354 ms, sys: 49 µs, total: 354 ms
Wall time: 353 ms


In [112]:
%%time
doc_demo.annotate([norm_pheno_lev])

CPU times: user 3min 6s, sys: 1.99 s, total: 3min 8s
Wall time: 1min 22s


In [113]:
%%time
doc_demo.annotate([norm_pheno_dtw])

CPU times: user 3min 55s, sys: 7.67 s, total: 4min 3s
Wall time: 2min 17s


In [142]:
%%time
doc_demo.annotate([norm_pheno_cat])


CPU times: user 36.8 s, sys: 621 ms, total: 37.4 s
Wall time: 2.34 s


In [133]:
%%time
doc_demo.annotate([norm_pheno_cat_nu])

CPU times: user 1min 52s, sys: 2.04 s, total: 1min 54s
Wall time: 9.21 s


In [130]:
test = [t['label'] for t in norm_pheno_cat_nu.dict_label.values()]

In [139]:
#[t for t in test if re.search("insuffisance renale aigue", t)]

In [73]:
for ann in doc_demo.get_annotations('umls_syntagme'):
    print("{:>100} --> {:>30}  ({:>2})".format(ann.attributes['snippet'], ann.attributes['label'], ann.attributes['score']))

                                                                En l'absence de trouble hemorragique -->           trouble hemorragique  (1.0)
                                                             On ne retrouve pas d'hypocomplementemie -->             hypocomplementemie  (1.0)
                                                 Le patien présente un rachitisme hypophosphatemique -->  rachitisme hypophosphatemique  (1.0)
                                                    Doute sur un possible syndrome de walker-warburg -->     syndrome de walker-warburg  (1.0)
                                Le patien présente un rachitisme avec de faible niveau de phosphates -->                     rachitisme  (1.0)
                                                      Avec un antécédent de naissance avant le terme -->          naissance avant terme  (1.0)
                                                Le patient présente un kyste au niveau du rein droit -->                          kyste  (1.0)

In [74]:
for ann in doc_demo.get_annotations('umls_signs'):
    print("{:>50} --> {:>30}  ({:>2})".format(ann.attributes['snippet'], ann.attributes['label'], ann.attributes['score']))

                                Mydriase areactive -->                       mydriase  (1.0)
                                 Douleur au ventre -->                        douleur  (1.0)
                                      Douleur abdo -->                   douleur abdo  (1.0)
                                     Epigastralgie -->                  epigastralgie  (1.0)


In [81]:
for ann in doc_demo.get_annotations('umls_diag_name'):
    print("{:>50} --> {:>30}  ({:>2})".format(ann.attributes['snippet'], ann.attributes['label'], ann.attributes['score']))

                              trouble hemorragique -->           trouble hemorragique  (1.0)
                     rachitisme hypophosphatemique -->  rachitisme hypophosphatemique  (1.0)
                        syndrome de walker-warburg -->     syndrome de walker-warburg  (1.0)
                                        rachitisme -->                     rachitisme  (1.0)
                     kyste au niveau du rein droit -->                          kyste  (1.0)
                   kyste au niveau du poumon droit -->                          kyste  (1.0)
                         kyste au niveau du pelvis -->                          kyste  (1.0)
                         kyste au niveau splénique -->                          kyste  (1.0)
                        kyste au niveau de la rate -->                          kyste  (1.0)
                                    kyste saignant -->                          kyste  (1.0)
                                kyste hémorragique -->             kys

In [83]:
for ann in doc_demo.get_annotations('lev_norm'):
    if not ann.value:
        continue
    print("{:>50} --> {:>30}  ({:.2f}) ".format(ann.attributes['mention'], ann.attributes['label'], ann.attributes['score_ed']))

                              trouble hemorragique -->           trouble hemorragique  (0.00) 
                     rachitisme hypophosphatemique -->  rachitisme hypophosphatemique  (0.00) 
                        syndrome de walker-warburg -->          syndrome de silverman  (0.17) 
                                        rachitisme -->                     rachitisme  (0.25) 
                    Sd de Walker-Warbourg débutant -->               Syndrome de Down  (0.50) 
                                    kyste saignant -->            kyste suprasellaire  (0.38) 
                                kyste hémorragique -->                kyste hepatique  (0.38) 
                                            lésion -->                         lesion  (0.50) 
                                       signes d'IC -->              absence d'appetit  (0.50) 
                                           diabète -->                        diabete  (0.00) 
                                           diabète

In [84]:
for ann in doc_demo.get_annotations('dtw_norm'):
    if not ann.value:
        continue
    print("{:>50} --> {:>30}  ({:.2f}) ".format(ann.attributes['mention'], ann.attributes['label'], ann.attributes['score_ed']))

                              trouble hemorragique -->           trouble hemorragique  (6.69) 
                     rachitisme hypophosphatemique -->  rachitisme hypophosphatemique  (7.63) 
                         antécédent de prématurité -->          Anémie de prématurité  (35.97) 
                        syndrome de walker-warburg -->       syndrome de kleine-levin  (26.28) 
                              baisse du complément -->        modification du transit  (35.00) 
                                        rachitisme -->                     rachitisme  (7.03) 
                           antécédent de naissance -->           Dépassement de terme  (36.45) 
                  possible Walker-Warburg débutant -->               syndrome leopard  (42.95) 
                    Sd de Walker-Warbourg débutant -->     syndrome de walker-warburg  (38.51) 
                     kyste au niveau du rein droit -->                  kyste du rein  (46.17) 
                   kyste au niveau du poumo

In [87]:
for ann in doc_demo.get_annotations('cat_norm'):
    if not ann.value:
        continue
    print("{:>50} --> {:>30}  ({:.2f}) ".format(ann.attributes['mention'], ann.attributes['label'], ann.attributes['score_ed']))

                              trouble hemorragique -->           trouble hemorragique  (0.97) 
                     rachitisme hypophosphatemique -->  rachitisme hypophosphatemique  (0.97) 
                         antécédent de prématurité --> Rétinopathie de la prématurité  (0.57) 
                        syndrome de walker-warburg -->           syndrome de stickler  (0.74) 
                              baisse du complément --> Déficits en protéines du complément  (0.53) 
                                        rachitisme -->                     rachitisme  (0.95) 
                           antécédent de naissance --> Hémorragie ombilicale après la naissance  (0.49) 
                  possible Walker-Warburg débutant -->        Schizophrénie débutante  (0.41) 
                    Sd de Walker-Warbourg débutant -->        Schizophrénie débutante  (0.57) 
                     kyste au niveau du rein droit -->           kyste du corps jaune  (0.62) 
                   kyste au niveau 

In [143]:
moi = ['umls_diag_name', 'umls_signs', 'cat_norm', 'cat_norm_nu', 'dtw_norm', 'lev_norm']
moi = ['umls_diag_name', 'umls_signs', 'cat_norm', 'cat_norm_nu']

In [144]:
acc = []
for method in moi:
    for ann in doc_demo.get_annotations(method):
        if not ann.value:
            continue
            
        if method in [ 'umls_signs', 'umls_diag_name'] :
            mention = ann.attributes['snippet']
        else:
            mention = ann.attributes['mention']

        tmp = {"method":method, "mention":mention, "label":ann.attributes['label']}
        acc.append(tmp)

In [145]:
res = pd.DataFrame(acc).drop_duplicates(["method", "mention"]).pivot(index="mention", columns="method", values="label")


In [146]:
res

method,cat_norm,cat_norm_nu,umls_diag_name,umls_signs
mention,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
""" réname chronique",Fusion rénale,Infection rénale chronique,,
Auscultation anormale,auscultation pulmonaire anormale,auscultation pulmonaire anormale,,
Douleur abdo,douleur abdo,douleur abdo,,douleur abdo
Douleur au ventre,Douleur au cou,Douleur au cou,,douleur
Epigastralgie,epigastralgie,Epigastralgie,,epigastralgie
IRA,ira,ira,ira,
IVG,ivu,ivu,,
Mydriase areactive,Mydriase,Mydriase,,mydriase
PKRAD,pku,pku,,
Sd de Walker-Warbourg débutant,syndrome de walker-warburg,syndrome de walker-warburg,,


In [147]:
res.loc[res.index.str.contains('kyste')].loc[:, moi]

method,umls_diag_name,umls_signs,cat_norm,cat_norm_nu
mention,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
kyste au niveau de la rate,kyste,,kyste du pancreas,kyste du pancreas
kyste au niveau du pelvis,kyste,,kyste au sein,kyste au sein
kyste au niveau du poumon droit,kyste,,kyste du pancreas,kyste du pancreas
kyste au niveau du rein droit,kyste,,kyste du rein,kyste du rein
kyste au niveau splénique,kyste,,kyste du cordon spermatique,kyste du cordon spermatique
kyste hémorragique,kyste hemorragique,,kyste,kyste
kyste saignant,kyste,,kyste,kyste


In [148]:
res.loc[res.index.str.contains('polykistose|hemorragique|Marphan|diabate')].loc[:, moi]

method,umls_diag_name,umls_signs,cat_norm,cat_norm_nu
mention,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Syndrome de Marphan,syndrome,,Syndrome de Marfan,Syndrome de Marfan
diabate,,,diabetique,diabete
polykistose rénale autoseomique dominante,,,polykystose renale autosomique dominante,polykystose renale autosomique dominante
trouble hemorragique,trouble hemorragique,,trouble hemorragique,trouble hemorragique


In [149]:
res.loc[res.index.str.contains('IRA|IVG|IRC|PKRAD|ICD|ICG')].loc[:, moi]

method,umls_diag_name,umls_signs,cat_norm,cat_norm_nu
mention,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IRA,ira,,ira,ira
IVG,,,ivu,ivu
PKRAD,,,pku,pku
signes d'ICD,,,signes ecg d'ischemie myocardique,signes ecg d'ischemie myocardique
signes d'ICG,,,signes ecg d'ischemie myocardique,signes ecg d'ischemie myocardique


In [150]:
res.loc[res.index.str.contains('bien|mal')].loc[:, moi]

method,umls_diag_name,umls_signs,cat_norm,cat_norm_nu
mention,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Auscultation anormale,,,auscultation pulmonaire anormale,auscultation pulmonaire anormale
se sent bien,,,se sent tres mal,se sent tres mal
va bien,,,dort mal,dort mal


In [151]:
display_annotations(doc_demo, entities=['ENT/DIAG_NAME'])