In [None]:
%cd ..

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json

In [4]:
from tqdm import tqdm

In [5]:
import os

from glob import glob
import pandas as pd
import re
from pprint import pprint
import pkg_resources

from pymedextcore.document import Document
from pymedext_eds.annotators import Endlines, SentenceTokenizer, Hypothesis, \
                                    ATCDFamille, SyntagmeTokenizer, Negation, RegexMatcher, \
                                    QuickUMLSAnnotator, Pipeline, SectionSplitter, Pipeline

from pymedext_eds.utils import rawtext_loader
from pymedext_eds.ner import NERAnnotator
from pymedext_eds.norm import NERNormalizer, NormPheno, FeedDictionnary, NormPhenoLev, NormPhenoDTW

  return torch._C._cuda_getDeviceCount() > 0


## 1. Init previous elements of pipeline

In [6]:
endlines = Endlines(['raw_text'], 'clean_text', 'endlines:v1')
sentences = SentenceTokenizer(['clean_text'], 'sentence', 'sentenceTokenizer:v1')
sections = SectionSplitter(['clean_text'], "section", ID= 'sections')
hypothesis = Hypothesis(['sentence'], 'hypothesis', 'hypothesis:v1')
family = ATCDFamille(['sentence'], 'context', 'ATCDfamily:v1')
syntagmes = SyntagmeTokenizer(['sentence'], 'syntagme', 'SyntagmeTokenizer:v1')
negation = Negation(['syntagme'], 'negation', 'Negation:v1')
regex = RegexMatcher(['clean_text','syntagme'], 'regex', 'RegexMatcher:v1', 'list_regexp.json')

sosy|T184|Sign or Symptom  


dsyn|T047|Disease or Syndrome  
neop|T191|Neoplastic Process  
comd|T049|Cell or Molecular Dysfunction  
mobd|T048|Mental or Behavioral Dysfunction   
patf|T046|Pathologic Function  
anab|T190|Anatomical Abnormality  
cgab|T019|Congenital Abnormality  
acab|T020|Acquired Abnormality  
inpo|T037|Injury or Poisoning  

diap|T060|Diagnostic Procedure  
lbpr|T059|Laboratory Procedure  
lbtr|T034|Laboratory or Test Result  

topp|T061|Therapeutic or Preventive Procedure  


In [7]:
#                                    accepted_semtypes = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037','T060',  'T059', 'T034', 'T061'},


In [8]:
quick_umls_th = 0.9
quick_umls_dist = "score"

umls_syntagme = QuickUMLSAnnotator(['syntagme'], 'umls_syntagme', 'QuickUMLS:2020AA', 
                                   quickumls_fp='data/umls2_UL/',
                                   overlapping_criteria=quick_umls_dist,
                                   threshold=quick_umls_th,
                                   similarity_name='jaccard',
                                   accepted_semtypes = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037'},
                                   window=5)

umls_signs = QuickUMLSAnnotator(['ENT/SIGNS'], 'umls_signs', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T184'},
                                window=5)

umls_diag_proc = QuickUMLSAnnotator(['ENT/DIAG_PROC'], 'umls_diag_proc', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T060', 'T059', 'T034'},
                                window=5)

umls_diag_name = QuickUMLSAnnotator(['ENT/DIAG_NAME'], 'umls_diag_name', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037'},
                                window=5)

umls_therap = QuickUMLSAnnotator(['ENT/THERAP_PROC'], 'umls_therap', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T061'},
                                window=5)


In [10]:
models_param = [{'tagger_path':'data/models/apcner_deid/entities_7/best-model.pt' ,
                  'store_embedding':True,
                'tag_name': 'pheno_pred' }]

ner = NERAnnotator(['sentence'], 'ner', ID='med:v2', models_param=models_param,  device='cpu', reduce_embedding='concat', mini_batch_size=128)

2021-06-04 09:38:18,707 loading file data/models/apcner_deid/entities_7/best-model.pt


## 2. Init feeddictionnay to build embedding dictionnary

In [11]:
dico_umls_exact_match = FeedDictionnary(['ENT/SIGNS','ENT/DIAG_NAME'],
                                        'feed_dictionnary_quick_umls',
                                        ID = 'fd_concat:v1',
                                        path_dict='data/emb_dict_concat_v4.h5py',
                                        threshold=1)

In [12]:
feed_dic_pipeline = [endlines, sections, sentences, ner, dico_umls_exact_match]

In [12]:
from glob import glob
import random

### 2.A Create embedding dictionnary on large corpus of texts

In [None]:
count = 0
subdirs = glob("../../../../marc/Data/CiliTALexport/*")
for subdir in random.sample(subdirs, len(subdirs)):
    paths = glob(subdir + "/*.json")
    
    for json_path in tqdm(random.sample(paths, len(paths))):
        try:

            with open(json_path) as h:
                doc = json.load(h)

            if doc['DOCUMENT_ORIGIN_CODE'] == 'ORBIS':
                continue

            doc = Document(
                    raw_text = doc['DISPLAYED_TEXT'],
                    ID = doc['DOCUMENT_NUM'],
                    attributes = {'person_id': doc['PATIENT_NUM']}
                )
            
            try:
                doc.annotate(feed_dic_pipeline)
                count += 1
            except:
                print("Annotation error", doc.source_ID)
        except:
            print("Load error", json_path)



print(count)

In [14]:
print(count)

2102722


with open("../data/export_mincil_pheno_loc.v1.json") as h:
    docs = json.load(h)

docs = [Document(
        raw_text = x['ano_text'],
        ID = x['document_num'],
        attributes = {'person_id': x['patient_num']}
    ) for x in docs]

for doc in tqdm(docs[:]):
    try:
        doc.annotate(feed_dic_pipeline)
    except:
        print(doc.source_ID)
    del doc

### 2.B Add UMLS synonyms to dictionnary

In [15]:
from flair.data import Sentence
import numpy as np
import h5py

In [16]:

target_tui = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037'}

conso_head = pd.read_csv('data/umls2_UL/mrconso_headers.csv', header=None)
conso_head = conso_head.iloc[:,0].tolist()
mrconso = pd.read_csv('data/umls2_UL/MRCONSO.RRF', header=None, names=conso_head+[''], sep="|")

sty_head = pd.read_csv('data/umls2_UL/mrsty_headers.csv', header=None)
sty_head = sty_head.iloc[:,0].tolist()
sty = pd.read_csv('data/umls2_UL/MRSTY.RRF', header=None, names=sty_head+[''],  sep="|")

mrconso = mrconso.loc[mrconso.LAT=="FRE"]

tagger = ner.model_zoo[0][1]

hf = h5py.File('data/emb_dict_v3.h5py', 'a')


for i, row in mrconso.loc[mrconso.CUI.isin(sty.loc[sty.TUI.isin(target_tui), "CUI"].tolist())].iterrows():
    tmp = Sentence(row.STR)
    tagger.embeddings.embed(tmp)
    emb = np.concatenate([tok._embeddings['0-transformer-word-data/embeddings/bert-base-medical-huge-cased/checkpoint-800000/'].cpu().numpy().reshape(1, -1) \
                          for tok in tmp], 0).mean(axis=0).reshape(1, -1)
    
    groupname = row.CUI+"/" + row.STR

    if groupname in hf:
        group = hf.get(groupname)
    else:
        group = hf.create_group(groupname)

    group.create_dataset(row.AUI, data = emb)


hf.close()

del mrconso
del sty
del hf

### 3. Load embedding dictionnary normalizer

In [139]:
%%time
norm_pheno_lev = NormPhenoLev(['ENT/SIGNS','ENT/DIAG_NAME'],
                              'lev_norm',
                              ID='pheno_dic:v4',
                              path_dict='data/emb_dict_concat_v4.h5py',
                              make_code = False,
                              pca_dim= 16,
                              code_base = 8,
                              best_code_size= 8,
                              max_editdistance = 0.2)

CPU times: user 1.13 s, sys: 100 ms, total: 1.23 s
Wall time: 1.23 s


In [140]:
%%timeit -n 2
norm_pheno_lev.get_closest_match(norm_pheno_lev.dict_label[0]['seq'])['label']

2.61 s ± 15.7 ms per loop (mean ± std. dev. of 7 runs, 2 loops each)


In [141]:
len(norm_pheno_lev.dict_label)

281243

In [142]:
%%time
norm_pheno_dtw = NormPhenoDTW(['ENT/SIGNS','ENT/DIAG_NAME'],
                              'dtw_norm',
                              ID='pheno_dic:v5',
                              path_dict='data/emb_dict_concat_v4.h5py',
                              make_code = False,
                              pca_dim= 256,
                              dtw_window= 5,
                              max_editdistance = 100)

CPU times: user 587 ms, sys: 273 ms, total: 860 ms
Wall time: 859 ms


In [None]:
%%timeit -n 2
norm_pheno_dtw.get_closest_match(norm_pheno_dtw.dict_label[0]['seq'])['label']

In [None]:
len(norm_pheno_dtw.dict_label)

## 4. Compare methods

In [106]:
from pymedext_eds.extract.corpus_generation_tools import brat_norm_to_pdf

In [50]:
import numpy as np

In [None]:
pipeline = [endlines, sections, sentences, hypothesis, family, syntagmes,
            negation, regex, umls_syntagme, ner, norm_pheno_lev, norm_pheno_dtw,
            umls_signs, umls_diag_proc, umls_diag_name, umls_therap]

### 4.A NeckerNorm

In [None]:
# data_path = pkg_resources.resource_filename('pymedext_eds', 'data/demo')
file_list = glob("../data/NeckerNorm/CiliTalConfirm/*.txt")

In [None]:
docs = [rawtext_loader(x) for x in file_list]

In [None]:
file_list[:3]

In [None]:
source_ids = [doc.source_ID for doc in docs]

In [None]:
assert len(source_ids) == len(np.unique(source_ids))

In [None]:
%%time
for doc in docs[:10]:
    doc.annotate(pipeline)

In [83]:
doc.get_annotations('ENT/DIAG_NAME')[0].to_dict()

{'type': 'ENT/DIAG_NAME',
 'value': 'chute',
 'ngram': None,
 'span': (815, 820),
 'source': 'med:v2',
 'source_ID': 'c30b5ab2-c512-11eb-b7b4-fdc0b84cc529',
 'isEntity': True,
 'attributes': {'hypothesis': 'certain',
  'context': 'patient',
  'score': 0.5726926326751709,
  'embedding': array([[ 0.04092915,  0.02011072, -0.10284506, ...,  0.40328822,
          -0.07274213, -0.6418335 ]], dtype=float32)},
 'ID': 'c391eeba-c512-11eb-b7b4-fdc0b84cc529'}

In [84]:
doc.get_annotations('umls_syntagme')[0].to_dict()

{'type': 'umls_syntagme',
 'value': 'etat general',
 'ngram': None,
 'span': (771, 783),
 'source': 'QuickUMLS:2020AA',
 'source_ID': 'c30be842-c512-11eb-b7b4-fdc0b84cc529',
 'isEntity': False,
 'attributes': {'hypothesis': 'certain',
  'context': 'patient',
  'negation': 'aff',
  'cui': 'C0236102',
  'label': 'etat general',
  'semtypes': ['T047'],
  'score': 1.0,
  'snippet': " J'ai revu en consultation Madame ∆∆∆∆∆∆∆ ∆∆∆∆∆∆∆ dont l'etat general reste bon"},
 'ID': 'c31f7042-c512-11eb-b7b4-fdc0b84cc529'}

In [86]:
doc.get_annotations('umls_diag_name')[0].to_dict()

{'type': 'umls_diag_name',
 'value': 'deshydratation',
 'ngram': None,
 'span': (1144, 1158),
 'source': 'QuickUMLS:2020AA',
 'source_ID': 'c391f0ea-c512-11eb-b7b4-fdc0b84cc529',
 'isEntity': False,
 'attributes': {'hypothesis': 'certain',
  'context': 'patient',
  'score': 1.0,
  'embedding': array([[-0.528751  , -0.36933076,  0.54514694, ...,  0.1609435 ,
           0.05293216,  0.14058281]], dtype=float32),
  'cui': 'C0011175',
  'label': 'deshydratation',
  'semtypes': ['T047'],
  'snippet': 'deshydratation'},
 'ID': 'c9299c06-c512-11eb-b7b4-fdc0b84cc529'}

In [87]:
doc.get_annotations('lev_norm')

[]

In [88]:
doc.get_annotations('dtw_norm')

[]

In [108]:
all_ann = brat_norm_to_pdf("../data/NeckerNorm/CiliTalConfirm/")

In [110]:
all_ann.head()

Unnamed: 0,ent_type,start,stop,mention_ner,doc_id,ann_id,ent_id,termino,cui,mention_norm
0,phenotype,1313,1327,deshydratation,13735351509,N1,T1,UMLS_ALL,C0011175,DESHYDRATATION
0,phenotype,1165,1214,Bruits du coeur reguliers sans souffle surajoute,13736496216,N1,T1,UMLS_FR,C0577821,Bruits du coeur normaux
1,phenotype,775,791,Bon etat general,13736496216,N2,T2,UMLS_ALL,C2939150,General wellbeing (observable entity)
2,phenotype,1217,1236,Lesions au visage,13736496216,N3,T3,UMLS_ALL,C3263723,injury
3,phenotype,1291,1323,auscultation pulmonaire normale,13736496216,N4,T4,UMLS_ALL,C0231855,Bruits respiratoires normaux


In [111]:
all_ann = all_ann.loc[lambda x:x.ent_type.isin(["phenotype"])]

In [124]:
(all_ann
 .assign(has_norm = lambda x:x.cui.notnull())
 .groupby(['doc_id'])['has_norm']
 .aggregate([('is_annotated', any), ('n_CUI', sum)])
 .groupby(['is_annotated'])['n_CUI']
 .aggregate([('n_annotated', len), ('n_CUI', sum)])

)

Unnamed: 0_level_0,n_annotated,n_CUI
is_annotated,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1,0
True,59,408


In [125]:
(all_ann.assign(has_norm = lambda x:x.cui.notnull())
 .groupby(['ent_type'])['has_norm']
 .aggregate([('With CUI', sum), ('Total', len), ('freq', lambda x: round(sum(x)/len(x) ,2))])
)

Unnamed: 0_level_0,With CUI,Total,freq
ent_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
phenotype,408,459,0.89


In [126]:
all_ann = all_ann.loc[lambda x:x.cui.notnull()]

In [129]:
def pymedext_to_pdf(doc, key):
    """Transform pymedext annotations to pandas dataframe"""
    columns = ["doc_id", "mention", "start", "end", "cui", "label"]
    lines = [(doc.source_ID, t.value, t.span[0], t.span[1], t.attributes["cui"], t.attributes["label"]) \
             for t in doc.get_annotations(key)]
    
    df = pd.DataFrame(lines if lines else {k: [] for k in columns}, 
                      columns = columns
                     )    
    return df

In [130]:
ner_acc = []
syn_acc = []

for doc in docs[:10]:
    df_syn = pymedext_to_pdf(doc, 'umls_syntagme')
    break

In [131]:
pymedext_to_pdf(doc, 'umls_syntagme')

Unnamed: 0,doc_id,mention,start,end,cui,label
0,13624692484,oedeme larynge,750,764,C0023052,oedeme larynge
1,13624692484,collapsus,780,789,C0036974,collapsus
2,13624692484,stenose,699,706,C1261287,stenose
3,13624692484,enterite,1081,1089,C0014335,enterite
4,13624692484,pneumopathie,1142,1154,C0024115,pneumopathie
5,13624692484,erytheme,1589,1597,C0041834,erytheme
6,13624692484,stenose,1804,1811,C1261287,stenose
7,13624692484,stenose,1897,1904,C1261287,stenose
8,13624692484,stenose,1963,1970,C1261287,stenose
9,13624692484,stenose,1982,1989,C1261287,stenose


In [132]:
pymedext_to_pdf(doc, 'umls_diag_name')

Unnamed: 0,doc_id,mention,start,end,cui,label
0,13624692484,stenose,699,706,C1261287,stenose
1,13624692484,collapsus,780,789,C0036974,collapsus
2,13624692484,enterite a Rotavirus,1081,1101,C0347854,enterite a rotavirus
3,13624692484,pneumopathie,1142,1154,C0024115,pneumopathie
4,13624692484,erytheme,1589,1597,C0041834,erytheme
5,13624692484,stenose,1804,1811,C1261287,stenose
6,13624692484,stenose,1897,1904,C1261287,stenose
7,13624692484,stenose,1963,1970,C1261287,stenose
8,13624692484,stenose,1982,1989,C1261287,stenose
9,13624692484,Stenose,2048,2055,C1261287,stenose


In [136]:
for doc in docs[:10]:
    print(len(doc.get_annotations('norm_dtw')))


0
0
0
0
0
0
0
0
0
0


In [134]:
pymedext_to_pdf(doc, 'norm_dtw')

Unnamed: 0,doc_id,mention,start,end,cui,label


In [21]:
    
#     ner_acc.append(to_pdf(doc, 'umls_therap'))
    ner_acc.append(pymedext_to_pdf(doc, 'umls_diag_name'))
#     ner_acc.append(to_pdf(doc, 'umls_diag_proc'))
    ner_acc.append(pymedext_to_pdf(doc, 'umls_signs'))

    syn_acc.append(df_syn)

ner_acc = pd.concat(ner_acc).assign(mod = "ner")
syn_acc = pd.concat(syn_acc).assign(mod = "syn")

In [22]:
ner_acc.head()

Unnamed: 0,doc_id,mention,start,end,cui,label,mod
0,698430240,contusion,297.0,306.0,C0009938,contusion,ner
1,698430240,rupture,401.0,408.0,C3203359,rupture,ner
2,698430240,rupture,446.0,453.0,C3203359,rupture,ner
3,698430240,épanchement intra-articulaire assez,484.0,519.0,C1253936,epanchement intra-articulair,ner
4,698430240,traumatisme,1429.0,1440.0,C3714660,traumatisme,ner


In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score

In [24]:
def compute_metrics(gold, pred, label = ""):
    tp, fn, fp, tn = confusion_matrix(gold, pred).ravel()
    acc = accuracy_score(gold, pred)
    f1= f1_score(gold, pred, average="binary")
    r = recall_score(gold, pred, average="binary")
    p = precision_score(gold, pred, average="binary")
    
    #reformat metrics
    metrics = pd.DataFrame([[tp + fn, tp, fn, fp, r, p, f1, acc]],
                           columns=['total_pos', "TP", "FN", "FP", "Recall", "Precision", "F1", "Accuracy"], index = [label])
    return metrics

In [25]:
all_merge = (all_ann
             .merge(ner_acc, how = "outer", on =["doc_id", "cui"])
             .merge(syn_acc, how = "outer", on =["doc_id", "cui"])
             .assign(true = lambda x:x.ent_id.notnull())
             .assign(pred_ner = lambda x:x['mod_x'].notnull())
             .assign(pred_q = lambda x:x['mod_y'].notnull())

            )

In [26]:
pd.concat([compute_metrics(all_merge.true, all_merge.pred_ner, label = "NER + QuickUMLS (ent level)"), 
           compute_metrics(all_merge.true, all_merge.pred_q, label = "QuickUMLS (ent level)"),
           compute_metrics(all_merge.drop_duplicates(["doc_id", "cui"]).true, all_merge.drop_duplicates(["doc_id", "cui"]).pred_ner, label = "NER + QuickUMLS (doc level)"),
           compute_metrics(all_merge.drop_duplicates(["doc_id", "cui"]).true, all_merge.drop_duplicates(["doc_id", "cui"]).pred_q, label = "QuickUMLS (doc level)")]
         ).round(2)

Unnamed: 0,total_pos,TP,FN,FP,Recall,Precision,F1,Accuracy
NER + QuickUMLS (ent level),168,73,95,162,0.78,0.86,0.81,0.71
QuickUMLS (ent level),168,3,165,140,0.81,0.78,0.79,0.66
NER + QuickUMLS (doc level),104,55,49,122,0.29,0.51,0.37,0.38
QuickUMLS (doc level),104,3,101,110,0.36,0.38,0.37,0.24


In [15]:
len(syn_acc), len(ner_acc)

(682, 418)

In [16]:
ner_acc.merge(syn_acc, how = "inner", on =["doc_id", "start", "end", "cui"]).shape

(296, 10)

In [17]:
diff = ner_acc.merge(syn_acc, how = "outer", on =["doc_id", "start", "end"]).loc[lambda x:x.cui_x != x.cui_y]

In [18]:
from IPython.display import display_html

In [19]:
gen = ((k, group) for k, group in diff.groupby('doc_id'))


In [21]:
k, group = next(gen)
ner_mention = group.loc[lambda x:x.mod_x.notnull(), ["mention_x", "label_x"]]
syn_mention = group.loc[lambda x:x.mod_y.notnull(), ["mention_y", "label_y"]]

display_html(ner_mention)
display_html(syn_mention)

Unnamed: 0,mention_x,label_x
19,cardiopathie ischémique,myelopathie ischemique
20,infarctus,infarctus
31,échographie cardiaque,echographie cardiaque
33,cardiovasculaire,syncope cardiovasculaire


Unnamed: 0,mention_y,label_y
33,cardiovasculaire,nevrose cardiovasculaire
450,consultation,teleconsultation
451,consultation,teleconsultation
452,suivi de cardiopathie ischémique,cardiopathie ischemique
453,fait infarctus,post infarctus
454,TSA et des membres inférieurs,ulceres des membres inferieurs
455,fraction d’éjection retrouvée,fraction d'ejection
456,échographie,echographie sai
457,consultation,teleconsultation
458,chute,chutes


In [32]:
# python -m quickumls.install -E FR data/umls data/quickumls

### 5. Qualitative comparison

In [15]:
from pymedext_eds.viz import display_annotations

In [45]:
doc_demo = rawtext_loader("data/test_data/pheno_norm.txt")

In [46]:
norm_pheno.max_editdistance = 100

In [47]:
pipeline = [endlines, sections, sentences, hypothesis, family, syntagmes,
            negation, regex, umls_syntagme, ner, norm_pheno,
            umls_signs, umls_diag_proc, umls_diag_name, umls_therap]

In [48]:
%%time
doc_demo.annotate(pipeline)

CPU times: user 3min 5s, sys: 5.84 s, total: 3min 11s
Wall time: 1min 27s


In [39]:
import random

In [44]:
random.choices([t['label'] for t in norm_pheno.dict_label.values()],k=5)

['trouble hemorragique',
 'hypocomplementemie',
 'rachitisme hypophosphatemique',
 'prematurite',
 'syndrome de walker-warburg']

In [49]:
print("\n".join(["\t".join([t.attributes['mention'], t.value]) + "\t score: {}".format(round(t.attributes['score_ed'],3)) for t in doc_demo.get_annotations('normalized_mention') if t.value]))

trouble hemorragique	trouble hemorragique	 score: 6.879
rachitisme hypophosphatemique	rachitisme hypophosphatemique	 score: 7.847
antécédent de prématurité	antecedent de cesarienne	 score: 39.372
syndrome de walker-warburg	syndrome de kleine-levin	 score: 26.717
baisse du complément	modification du transit	 score: 35.002
rachitisme	rachitisme	 score: 7.086
antécédent de naissance	retard de croissance	 score: 37.037
possible Walker-Warburg débutant	syndrome leopard	 score: 43.056
Sd de Walker-Warbourg débutant	syndrome de walker-warburg	 score: 38.451
kyste au niveau du rein droit	kyste du rein	 score: 45.484
kyste au niveau du poumon droit	kyste du pancreas	 score: 48.552
kyste au niveau du pelvis	kyste du pancreas	 score: 43.255
kyste au niveau splénique	kyste du cordon spermatique	 score: 37.334
kyste au niveau de la rate	rupture de rate	 score: 50.687
kyste saignant	kyste retentionnel	 score: 27.785
kyste hémorragique	kyste hemorragique	 score: 24.71
lésion	nodule	 score: 20.78
lési

In [40]:
print("\n".join(["\t".join([t.attributes['mention'], t.value]) + "\t score: {}".format(round(t.attributes['score_cos'],3)) for t in doc_demo.get_annotations('normalized_mention')]))

kyste au niveau du rein droit	kyste du rein	 score: 0.961
kyste au niveau du poumon droit	kyste du pancreas	 score: 0.952
kyste au niveau du pelvis	kyste du pancreas	 score: 0.948
kyste au niveau splénique	kyste du cordon spermatique	 score: 0.952
kyste au niveau de la rate	kyste du pancreas	 score: 0.947
kyste saignant	kyste	 score: 0.909
kyste hémorragique	kyste ovarien hemorragique	 score: 0.928
lésion	lesion intracranienne	 score: 0.862
lésion	lesion intracranienne	 score: 0.864
signes d'ICG	signes ecg d'ischemie myocardique	 score: 0.907
signes d'ICD	signes ecg d'ischemie myocardique	 score: 0.908
signes d'IC	signes ecg d'ischemie myocardique	 score: 0.909
Auscultation anormale	auscultation pulmonaire anormale	 score: 0.955
diabète	diabete	 score: 0.95
diabète	diabete	 score: 0.945
IVG	ivu	 score: 0.917
se sent bien	se sent tres mal	 score: 0.939
va bien	se sent tres mal	 score: 0.901
Syndrome de Marphan	syndrome de marfan	 score: 0.971
Mydriase areactive	astigmatisme myopique	 sc

In [72]:
display_annotations(doc_demo, entities=['normalized_mention'])

In [73]:
display_annotations(doc_demo, entities=['umls_syntagme'])

In [214]:
display_annotations(doc_demo, entities=['ENT/SIGNS'])

In [215]:
display_annotations(doc_demo, entities=['ENT/DIAG_NAME'])