In [1]:
%cd ..

/home/ivan/Documents/rel_ext/pymedext_eds


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import json

In [4]:
from tqdm import tqdm

In [5]:
import os

from glob import glob
import pandas as pd
import re
from pprint import pprint
import pkg_resources

from pymedextcore.document import Document
from pymedext_eds.annotators import Endlines, SentenceTokenizer, Hypothesis, \
                                    ATCDFamille, SyntagmeTokenizer, Negation, RegexMatcher, \
                                    QuickUMLSAnnotator, Pipeline, SectionSplitter, Pipeline

from pymedext_eds.utils import rawtext_loader
from pymedext_eds.ner import NERAnnotator
from pymedext_eds.norm import NERNormalizer, NormPheno, FeedDictionnary

In [6]:
endlines = Endlines(['raw_text'], 'clean_text', 'endlines:v1')
sentences = SentenceTokenizer(['clean_text'], 'sentence', 'sentenceTokenizer:v1')
sections = SectionSplitter(['clean_text'], "section", ID= 'sections')
hypothesis = Hypothesis(['sentence'], 'hypothesis', 'hypothesis:v1')
family = ATCDFamille(['sentence'], 'context', 'ATCDfamily:v1')
syntagmes = SyntagmeTokenizer(['sentence'], 'syntagme', 'SyntagmeTokenizer:v1')
negation = Negation(['syntagme'], 'negation', 'Negation:v1')
regex = RegexMatcher(['clean_text','syntagme'], 'regex', 'RegexMatcher:v1', 'list_regexp.json')

sosy|T184|Sign or Symptom  


dsyn|T047|Disease or Syndrome  
neop|T191|Neoplastic Process  
comd|T049|Cell or Molecular Dysfunction  
mobd|T048|Mental or Behavioral Dysfunction   
patf|T046|Pathologic Function  
anab|T190|Anatomical Abnormality  
cgab|T019|Congenital Abnormality  
acab|T020|Acquired Abnormality  
inpo|T037|Injury or Poisoning  

diap|T060|Diagnostic Procedure  
lbpr|T059|Laboratory Procedure  
lbtr|T034|Laboratory or Test Result  

topp|T061|Therapeutic or Preventive Procedure  


In [7]:
#                                    accepted_semtypes = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037','T060',  'T059', 'T034', 'T061'},


In [8]:
quick_umls_th = 0.9
quick_umls_dist = "score"

umls_syntagme = QuickUMLSAnnotator(['syntagme'], 'umls_syntagme', 'QuickUMLS:2020AA', 
                                   quickumls_fp='data/umls2_UL/',
                                   overlapping_criteria=quick_umls_dist,
                                   threshold=quick_umls_th,
                                   similarity_name='jaccard',
                                   accepted_semtypes = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037'},
                                   window=5)

umls_signs = QuickUMLSAnnotator(['ENT/SIGNS'], 'umls_signs', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T184'},
                                window=5)

umls_diag_proc = QuickUMLSAnnotator(['ENT/DIAG_PROC'], 'umls_diag_proc', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T060', 'T059', 'T034'},
                                window=5)

umls_diag_name = QuickUMLSAnnotator(['ENT/DIAG_NAME'], 'umls_diag_name', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037'},
                                window=5)

umls_therap = QuickUMLSAnnotator(['ENT/THERAP_PROC'], 'umls_therap', 'QuickUMLS:2020AA', 
                                quickumls_fp='data/umls2_UL/',
                                overlapping_criteria=quick_umls_dist,
                                threshold=quick_umls_th,
                                similarity_name='jaccard',
                                accepted_semtypes = {'T061'},
                                window=5)


In [9]:
models_param = [{'tagger_path':'data/models/apcner_deid/entities_7/best-model.pt' ,
                  'store_embedding':True,
                'tag_name': 'pheno_pred' }]

ner = NERAnnotator(['sentence'], 'ner', ID='med:v2', models_param=models_param,  device='cuda:1', reduce_embedding='weighted_mean', mini_batch_size=128)

2021-05-04 17:46:32,248 loading file data/models/apcner_deid/entities_7/best-model.pt


In [10]:
dico_umls_exact_match = FeedDictionnary(['ENT/SIGNS','ENT/DIAG_NAME'],
                                        'feed_dictionnary_quick_umls',
                                        ID = 'fd:v3',
                                        path_dict='data/emb_dict_v3.h5py',
                                        threshold=1)

In [11]:
feed_dic_pipeline = [endlines, sections, sentences, ner, dico_umls_exact_match]

In [12]:
from glob import glob

In [None]:
count = 0

for subdir in glob("../../../../marc/Data/CiliTALexport/*"):
    for json_path in tqdm(glob(subdir + "/*.json")):
        try:

            with open(json_path) as h:
                doc = json.load(h)

            if doc['DOCUMENT_ORIGIN_CODE'] == 'ORBIS':
                continue

            doc = Document(
                    raw_text = doc['DISPLAYED_TEXT'],
                    ID = doc['DOCUMENT_NUM'],
                    attributes = {'person_id': doc['PATIENT_NUM']}
                )
            
            try:
                doc.annotate(feed_dic_pipeline)
                count += 1
            except:
                print("Annotation error", doc.source_ID)
        except:
            print("Load error", json_path)



print(count)

In [14]:
print(count)

2102722


with open("../data/export_mincil_pheno_loc.v1.json") as h:
    docs = json.load(h)

docs = [Document(
        raw_text = x['ano_text'],
        ID = x['document_num'],
        attributes = {'person_id': x['patient_num']}
    ) for x in docs]

for doc in tqdm(docs[:]):
    try:
        doc.annotate(feed_dic_pipeline)
    except:
        print(doc.source_ID)
    del doc

### Embed UMLS

In [15]:
from flair.data import Sentence
import numpy as np
import h5py

In [16]:

target_tui = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037'}

conso_head = pd.read_csv('data/umls2_UL/mrconso_headers.csv', header=None)
conso_head = conso_head.iloc[:,0].tolist()
mrconso = pd.read_csv('data/umls2_UL/MRCONSO.RRF', header=None, names=conso_head+[''], sep="|")

sty_head = pd.read_csv('data/umls2_UL/mrsty_headers.csv', header=None)
sty_head = sty_head.iloc[:,0].tolist()
sty = pd.read_csv('data/umls2_UL/MRSTY.RRF', header=None, names=sty_head+[''],  sep="|")

mrconso = mrconso.loc[mrconso.LAT=="FRE"]

tagger = ner.model_zoo[0][1]

hf = h5py.File('data/emb_dict_v3.h5py', 'a')


for i, row in mrconso.loc[mrconso.CUI.isin(sty.loc[sty.TUI.isin(target_tui), "CUI"].tolist())].iterrows():
    tmp = Sentence(row.STR)
    tagger.embeddings.embed(tmp)
    emb = np.concatenate([tok._embeddings['0-transformer-word-data/embeddings/bert-base-medical-huge-cased/checkpoint-800000/'].cpu().numpy().reshape(1, -1) \
                          for tok in tmp], 0).mean(axis=0).reshape(1, -1)
    
    groupname = row.CUI+"/" + row.STR

    if groupname in hf:
        group = hf.get(groupname)
    else:
        group = hf.create_group(groupname)

    group.create_dataset(row.AUI, data = emb)


hf.close()

del mrconso
del sty
del hf

In [35]:
%%time
norm_pheno = NormPheno(['ENT/SIGNS','ENT/DIAG_NAME'], 'normalized_mention', ID='pheno_dic:v3', path_dict='data/emb_dict_v3_noumls.h5py' )

CPU times: user 1min 41s, sys: 11.9 s, total: 1min 53s
Wall time: 2min 6s


In [18]:
pipeline = [endlines, sections, sentences, hypothesis, family, syntagmes,
            negation, regex, umls_syntagme, ner, norm_pheno,
            umls_signs, umls_diag_proc, umls_diag_name, umls_therap]

In [21]:
# data_path = pkg_resources.resource_filename('pymedext_eds', 'data/demo')
file_list = glob("data/test_data" + '/*.txt')

docs = [rawtext_loader(x) for x in file_list]

In [22]:
file_list

['data/test_data/pheno_norm.txt', 'data/test_data/pheno_ner.txt']

In [61]:
for doc in docs:
    doc.annotate(pipeline)

In [62]:
doc.get_annotations('normalized_mention')[0].to_dict()

{'type': 'normalized_mention',
 'value': 'insuffisance renale chronique',
 'ngram': None,
 'span': (66, 97),
 'source': 'pheno_dic:v1',
 'source_ID': '7355399c-a9c5-11eb-bd0a-6d1683a67d6f',
 'isEntity': False,
 'attributes': {'score_cos': 0.8400646364128913,
  'mention': 'sclérose latérale amyotrophique',
  'cui': 'C0022661',
  'label': 'insuffisance renale chronique',
  'hypothesis': 'certain',
  'context': 'patient',
  'score': 0.9832151134808859},
 'ID': '7360264a-a9c5-11eb-bd0a-6d1683a67d6f'}

In [63]:
doc.get_annotations("ENT/SIGNS")[2].to_dict()

{'type': 'ENT/SIGNS',
 'value': 'signe du flot',
 'ngram': None,
 'span': (335, 348),
 'source': 'med:v2',
 'source_ID': '73553af0-a9c5-11eb-bd0a-6d1683a67d6f',
 'isEntity': True,
 'attributes': {'hypothesis': 'certain',
  'context': 'patient',
  'score': 0.678720494111379},
 'ID': '73601ae2-a9c5-11eb-bd0a-6d1683a67d6f'}

In [11]:
ann_list = glob("../../brat_data/covid_pheno_norm/annotator_0_training" + '/*.ann')


In [12]:
def read_brat_ann(path):
    doc_id =  os.path.splitext(os.path.basename(path))[0].split('_')[-1]
    with open(path, "r") as h:
        ann_norm = []
        ann_ner = []
        for line in h.readlines():
            ann_id, field, mention = line.split('\t')
            if ann_id[0] == "T":
                ent_type = field.split(' ')[0]
                start = field.split(' ')[1]
                stop = field.split(' ')[-1]
                ann_ner.append((doc_id, ann_id, ent_type, start, stop, mention.strip()))
            elif ann_id[0] == "N":
                _, ent_id, code = field.split(' ')
                termino, code = code.split(':')
                ann_norm.append((doc_id, ann_id, ent_id, termino, code, mention.strip()))
    
    return(ann_norm, ann_ner)

In [13]:
all_ann = []
for ann_path in ann_list:
    ann_norm, ann_ner = read_brat_ann(ann_path)
    ann_ner = pd.DataFrame(ann_ner, columns=["doc_id", "ent_id", "ent_type", "start", "stop", "mention"])
    ann_norm =  pd.DataFrame(ann_norm, columns = ["doc_id", "ann_id", "ent_id", "termino", "cui", "mention"])
    ann_ner = ann_ner.merge(ann_norm, how= "left", on = ["doc_id", "ent_id"], suffixes=['_ner', '_norm'])
    all_ann.append(ann_ner)

all_ann = pd.concat(all_ann)

In [14]:
all_ann = all_ann.loc[lambda x:x.ent_type.isin(["DIAG_NAME", "SIGNS"])]

In [15]:
all_ann.assign(has_norm = lambda x:x.cui.notnull()).groupby(['ent_type'])['has_norm'].aggregate([sum, len, lambda x: sum(x)/len(x)])

Unnamed: 0_level_0,sum,len,<lambda_0>
ent_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DIAG_NAME,106,126,0.84127
SIGNS,127,180,0.705556


In [16]:
all_ann = all_ann.loc[lambda x:x.cui.notnull()]

In [17]:
all_ann.head()

Unnamed: 0,doc_id,ent_id,ent_type,start,stop,mention_ner,ann_id,termino,cui,mention_norm
7,698430240,T25,SIGNS,1510,1547,douleurs intenses notamment nocturnes,N13,UMLS_FR,C0030193,Douleur SAI
8,698430240,T1,SIGNS,201,208,malaise,N1,UMLS_FR,C0231218,Se sentait malade en général
9,698430240,T2,SIGNS,292,328,contusion directe de l’épaule droite,N2,UMLS_FR,C2218555,
10,698430240,T3,SIGNS,330,357,impotence massive immédiate,N3,UMLS_FR,C0311394,Incapacité de marcher
11,698430240,T4,DIAG_NAME,395,437,rupture étendue au niveau du supra-épineux,N4,UMLS_FR,C0439059,


In [19]:
# pd.DataFrame.from_records(NERAnnotator.doc_to_omop(docs[1]))

In [20]:
def to_pdf(doc, key):
    columns = ["doc_id", "mention", "start", "end", "cui", "label"]
    lines = [(doc.source_ID, t.value, t.span[0], t.span[1], t.attributes["cui"], t.attributes["label"]) \
             for t in doc.get_annotations(key)]
    
    
    df = pd.DataFrame(lines if lines else {k: [] for k in columns}, 
                      columns = columns
                     )    
    return df

In [21]:
ner_acc = []
syn_acc = []

for doc in docs:
    df_syn = to_pdf(doc, 'umls_syntagme')
    
#     ner_acc.append(to_pdf(doc, 'umls_therap'))
    ner_acc.append(to_pdf(doc, 'umls_diag_name'))
#     ner_acc.append(to_pdf(doc, 'umls_diag_proc'))
    ner_acc.append(to_pdf(doc, 'umls_signs'))

    syn_acc.append(df_syn)

ner_acc = pd.concat(ner_acc).assign(mod = "ner")
syn_acc = pd.concat(syn_acc).assign(mod = "syn")

In [22]:
ner_acc.head()

Unnamed: 0,doc_id,mention,start,end,cui,label,mod
0,698430240,contusion,297.0,306.0,C0009938,contusion,ner
1,698430240,rupture,401.0,408.0,C3203359,rupture,ner
2,698430240,rupture,446.0,453.0,C3203359,rupture,ner
3,698430240,épanchement intra-articulaire assez,484.0,519.0,C1253936,epanchement intra-articulair,ner
4,698430240,traumatisme,1429.0,1440.0,C3714660,traumatisme,ner


In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score

In [24]:
def compute_metrics(gold, pred, label = ""):
    tp, fn, fp, tn = confusion_matrix(gold, pred).ravel()
    acc = accuracy_score(gold, pred)
    f1= f1_score(gold, pred, average="binary")
    r = recall_score(gold, pred, average="binary")
    p = precision_score(gold, pred, average="binary")
    
    #reformat metrics
    metrics = pd.DataFrame([[tp + fn, tp, fn, fp, r, p, f1, acc]],
                           columns=['total_pos', "TP", "FN", "FP", "Recall", "Precision", "F1", "Accuracy"], index = [label])
    return metrics

In [25]:
all_merge = (all_ann
             .merge(ner_acc, how = "outer", on =["doc_id", "cui"])
             .merge(syn_acc, how = "outer", on =["doc_id", "cui"])
             .assign(true = lambda x:x.ent_id.notnull())
             .assign(pred_ner = lambda x:x['mod_x'].notnull())
             .assign(pred_q = lambda x:x['mod_y'].notnull())

            )

In [26]:
pd.concat([compute_metrics(all_merge.true, all_merge.pred_ner, label = "NER + QuickUMLS (ent level)"), 
           compute_metrics(all_merge.true, all_merge.pred_q, label = "QuickUMLS (ent level)"),
           compute_metrics(all_merge.drop_duplicates(["doc_id", "cui"]).true, all_merge.drop_duplicates(["doc_id", "cui"]).pred_ner, label = "NER + QuickUMLS (doc level)"),
           compute_metrics(all_merge.drop_duplicates(["doc_id", "cui"]).true, all_merge.drop_duplicates(["doc_id", "cui"]).pred_q, label = "QuickUMLS (doc level)")]
         ).round(2)

Unnamed: 0,total_pos,TP,FN,FP,Recall,Precision,F1,Accuracy
NER + QuickUMLS (ent level),168,73,95,162,0.78,0.86,0.81,0.71
QuickUMLS (ent level),168,3,165,140,0.81,0.78,0.79,0.66
NER + QuickUMLS (doc level),104,55,49,122,0.29,0.51,0.37,0.38
QuickUMLS (doc level),104,3,101,110,0.36,0.38,0.37,0.24


In [15]:
len(syn_acc), len(ner_acc)

(682, 418)

In [16]:
ner_acc.merge(syn_acc, how = "inner", on =["doc_id", "start", "end", "cui"]).shape

(296, 10)

In [17]:
diff = ner_acc.merge(syn_acc, how = "outer", on =["doc_id", "start", "end"]).loc[lambda x:x.cui_x != x.cui_y]

In [18]:
from IPython.display import display_html

In [19]:
gen = ((k, group) for k, group in diff.groupby('doc_id'))


In [21]:
k, group = next(gen)
ner_mention = group.loc[lambda x:x.mod_x.notnull(), ["mention_x", "label_x"]]
syn_mention = group.loc[lambda x:x.mod_y.notnull(), ["mention_y", "label_y"]]

display_html(ner_mention)
display_html(syn_mention)

Unnamed: 0,mention_x,label_x
19,cardiopathie ischémique,myelopathie ischemique
20,infarctus,infarctus
31,échographie cardiaque,echographie cardiaque
33,cardiovasculaire,syncope cardiovasculaire


Unnamed: 0,mention_y,label_y
33,cardiovasculaire,nevrose cardiovasculaire
450,consultation,teleconsultation
451,consultation,teleconsultation
452,suivi de cardiopathie ischémique,cardiopathie ischemique
453,fait infarctus,post infarctus
454,TSA et des membres inférieurs,ulceres des membres inferieurs
455,fraction d’éjection retrouvée,fraction d'ejection
456,échographie,echographie sai
457,consultation,teleconsultation
458,chute,chutes


In [32]:
# python -m quickumls.install -E FR data/umls data/quickumls

In [19]:
from pymedext_eds.viz import display_annotations

In [36]:
doc_demo = rawtext_loader("data/test_data/pheno_norm.txt") 

In [37]:
#norm_pheno.k_neighboors = 2

In [38]:
pipeline = [endlines, sections, sentences, hypothesis, family, syntagmes,
            negation, regex, umls_syntagme, ner, norm_pheno,
            umls_signs, umls_diag_proc, umls_diag_name, umls_therap]

In [39]:
doc_demo.annotate(pipeline)

In [37]:
display_annotations(doc_demo, entities=['ENT/DIAG_NAME', 'ENT/SIGNS'])

In [38]:
list(norm_pheno.dict_label.values())[:10]

[('C0000727', 'ABDOMEN AIGU'),
 ('C0000727', 'Abdomen aigu'),
 ('C0000727', 'Abdomen aigu'),
 ('C0000727', 'Abdomen aigu'),
 ('C0000727', 'Abdomen chirurgical'),
 ('C0000727', 'Abdomen urgent'),
 ('C0000727', 'Syndrome abdominal aigu'),
 ('C0000727', 'Syndrome abdominal aigu'),
 ('C0000727', 'syndrome abdominal aigu'),
 ('C0000729', 'CRAMPES ABDOMINALES')]

In [33]:
print("\n".join(["\t".join([t.attributes['mention'], t.value]) + "\t score: {}".format(round(t.attributes['score_cos'],3)) for t in doc_demo.get_annotations('normalized_mention')]))

kyste au niveau du rein droit	kyste du rein	 score: 0.961
kyste au niveau du poumon droit	Lésion du poumon, avec plaie ouverte du thorax	 score: 0.954
kyste au niveau du pelvis	kyste du pancreas	 score: 0.948
kyste au niveau splénique	kyste du cordon spermatique	 score: 0.952
kyste au niveau de la rate	Lésion de la rate avec plaie ouverte de la cavité	 score: 0.961
kyste saignant	Ulcère duodénal saignant	 score: 0.912
kyste hémorragique	Kyste hémorragique	 score: 0.961
lésion	lesion intracranienne	 score: 0.862
lésion	lesion intracranienne	 score: 0.864
signes d'ICG	signes ecg d'ischemie myocardique	 score: 0.907
signes d'ICD	signes ecg d'ischemie myocardique	 score: 0.908
signes d'IC	signes ecg d'ischemie myocardique	 score: 0.909
Auscultation anormale	auscultation pulmonaire anormale	 score: 0.955
diabète	diabete	 score: 0.95
diabète	diabete	 score: 0.945
IVG	ivu	 score: 0.917
se sent bien	Ne se sent pas bien	 score: 0.948
va bien	Ne se sent pas bien	 score: 0.904
Syndrome de Marphan

In [40]:
print("\n".join(["\t".join([t.attributes['mention'], t.value]) + "\t score: {}".format(round(t.attributes['score_cos'],3)) for t in doc_demo.get_annotations('normalized_mention')]))

kyste au niveau du rein droit	kyste du rein	 score: 0.961
kyste au niveau du poumon droit	kyste du pancreas	 score: 0.952
kyste au niveau du pelvis	kyste du pancreas	 score: 0.948
kyste au niveau splénique	kyste du cordon spermatique	 score: 0.952
kyste au niveau de la rate	kyste du pancreas	 score: 0.947
kyste saignant	kyste	 score: 0.909
kyste hémorragique	kyste ovarien hemorragique	 score: 0.928
lésion	lesion intracranienne	 score: 0.862
lésion	lesion intracranienne	 score: 0.864
signes d'ICG	signes ecg d'ischemie myocardique	 score: 0.907
signes d'ICD	signes ecg d'ischemie myocardique	 score: 0.908
signes d'IC	signes ecg d'ischemie myocardique	 score: 0.909
Auscultation anormale	auscultation pulmonaire anormale	 score: 0.955
diabète	diabete	 score: 0.95
diabète	diabete	 score: 0.945
IVG	ivu	 score: 0.917
se sent bien	se sent tres mal	 score: 0.939
va bien	se sent tres mal	 score: 0.901
Syndrome de Marphan	syndrome de marfan	 score: 0.971
Mydriase areactive	astigmatisme myopique	 sc

In [72]:
display_annotations(doc_demo, entities=['normalized_mention'])

In [73]:
display_annotations(doc_demo, entities=['umls_syntagme'])

In [214]:
display_annotations(doc_demo, entities=['ENT/SIGNS'])

In [215]:
display_annotations(doc_demo, entities=['ENT/DIAG_NAME'])