In [1]:
# Reload the IPython `autoreload` extension, then switch it to mode 2 so that
# edits made to imported modules are picked up live, without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
from glob import glob
import pandas as pd
import re
from pprint import pprint
import pkg_resources

from pymedextcore.document import Document
from pymedext_eds.annotators import Endlines, SentenceTokenizer, SectionSplitter
from pymedext_eds.utils import rawtext_loader
from pymedext_eds.med import MedicationAnnotator, NewMedicationAnnotator, MedicationNormalizer

In [3]:
# One entry per tagger passed to the medication annotator: the path of the
# serialized model and the tag name associated with it.
models_param = [
    {'tagger_path': model_file, 'tag_name': tag}
    for model_file, tag in (
        ('data/models/apmed5/entities/final-model.pt', 'entity_pred'),
        ('data/models/apmed5/events/final-model.pt', 'event_pred'),
        ("data/models/apmed5/drugblob/final-model.pt", 'drugblob_pred'),
    )
]

# Resolve the `data/romedi` resource directory shipped inside the installed
# `pymedext_eds` package.
data_path = pkg_resources.resource_filename('pymedext_eds', 'data/romedi')

# glob() returns matches in arbitrary order, so sort them to make the selected
# file reproducible, and fail with an explicit message (instead of a bare
# IndexError) when the directory contains no '*.p' file.
romedi_files = sorted(glob(data_path + '/*.p'))
if not romedi_files:
    raise FileNotFoundError("No '*.p' file found in " + data_path)
romedi_path = romedi_files[0]

In [4]:
# Instantiate every annotator used by the pipeline. Each constructor receives a
# list of annotation types, a single annotation type and an ID -- presumably
# (inputs, output, identifier); confirm against the pymedext_eds API.

endlines = Endlines(
    ["raw_text"],
    "clean_text",
    ID="endlines",
)

sections = SectionSplitter(
    ['clean_text'],
    "section",
    ID='sections',
)

sentenceSplitter = SentenceTokenizer(
    ["section"],
    "sentence",
    ID="sentences",
)

# Building this annotator loads the three taggers listed in `models_param`
# (the cell logs one "loading file ..." line per model). Inference is pinned to CPU.
med = MedicationAnnotator(
    ['sentence'],
    'med',
    ID='med:v2',
    models_param=models_param,
    device='cpu',
)

# Normalizer configured with the Romedi file located earlier in the notebook.
norm = MedicationNormalizer(
    ['ENT/DRUG', 'ENT/CLASS'],
    'normalized_mention',
    ID='norm',
    romedi_path=romedi_path,
)

2021-03-18 15:26:23,727 loading file data/models/apmed5/entities/final-model.pt
2021-03-18 15:26:25,361 loading file data/models/apmed5/events/final-model.pt
2021-03-18 15:26:26,996 loading file data/models/apmed5/drugblob/final-model.pt


In [5]:
# Ordered list of annotators handed to `Document.annotate`. The trailing notes
# show the annotation types each annotator was constructed with.
pipeline = [
    endlines,          # raw_text   -> clean_text
    sections,          # clean_text -> section
    sentenceSplitter,  # section    -> sentence
    med,               # sentence   -> med
    norm,              # ENT/DRUG, ENT/CLASS -> normalized_mention
]
# Disabled variant kept for reference; `new_med` is not defined in the visible cells.
# new_pipeline = [endlines, sections, sentenceSplitter, new_med,  norm]

In [6]:
# Resolve the `data/demo` resource directory shipped inside the installed
# `pymedext_eds` package.
# NOTE: this rebinds `data_path`, which pointed at the `data/romedi` directory before.
data_path = pkg_resources.resource_filename('pymedext_eds', 'data/demo')

# glob() returns matches in arbitrary order; sorting keeps the order of `docs`
# (and therefore what `docs[0]` refers to) identical from one run to the next.
file_list = sorted(glob(data_path + '/*.txt'))
if not file_list:
    # Fail here with a clear message rather than later on an empty `docs` list.
    raise FileNotFoundError("No '*.txt' file found in " + data_path)

# Load one document per demo text file.
docs = [rawtext_loader(x) for x in file_list]

In [8]:
# Run the whole pipeline on every loaded demo document.
# The return value of annotate() is ignored here; a later cell reads
# `.annotations` from an annotated document, so annotate() presumably stores
# its results on the document itself -- confirm against pymedextcore.
# Note: the loop variable `doc` stays bound in the notebook namespace afterwards.
for doc in docs:
    doc.annotate(pipeline)

Ignore 2 sentence(s) with no tokens.


In [14]:
# Bug test: annotate a synthetic document made of the same prescription
# sentence repeated 100 times, then display the last annotation as a dict.
# NOTE(review): in the recorded output, the last ENT/DRUG annotation carries
# several ENT/FREQ attributes whose spans start before the drug's own span,
# i.e. they come from earlier repetitions of the sentence -- presumably the
# bug being reproduced; confirm.
# NOTE(review): "basse" looks like a typo for "baisse"; it is left untouched
# because the string is the model input for this reproduction.
doc = Document("doliprane 3 fois par jour tant que la fièvre ne basse pas et " * 100)
doc.annotate(pipeline)
doc.annotations[-1].to_dict()

{'type': 'ENT/DRUG',
 'value': 'doliprane',
 'ngram': None,
 'span': (976, 985),
 'source': 'med:v2',
 'source_ID': 'f1a4ee5a-87f6-11eb-b720-0242ac100a62',
 'isEntity': False,
 'attributes': {'section': 'head',
  'score': 0.99920254945755,
  'ENT/FREQ': [{'value': '3 fois par jour',
    'span': (803, 818),
    'type': 'ENT/FREQ',
    'attributes': {'score': 0.8958472311496735},
    'source_ID': 'f1a4ee5a-87f6-11eb-b720-0242ac100a62',
    'in_blob': True,
    'normalized_mention': 'H_1 1 0 1'},
   {'value': '3 fois par jour',
    'span': (864, 879),
    'type': 'ENT/FREQ',
    'attributes': {'score': 0.9133852571249008},
    'source_ID': 'f1a4ee5a-87f6-11eb-b720-0242ac100a62',
    'in_blob': True,
    'normalized_mention': 'H_1 1 0 1'},
   {'value': '3 fois par jour',
    'span': (925, 940),
    'type': 'ENT/FREQ',
    'attributes': {'score': 0.943561464548111},
    'source_ID': 'f1a4ee5a-87f6-11eb-b720-0242ac100a62',
    'in_blob': True,
    'normalized_mention': 'H_1 1 0 1'},
   {'val

In [None]:
# Convert the first annotated demo document with `doc_to_omop`, load the
# resulting records into a DataFrame and display it transposed.
pd.DataFrame.from_records(
    NewMedicationAnnotator.doc_to_omop(docs[0])
).transpose()