In [1]:
# Change the working directory to the pymedext_eds/ folder; the data and model
# paths used by later cells are written relative to the working directory.
# NOTE(review): this relative cd is not idempotent — re-running the cell in a
# live kernel attempts a second cd from the new location. Restart the kernel
# before re-running.
%cd pymedext_eds/

/home/ivan/Documents/rel_ext/pymedext_eds


In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits made to the
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os

import pandas as pd
import re
from pprint import pprint
import pkg_resources

from pymedextcore.document import Document
from pymedext_eds.annotators import Endlines, CleanText, SentenceTokenizer, Hypothesis, \
                                    ATCDFamille, SyntagmeTokenizer, Negation, RegexMatcher, \
                                    QuickUMLSAnnotator, Pipeline, SectionSplitter, Pipeline

from pymedext_eds.utils import rawtext_loader
from pymedext_eds.ner import NERAnnotator, NERNormalizer

In [24]:
from pymedext_eds.extract.corpus_generation_tools import get_new_corpus, write_to_brat, write_brat_conf

In [5]:
# Pre-processing annotators. Both read the 'raw_text' annotation directly;
# the CleanText step is currently disabled and kept only for reference:
# cleantext = CleanText(['raw_text'], 'clean_text', 'cleantext:v1')

# Writes its output under the 'section' key.
sections = SectionSplitter(
    ['raw_text'],
    'section',
    ID='sections',
)

# Writes its output under the 'sentence' key (consumed by the NER annotator).
sentences = SentenceTokenizer(
    ['raw_text'],
    'sentence',
    'sentenceTokenizer:v1',
)

sosy|T184|Sign or Symptom  


dsyn|T047|Disease or Syndrome  
neop|T191|Neoplastic Process  
comd|T049|Cell or Molecular Dysfunction  
mobd|T048|Mental or Behavioral Dysfunction   
patf|T046|Pathologic Function  
anab|T190|Anatomical Abnormality  
cgab|T019|Congenital Abnormality  
acab|T020|Acquired Abnormality  
inpo|T037|Injury or Poisoning  

diap|T060|Diagnostic Procedure  
lbpr|T059|Laboratory Procedure  
lbtr|T034|Laboratory or Test Result  

topp|T061|Therapeutic or Preventive Procedure  


In [6]:
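# Reference only (not executed): the union of every UMLS semantic type accepted
# by the QuickUMLS annotators defined in the next cell.
# accepted_semtypes = {'T184', 'T047', 'T191', 'T049', 'T048', 'T046', 'T190', 'T019', 'T020', 'T037', 'T060', 'T059', 'T034', 'T061'}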


In [7]:
# Settings shared by every QuickUMLS annotator in this cell.
# NOTE(review): presumably forwarded unchanged to QuickUMLS, where `threshold`
# is the minimum similarity for a match and `overlapping_criteria` selects how
# overlapping matches are ranked — confirm against the QuickUMLSAnnotator source.
quick_umls_th = 0.9
quick_umls_dist = "score"


def _make_umls_annotator(source_key, target_key, semtypes):
    """Build one QuickUMLSAnnotator using the settings shared across this notebook.

    The four annotators below differ only in the three values taken here;
    everything else (index location, version tag, similarity settings, window)
    is written once in this function instead of being copy-pasted per call.

    Args:
        source_key: annotation key to read; passed as a one-element list in
            the first positional argument.
        target_key: key under which the annotator stores its results (second
            positional argument).
        semtypes: set of UMLS semantic type ids, passed as ``accepted_semtypes``.

    Returns:
        The configured QuickUMLSAnnotator instance.
    """
    return QuickUMLSAnnotator([source_key], target_key, 'QuickUMLS:2020AA',
                              quickumls_fp='data/umls2_UL/',
                              overlapping_criteria=quick_umls_dist,
                              threshold=quick_umls_th,
                              similarity_name='jaccard',
                              accepted_semtypes=semtypes,
                              window=5)


# T184: Sign or Symptom
umls_signs = _make_umls_annotator('ENT/SIGNS', 'umls_signs', {'T184'})

# T060: Diagnostic Procedure, T059: Laboratory Procedure,
# T034: Laboratory or Test Result
umls_diag_proc = _make_umls_annotator('ENT/DIAG_PROC', 'umls_diag_proc',
                                      {'T060', 'T059', 'T034'})

# Disease-like types: T047 Disease or Syndrome, T191 Neoplastic Process,
# T049 Cell or Molecular Dysfunction, T048 Mental or Behavioral Dysfunction,
# T046 Pathologic Function, T190 Anatomical Abnormality,
# T019 Congenital Abnormality, T020 Acquired Abnormality,
# T037 Injury or Poisoning
umls_diag_name = _make_umls_annotator('ENT/DIAG_NAME', 'umls_diag_name',
                                      {'T047', 'T191', 'T049', 'T048', 'T046',
                                       'T190', 'T019', 'T020', 'T037'})

# T061: Therapeutic or Preventive Procedure
umls_therap = _make_umls_annotator('ENT/THERAP_PROC', 'umls_therap_proc', {'T061'})


In [8]:
# One entry per tagger model: the checkpoint to load and the tag name to use.
models_param = [
    dict(
        tagger_path='data/models/apcner_deid/entities_7/best-model.pt',
        tag_name='pheno_pred',
    ),
]

# NER annotator reading the 'sentence' annotations and writing under 'ner'.
# NOTE(review): device is hard-coded to 'cuda:1' (a second GPU); this cell
# presumably fails on machines without one — confirm before sharing.
ner = NERAnnotator(
    ['sentence'],
    'ner',
    ID='med:v2',
    models_param=models_param,
    device='cuda:1',
)

2021-03-15 09:55:17,233 loading file data/models/apcner_deid/entities_7/best-model.pt


In [None]:
# Options for the corpus builder, gathered in one place so they are easy to tune.
# NOTE(review): the exact meaning of size/threshold/min_n_tokens is defined in
# corpus_generation_tools.get_new_corpus — the values here presumably disable
# sub-sampling and filtering; confirm there.
corpus_options = dict(
    text_path="../data/CiliTalConfirm/0",
    # Annotators, in the order they are handed to the corpus builder.
    pipeline=[
        sections,
        sentences,
        ner,
        umls_signs,
        umls_diag_proc,
        umls_diag_name,
        umls_therap,
    ],
    pheno_ent_type=["ENT/SIGNS", "ENT/DIAG_NAME"],
    size=None,
    threshold=0,
    min_n_tokens=0,
    seed=6,
    verbose=1,
)

corpus = get_new_corpus(**corpus_options)


In [10]:
# Export the generated corpus to the brat directory next to the source texts.
write_to_brat(
    corpus=corpus,
    output_dir="../data/CiliTalConfirm/0/brat",
)

In [29]:
# Link settings shared by both normalisation resources.
wikipedia_links = {
    "<URL>": "http://en.wikipedia.org",
    "<URLBASE>": "http://en.wikipedia.org/?curid=%s",
}

# One entry per normalisation resource; only the database differs.
normalisation_sources = {
    "UMLS_ALL": {"DB": "umls0/umls_bglinsty", **wikipedia_links},
    "UMLS_FR": {"DB": "umls0/umls_frlinsty", **wikipedia_links},
}

# Write the brat configuration into the same directory as the exported corpus.
# Only the two phenotype entity types are declared, and no relations.
write_brat_conf(
    outdir="../data/CiliTalConfirm/0/brat",
    entities=["SIGNS", "DIAG_NAME"],
    norm=normalisation_sources,
    relations={},
)