In [1]:
import os
import json
import time
from pymetamap import MetaMap
from nltk.tokenize.punkt import PunktSentenceTokenizer

In [2]:
# corpus path
corpus_path = '/home/denglizong/venv4attr/corpus/WikiPediaR5'
# os.listdir( corpus_path )[0:3]


In [3]:
# list of disease names
list_of_diseases = [ filename.replace('.txt',"") 
                    for filename in os.listdir( corpus_path ) if filename.endswith(".txt")]
#
list_of_diseases[0:3]                    

['Acinetobacter infections',
 'Actinomycosis',
 'African sleeping sickness (African trypanosomiasis)']

In [4]:
# annotate corpus with metamap 
# explore optimal parameters of metamap based on training set

In [5]:
# creat instance of metamap
mm = MetaMap.get_instance("/home/denglizong/public_mm/bin/metamap20")

In [6]:
# restrict scope
semantic_group_of_disorder = ['acab','anab','cgab','comd','dsyn','emod','fndg','inpo','mobd','patf','sosy']
semantic_group_of_anatomy  = ['anst','bdsu','bdsy','bpoc','bsoj','emst','ffas']
restrict_to_sts = semantic_group_of_disorder + semantic_group_of_anatomy

In [7]:
# restrict sources
# restrict_to_sources = ['SNOMEDCT_US']

In [8]:
# help( mm.extract_concepts )

In [10]:
%%time
# given .txt and .ann
# output annotation results in text file
dict_of_concepts_in_texts = {}

for index, disease_name in enumerate(list_of_diseases):
    # text file 
    textfile = os.path.join( corpus_path, disease_name + ".txt" )

    # text string
    textstr = ""
    with open(textfile, 'r', encoding='utf-8') as f:
        textstr = f.read()
    # repalce '\n' to ','
    textstr = textstr.replace('\n',',')

    # sentence tokonize with position
    text_sents = []
    text_sents_with_spans = []
    for start, end in PunktSentenceTokenizer().span_tokenize(textstr):
        text_sents_with_spans.append( (start,end, textstr[start:end]) )
        text_sents.append( textstr[start:end] )
    # print( len(text_sents) )
    # print( text_sents[0] )


    # sentence index
    sents_idx = range( len(text_sents) )
    # print( list(sents_idx) )

    # annotation sentences with metamap
    # 标注参数 composite_phrase = 2, allow_concept_gaps=True, ignore_word_order=True (aim to improve recall)
    # concepts,error = mm.extract_concepts(text_sents, list(sents_idx), restrict_to_sts=restrict_to_sts, \
    #                                     composite_phrase=2, allow_concept_gaps=True, ignore_word_order=True, no_nums = ['fndg'], \
    #                                     word_sense_disambiguation=True)
    # 
    # change parameters
    # composite_phrase=0, allow_concept_gaps=False, ignore_word_order=False
    # SNOMEDCT_US, HPO
    concepts,error = mm.extract_concepts(text_sents, list(sents_idx), \
                                        restrict_to_sts=restrict_to_sts, restrict_to_sources=['HPO'], \
                                        composite_phrase=2, allow_concept_gaps=False, ignore_word_order=False, no_nums = ['fndg'], \
                                        word_sense_disambiguation=True)

    # change parameters
    # concepts,error = mm.extract_concepts(text_sents, list(sents_idx), restrict_to_sts=restrict_to_sts, \
    #                                     composite_phrase=0, allow_concept_gaps=False, ignore_word_order=True, no_nums = ['fndg'], \
    #                                     word_sense_disambiguation=True)    

    
    # save annotation result
    print(index, disease_name, len(text_sents), len(concepts) )
    dict_of_concepts_in_texts.setdefault( disease_name, concepts )                                   

0 Acinetobacter infections 2 10
1 Actinomycosis 4 5
2 African sleeping sickness (African trypanosomiasis) 16 22
3 AIDS (acquired immunodeficiency syndrome) 37 34
4 Amoebiasis 21 8
5 Anaplasmosis 6 10
6 Angiostrongyliasis 9 24
7 Anthrax 32 50
8 Argentine hemorrhagic fever 8 4
9 Ascariasis 15 21
10 Aspergillosis 12 23
11 Astrovirus infection 10 11
12 Babesiosis 13 46
13 Bacterial meningitis 39 73
14 Bacterial pneumonia 1 9
15 Bacterial vaginosis 18 12
16 Balantidiasis 1 10
17 Bartonellosis 31 56
18 Baylisascaris infection 1 11
19 Bejel 2 1
20 BK virus infection 10 9
21 Blastocystosis 9 9
22 Blastomycosis 2 27
23 Bolivian hemorrhagic fever 6 11
24 Botulism (and Infant botulism) 28 33
25 Brucellosis 8 33
26 Bubonic plague 8 17
27 Buruli ulcer 7 9
28 Campylobacteriosis 14 25
29 Candidiasis (Moniliasis; Thrush) 17 29
30 Carrion's disease 26 31
31 Cat-scratch disease 18 28
32 Cellulitis 3 7
33 Chagas disease (American trypanosomiasis) 30 39
34 Chancroid 9 10
35 Chickenpox 18 36
36 Chikungunya

In [47]:
# finish in 17min

In [10]:
# save annotation results to json file
save_path = '/home/denglizong/SSUMiner/corpus/MetaMapAnnotations'

In [11]:
for disease_name in dict_of_concepts_in_texts:
    # list of concepts (as dict) in the text
    list_of_concepts_in_text = []

    #
    concepts = dict_of_concepts_in_texts[disease_name]
    for concept in concepts:
        # keep concept with mm tag
        if 'mm' in dir(concept):
            # 记录该concept的信息为词典形式
            tmpdict = {}    
            #
            tmpdict.setdefault( "index", concept.index ) 
            tmpdict.setdefault( "score", concept.score ) 
            tmpdict.setdefault( "preferred_name", concept.preferred_name ) 
            tmpdict.setdefault( "cui", concept.cui ) 
            tmpdict.setdefault( "semtypes", concept.semtypes ) 
            tmpdict.setdefault( "trigger", concept.trigger ) 
            tmpdict.setdefault( "pos_info", concept.pos_info )     
            tmpdict.setdefault( "tree_codes", concept.tree_codes )     
            #
            # 
            if len(tmpdict) != 0:
                list_of_concepts_in_text.append( tmpdict )    

    # 
    outfilepath = os.path.join( save_path, disease_name+'.json' )
    json.dump(list_of_concepts_in_text, open(outfilepath,'w',encoding='utf-8'),indent=2,ensure_ascii=False)                

In [12]:
# data exploration for metamap annotations

In [13]:
# compare metamap with metamap lite

In [20]:
# compute performance of matches 

In [11]:
# get names for training and testing 
data_dir = "/home/denglizong/SSUMiner/corpus/"

In [12]:
# disease names in training and testing set
diseases_used_for_training = []
diseases_used_for_test = []

# training 
file_of_diseases_for_training = os.path.join( data_dir+"TrainTestSplit", "diseases_for_training.txt" )
with open( file_of_diseases_for_training, 'r', encoding='utf-8' ) as f:
    for line in f.readlines():
        if line.strip() != "":
            diseases_used_for_training.append( line.strip()  )

# testing
file_of_diseases_for_test = os.path.join( data_dir+"TrainTestSplit", "diseases_for_test.txt" )
with open( file_of_diseases_for_test, 'r', encoding='utf-8' ) as f:
    for line in f.readlines():
        if line.strip() != "":
            diseases_used_for_test.append( line.strip()  )

#
len( diseases_used_for_training ), len( diseases_used_for_test )            

(133, 60)

In [13]:
import re

In [14]:
# get span, name and types of concepts in brat
# dict_of_brat_annotations.setdefault( ent_id, ('FindingSite', ent_name, pos_info) )    
def read_brat_annotation_file( file_of_brat_annotation ):
    # to get
    dict_of_brat_annotations = {}
    # text string
    text_string = ""
    with open(file_of_brat_annotation,'r',encoding='utf-8') as f:
        text_string = f.read()    
    # annotation content
    ann_lines = []
    with open(file_of_brat_annotation,'r',encoding='utf-8') as f:
        ann_lines = f.readlines()
    # resolve annotation content line by line
    for line in ann_lines:
        # 
        if line.startswith('T'): 
            # phenotype concept
            if re.search('Phenotype',line,re.I):
                # T1	Phenotype 69 86	painful abscesses
                ent_id, ent_info, ent_name = line.strip().split('\t')
                # position info 153 157;180 194
                pos_info = ent_info.replace('Phenotype ','')
                # 
                # dict_of_brat_annotations.setdefault( ent_id, ('Phenotype', ent_name, pos_info) )   
                dict_of_brat_annotations.setdefault( pos_info, ('Phenotype', ent_name, ent_id) )
            # findingsite concept
            elif re.search('FindingSite',line,re.I):
                # T4	FindingSite 114 120	breast
                ent_id, ent_info, ent_name = line.strip().split('\t')
                # position 114 120
                pos_info = ent_info.replace('FindingSite ','')   
                #
                # dict_of_brat_annotations.setdefault( ent_id, ('FindingSite', ent_name, pos_info) )   
                dict_of_brat_annotations.setdefault( pos_info, ('FindingSite', ent_name, ent_id) )     
    #
    return dict_of_brat_annotations   
    

In [15]:
# disease_name = "Aspergillosis"

In [16]:
# resolve annotation results of metamap , index by span (compatible with brat)
# concept in concepts 
# continuous concept
# The simplest form : 228/6  pos_info='228/6' 
# non-continuous concept
# Multiple comma-separated StartPos/Length pairs:  7059/5,7073/5 (indicating disjoint text strings mapped to one concept)
# Multiple comma-separated bracketed StartPos/Length pairs:  [1351/8],[1437/8]  multiple occurrences of a concept in an utterance
# Finally, forms (b) and (c) above can in rare cases be combined, e.g., [4061/10,4075/11],[4166/10,4180/11] 
def read_metamap_annotation_result( textstr, list_of_concepts_in_text ):
    # to obtain
    # span_key (compatible with brat) : concept
    metamap_annotated_concepts_by_spans = {}

    # sentences and positions
    text_sents_with_spans = []
    for start, end in PunktSentenceTokenizer().span_tokenize(textstr):
        text_sents_with_spans.append( (start,end, textstr[start:end]) )

    # iterate concepts
    for concept in list_of_concepts_in_text:
        span_key = ""

        # location of sentences in text
        sent_idx  = int( concept.index )
        sent_spos = text_sents_with_spans[sent_idx][0]

        # 
        string_of_pos_info = ""
        if ';' in concept.pos_info:
            string_of_pos_info = concept.pos_info.split(';')[0]
        else:
            string_of_pos_info = concept.pos_info

        # concept.pos_info 
        if '[' not in string_of_pos_info:
            # case a)
            if string_of_pos_info.count('/') == 1 :
                # 统计 case a
                spos, slen = string_of_pos_info.split('/')
                spos = int(spos) -1 + sent_spos
                slen = int(slen)
                epos = spos + slen
                # 
                span_key = str(spos) + ' '+ str(epos) 
                # 
                if span_key not in metamap_annotated_concepts_by_spans:
                    metamap_annotated_concepts_by_spans.setdefault( span_key, concept )

            #  case b
            elif string_of_pos_info.count('/') > 1:
                # ‘7059/5,7073/5’ 
                pos_of_parts = []
                for part_pos_info in string_of_pos_info.split(','):
                    # print( part_pos_info , concept)
                    spos, slen = part_pos_info.split('/')
                    spos = int(spos) -1 + sent_spos
                    slen = int(slen)
                    epos = spos + slen  
                    # 
                    pos_of_parts.append( str(spos) + ' '+ str(epos)  ) 
                #
                span_key = ';'.join( pos_of_parts )  
                # 
                if span_key not in metamap_annotated_concepts_by_spans:
                    metamap_annotated_concepts_by_spans.setdefault( span_key, concept )                
        # 
        # [1351/8],[1437/8]  case c)
        # [4061/10,4075/11],[4166/10,4180/11]   case d)
        else:
            #  [1351/8],[1437/8] --> ['[1351/8]', '[1437/8]']
            for bracket_pos_info in re.findall('\[.*?\]',string_of_pos_info):
                # change  '[1351/8]' to '1351/8' ,  case a)
                # change  '[4061/10,4075/11]' to 4061/10,4075/11 , case b)
                cleaned_pos_info = bracket_pos_info[1:-1]
                # 
                # case c) -->  case a)
                if cleaned_pos_info.count('/') == 1 :
                    # 
                    spos, slen = cleaned_pos_info.split('/')
                    spos = int(spos) -1 + sent_spos
                    slen = int(slen)
                    epos = spos + slen
                    # 
                    span_key = str(spos) + ' '+ str(epos) 
                    # 
                    if span_key not in metamap_annotated_concepts_by_spans:
                        metamap_annotated_concepts_by_spans.setdefault( span_key, concept )                    
                # case d) --> case b
                elif cleaned_pos_info.count('/') > 1:
                    # ‘7059/5,7073/5’ 
                    pos_of_parts = []
                    for part_pos_info in cleaned_pos_info.split(','):
                        spos, slen = part_pos_info.split('/')
                        spos = int(spos) -1 + sent_spos
                        slen = int(slen)
                        epos = spos + slen  
                        # 
                        pos_of_parts.append( str(spos) + ' '+ str(epos)  ) 
                    #
                    span_key = ';'.join( pos_of_parts )  
                    # 
                    if span_key not in metamap_annotated_concepts_by_spans:
                        metamap_annotated_concepts_by_spans.setdefault( span_key, concept )        
    #
    return metamap_annotated_concepts_by_spans                                                        


In [17]:
# statistics of entities annotated by metamap

counts = [0]*5

for disease_name in list_of_diseases:
# for disease_name in diseases_used_for_training:
    # disease_name的metamap annotation result
    concepts_annotated_by_metamap = dict_of_concepts_in_texts[disease_name]
    # 将pos_info的形式输出来看看
    for concept in concepts_annotated_by_metamap:
        string_of_pos_info = ""
        if ';' in concept.pos_info:
            string_of_pos_info = concept.pos_info.split(';')[0]
            counts[4] +=1
        else:
            string_of_pos_info = concept.pos_info

        # concept.pos_info 
        # the simplest form 
        if '[' not in string_of_pos_info:
            #  case a)
            if string_of_pos_info.count('/') == 1 :
                #  case a
                counts[0] +=1
                # position
                spos, slen = string_of_pos_info.split('/')
                spos = int(spos) -1
                slen = int(slen)
                epos = spos + slen
                # 
                span_key = str(spos) + ' '+ str(epos) 
            # case b
            elif string_of_pos_info.count('/') > 1:
                #  case b
                counts[1] += 1
                # ‘7059/5,7073/5’ 
                pos_of_parts = []
                for part_pos_info in string_of_pos_info.split(','):
                    # print( part_pos_info , concept)
                    spos, slen = part_pos_info.split('/')
                    spos = int(spos) -1
                    slen = int(slen)
                    epos = spos + slen  
                    # 
                    pos_of_parts.append( str(spos) + ' '+ str(epos)  ) 
                #
                span_key = ';'.join( pos_of_parts )  
        # 
        # [4061/10,4075/11],[4166/10,4180/11]  case d)
        else:
            # [1351/8],[1437/8] --> ['[1351/8]', '[1437/8]']
            for bracket_pos_info in re.findall('\[.*?\]',string_of_pos_info):
                # 将 '[1351/8]' 变成 '1351/8' ,  case a)
                # 将 '[4061/10,4075/11]' 变成 4061/10,4075/11 , case b)
                cleaned_pos_info = bracket_pos_info[1:-1]
                # 
                # case c) --> case a)
                if cleaned_pos_info.count('/') == 1 :
                    # count
                    counts[2] +=1
                    # 解析概念的起止位置
                    spos, slen = cleaned_pos_info.split('/')
                    spos = int(spos) -1
                    slen = int(slen)
                    epos = spos + slen
                    # 生成brat中的span_key
                    span_key = str(spos) + ' '+ str(epos) 
                # case d) -->  case b
                elif cleaned_pos_info.count('/') > 1:
                    # count
                    counts[3] += 1
                    # ‘7059/5,7073/5’ 
                    pos_of_parts = []
                    for part_pos_info in cleaned_pos_info.split(','):
                        spos, slen = part_pos_info.split('/')
                        spos = int(spos) -1
                        slen = int(slen)
                        epos = spos + slen  
                        # 
                        pos_of_parts.append( str(spos) + ' '+ str(epos)  ) 
                    #
                    span_key = ';'.join( pos_of_parts )                 

counts

[4194, 225, 243, 0, 187]

In [18]:
# compare text span annotated by human and machine  

In [19]:
from interval import Interval

In [20]:
# precision = true prediction/total prediction

# to obtain
full_matched_pred_entity_counts    = [0,0]
partial_matched_pred_entity_counts = [0,0]

#  
total_pred_entity_counts = [0,0]


for disease_name in diseases_used_for_training:
    # 
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read() 
    # 
    raw_text = raw_text.replace('\n',',')

    # human annotation
    file_of_brat_annotation  = os.path.join( corpus_path, disease_name + '.ann' )
    dict_of_brat_annotations = read_brat_annotation_file(file_of_brat_annotation) 

    # machine annotation
    dict_of_meta_annotations = read_metamap_annotation_result( raw_text, dict_of_concepts_in_texts[disease_name] )

    # compare
    for span_key_of_metamap in dict_of_meta_annotations:
        # metamap concepts
        concept = dict_of_meta_annotations[span_key_of_metamap]

        # strict and relax match
        strict_match_of_interval = False
        relax_match_of_interval  = False

        # entity type
        pred_type = ""
        # '[sosy]'
        if concept.semtypes[1:-1] in semantic_group_of_disorder :
            pred_type = 'Phenotype'
        elif concept.semtypes[1:-1] in semantic_group_of_anatomy :
            pred_type = 'FindingSite'
        
        # stat
        if pred_type == 'Phenotype':
            total_pred_entity_counts[0] +=1
        elif pred_type == 'FindingSite':
            total_pred_entity_counts[1] +=1

        # span
        list_of_span_intervals_by_metamap = []

        # 
        if ';' in span_key_of_metamap:
            for pos_info in span_key_of_metamap.split(';'):
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                # 
                list_of_span_intervals_by_metamap.append( Interval(spos, epos) )
        else:
            pos_info = span_key_of_metamap
            #
            spos, epos = pos_info.split(' ')
            spos = int(spos)
            epos = int(epos)
            # 
            list_of_span_intervals_by_metamap.append( Interval(spos, epos) )     

        # list_of_span_intervals_by_metamap
        # compare
        for span_key_of_expert in dict_of_brat_annotations:
            # type
            real_type = dict_of_brat_annotations[span_key_of_expert][0] 

            # span
            list_of_span_intervals_by_expert = []

            # span_key
            if ';' in span_key_of_expert:
                #  span_key_of_expert mistaked as span_key_of_metamap
                for pos_info in span_key_of_expert.split(';'):
                    spos, epos = pos_info.split(' ')
                    spos = int(spos)
                    epos = int(epos)
                    # 
                    list_of_span_intervals_by_expert.append( Interval(spos, epos) )
            else:
                pos_info = span_key_of_expert
                #
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                # 
                list_of_span_intervals_by_expert.append( Interval(spos, epos) )      

            # compare
            if real_type == pred_type:
                if set( list_of_span_intervals_by_expert ) == set( list_of_span_intervals_by_metamap ):
                    # print("full match", list_of_span_intervals_by_expert, list_of_span_intervals_by_metamap )
                    strict_match_of_interval = True
                    break
                else:
                    # overlap
                    for _span_interval_by_metamap in list_of_span_intervals_by_metamap:
                        if relax_match_of_interval == True:
                            break
                        else:
                            for _span_interval_by_expert in list_of_span_intervals_by_expert:
                                if _span_interval_by_metamap.overlaps( _span_interval_by_expert ):
                                    relax_match_of_interval = True
                                    break

        # count
        if strict_match_of_interval == True:
            # print( "full match" +'\t' + span_key_of_metamap +'\t' + concept.trigger + '\t' )
            if pred_type == 'Phenotype':
                full_matched_pred_entity_counts[0] +=1
            elif pred_type == 'FindingSite':
                full_matched_pred_entity_counts[1] +=1      
        elif relax_match_of_interval == True:
            # print( "partial match" +'\t' + span_key_of_metamap +'\t' + concept.trigger + '\t' )
            if pred_type == 'Phenotype':
                partial_matched_pred_entity_counts[0] +=1
            elif pred_type == 'FindingSite':
                partial_matched_pred_entity_counts[1] +=1                                 

In [21]:
# print
print(full_matched_pred_entity_counts)
print(partial_matched_pred_entity_counts)
print(total_pred_entity_counts)


[1696, 0]
[475, 0]
[3341, 0]


In [22]:
# recall = coverred annotations/total annotations

# to get
full_coverred_real_entity_counts    = [0,0]
partial_coverred_real_entity_counts = [0,0]

#  
total_real_entity_counts = [0,0]

for disease_name in diseases_used_for_training:
    # 
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read() 
    raw_text = raw_text.replace('\n',',') 

    # human annotation
    file_of_brat_annotation  = os.path.join( corpus_path, disease_name + '.ann' )
    dict_of_brat_annotations = read_brat_annotation_file(file_of_brat_annotation) 

    # machine annotation
    dict_of_meta_annotations = read_metamap_annotation_result( raw_text, dict_of_concepts_in_texts[disease_name] )


    # compare
    for span_key_of_expert in dict_of_brat_annotations:
        # 
        strict_match_of_interval = False
        relax_match_of_interval  = False

        # type
        real_type = dict_of_brat_annotations[span_key_of_expert][0] 

        # count
        if real_type == 'Phenotype':
            total_real_entity_counts[0] +=1
        elif real_type == 'FindingSite':
            total_real_entity_counts[1] +=1

        # span
        list_of_span_intervals_by_expert = []

        # span_key
        if ';' in span_key_of_expert:
            for pos_info in span_key_of_expert.split(';'):
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                list_of_span_intervals_by_expert.append( Interval(spos, epos) )
        else:
            pos_info = span_key_of_expert
            #
            spos, epos = pos_info.split(' ')
            spos = int(spos)
            epos = int(epos)
            list_of_span_intervals_by_expert.append( Interval(spos, epos) )          

        # compare
        for span_key_of_metamap in dict_of_meta_annotations:
            # metamap concept
            concept = dict_of_meta_annotations[span_key_of_metamap]

            # type
            pred_type = ""
            # '[sosy]'
            if concept.semtypes[1:-1] in semantic_group_of_disorder :
                pred_type = 'Phenotype'
            elif concept.semtypes[1:-1] in semantic_group_of_anatomy :
                pred_type = 'FindingSite'
            

            # 
            list_of_span_intervals_by_metamap = []

            # 
            if ';' in span_key_of_metamap:
                for pos_info in span_key_of_metamap.split(';'):
                    spos, epos = pos_info.split(' ')
                    spos = int(spos)
                    epos = int(epos)
                    # 
                    list_of_span_intervals_by_metamap.append( Interval(spos, epos) )
            else:
                pos_info = span_key_of_metamap
                #
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                # 
                list_of_span_intervals_by_metamap.append( Interval(spos, epos) )    

            # list_of_span_intervals_by_metamap 
            if real_type == pred_type:
                # 
                if set( list_of_span_intervals_by_expert ) == set( list_of_span_intervals_by_metamap ):
                    strict_match_of_interval = True
                    break
                # 
                else:
                    # 
                    for _span_interval_by_expert in list_of_span_intervals_by_expert:
                        if relax_match_of_interval == True:
                            break
                        else:
                            for _span_interval_by_metamap in list_of_span_intervals_by_metamap:
                                if _span_interval_by_expert.overlaps( _span_interval_by_metamap ):
                                    relax_match_of_interval = True
                                    break   

        # strict_match_of_interval                           
        if strict_match_of_interval == True:
            if real_type == 'Phenotype':
                full_coverred_real_entity_counts[0] +=1
            elif real_type == 'FindingSite':
                full_coverred_real_entity_counts[1] +=1      
        elif relax_match_of_interval == True:
            if real_type == 'Phenotype':
                partial_coverred_real_entity_counts[0] +=1
            elif real_type == 'FindingSite':
                partial_coverred_real_entity_counts[1] +=1                                        

In [23]:
print(full_coverred_real_entity_counts)
print(partial_coverred_real_entity_counts)
print(total_real_entity_counts)


[1696, 0]
[481, 0]
[2917, 332]


In [24]:
# filting General but meanless terms
# CUI : count
stat_of_concepts = {}

for disease_name in diseases_used_for_training:
    # 
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read()  
    raw_text = raw_text.replace('\n',',')

    # 
    file_of_brat_annotation  = os.path.join( corpus_path, disease_name + '.ann' )
    dict_of_brat_annotations = read_brat_annotation_file(file_of_brat_annotation) 

    # machine annotation
    dict_of_meta_annotations = read_metamap_annotation_result( raw_text, dict_of_concepts_in_texts[disease_name] )

    # compare
    for span_key_of_metamap in dict_of_meta_annotations:
        # 
        concept = dict_of_meta_annotations[span_key_of_metamap]
        #
        # if not concept.cui.startswith('C'):
        #     print( concept )        
        # 
        if concept.cui + '::' + concept.preferred_name not in stat_of_concepts:
            stat_of_concepts.setdefault( concept.cui + '::' + concept.preferred_name, 1 )
        else:
            stat_of_concepts[ concept.cui + '::' + concept.preferred_name ] += 1
        


In [25]:
# count
len(stat_of_concepts)

602

In [26]:
# stat_of_concepts
occurrences_of_concepts = []

for key in stat_of_concepts:
    count = stat_of_concepts[key]
    occurrences_of_concepts.append( (count, key) )

occurrences_of_concepts.sort(reverse=True)

In [29]:
# occurrences_of_concepts[0:100]
occurrences_of_concepts[0:10]

[(222, 'C0015967::Fever'),
 (117, 'C0015230::Exanthema'),
 (114, 'C0205082::Severe (severity modifier)'),
 (95, 'C0231218::Malaise'),
 (85, 'C0032285::Pneumonia'),
 (85, 'C0030193::Pain'),
 (73, 'C0018681::Headache'),
 (64, 'C0011991::Diarrhea'),
 (58, 'C0010200::Coughing'),
 (52, 'C0042963::Vomiting')]

In [70]:
list_of_excluded_cuis = []

with open("/home/denglizong/SSUMiner/corpus/Excluded_Concepts/excluded_cuis.txt", 'r', encoding='utf-8') as f:
    for line in f.readlines():
        if len( line.strip().split('\t') ) == 2:
            excluded_cui, excluded_name = line.strip().split('\t')
            list_of_excluded_cuis.append( excluded_cui )

In [71]:
len( set(list_of_excluded_cuis) )

344

In [35]:
list_of_excluded_cuis[0:3]

['C5203670', 'C5203340', 'C5203106']

In [36]:
# clean metamap annotation
def clean_metamap_annotations_in_text( raw_metamap_annotations, list_of_excluded_cuis, disorder_tags, anatomy_tags ):
    # 
    span_keys_to_be_excluded = []

    # 
    counts_of_remove = [0]*3

    # sent_index : [ (span_key, concept) ]
    sent_level_annotations = {}

    for span_key in raw_metamap_annotations:
        # 
        concept = raw_metamap_annotations[span_key]

        # rule 1: NN
        if '-noun-' not in concept.trigger:
            span_keys_to_be_excluded.append( span_key )
            counts_of_remove[0] += 1
        else:
            # rule 2：remove concepts without specific meanings
             if concept.cui in list_of_excluded_cuis:
                 span_keys_to_be_excluded.append( span_key )
                 counts_of_remove[1] += 1
        
        #
        if concept.index not in sent_level_annotations:
            sent_level_annotations.setdefault( concept.index, [(span_key, concept)] )
        else:
            sent_level_annotations[concept.index].append( (span_key, concept) )
    
    #
    for sent_idx in sent_level_annotations:
        # observe finding sites
        phenotype_in_sent = False
        findingsite_in_sent = False

        for (span_key, concept) in sent_level_annotations[sent_idx]:
            if concept.semtypes[1:-1] in disorder_tags:
                phenotype_in_sent = True
            elif concept.semtypes[1:-1] in anatomy_tags:
                findingsite_in_sent = True
        
        # 
        if phenotype_in_sent == False and findingsite_in_sent == True:
            for (span_key, concept) in sent_level_annotations[sent_idx]:
                if concept.semtypes[1:-1] in anatomy_tags:
                    if span_key not in span_keys_to_be_excluded:
                        span_keys_to_be_excluded.append( span_key )  
                        counts_of_remove[2] += 1

    # 
    clean_metamap_annotations = {}

    for span_key in raw_metamap_annotations:
        # 
        concept = raw_metamap_annotations[span_key]
        #
        if span_key not in span_keys_to_be_excluded:
            clean_metamap_annotations.setdefault( span_key, concept )
    
    #
    # print( counts_of_remove )
    #
    return clean_metamap_annotations
    

In [51]:
# clean metamap annotation with hpo
def clean_metamap_annotations_with_hpo( raw_metamap_annotations, list_of_excluded_cuis, disorder_tags, anatomy_tags ):
    # 
    span_keys_to_be_excluded = []

    #

    for span_key in raw_metamap_annotations:
        # 
        concept = raw_metamap_annotations[span_key]

        # rule 1：nn
        # if '-noun-' not in concept.trigger:
        #     span_keys_to_be_excluded.append( span_key )
        #     counts_of_remove[0] += 1
        # else:
        #     # rule 2：remove concepts without specific meanings
        #      if concept.cui in list_of_excluded_cuis:
        #          span_keys_to_be_excluded.append( span_key )
        #          counts_of_remove[1] += 1

        # 
        if concept.cui in list_of_excluded_cuis:
            span_keys_to_be_excluded.append( span_key )
        # 

    # 
    clean_metamap_annotations = {}

    for span_key in raw_metamap_annotations:
        # 
        concept = raw_metamap_annotations[span_key]
        #
        if span_key not in span_keys_to_be_excluded:
            clean_metamap_annotations.setdefault( span_key, concept )
    
    #
    # print( counts_of_remove )
    #
    return clean_metamap_annotations
    

In [72]:
# HPO表型异常对应的CUI
list_of_excluded_cuis_in_hpo = []

with open("/home/denglizong/SSUMiner/corpus/Excluded_Concepts/excluded_cuis_in_hpo.txt", 'r', encoding='utf-8') as f:
    for line in f.readlines():
        if line.startswith('C'):
            list_of_excluded_cuis_in_hpo.append( line.strip() )

In [73]:
len(list_of_excluded_cuis_in_hpo)

101

In [74]:
list_of_excluded_cuis_in_hpo[0:3]

['C1708511', 'C0443147', 'C4020899']

In [37]:
# re-calculate precision and recall

In [79]:
# 对于每一份疾病描述文本
# 观察机器所做的预测，并基于人工预测评估机器所作的预测是否正确 
# precision = true prediction/total prediction

# 预测正确的表型实体或部位实体
full_matched_pred_entity_counts    = [0,0]
partial_matched_pred_entity_counts = [0,0]

# 所有预测 
total_pred_entity_counts = [0,0]


# for disease_name in diseases_used_for_training:
for disease_name in diseases_used_for_test:
    # 疾病百科原文
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read() 
    raw_text = raw_text.replace('\n',',') 

    # 获取该疾病文本的人工标注
    # 原来是这里出错了，将 disease_name 设置为了 "Acinetobacter infections"
    file_of_brat_annotation  = os.path.join( corpus_path, disease_name + '.ann' )
    dict_of_brat_annotations = read_brat_annotation_file(file_of_brat_annotation) 

    # 获取该疾病文本的机器标注
    dict_of_meta_annotations = read_metamap_annotation_result( raw_text, dict_of_concepts_in_texts[disease_name] )
    # dict_of_meta_annotations_cleaned = clean_metamap_annotations_in_text( dict_of_meta_annotations, \
    #                                         list_of_excluded_cuis, semantic_group_of_disorder, semantic_group_of_anatomy )
    dict_of_meta_annotations_cleaned = clean_metamap_annotations_with_hpo( dict_of_meta_annotations, \
                                            list_of_excluded_cuis_in_hpo, semantic_group_of_disorder, semantic_group_of_anatomy )    

    # 比较机器标注与人工标注
    for span_key_of_metamap in dict_of_meta_annotations_cleaned:
        # metamap注释的概念
        concept = dict_of_meta_annotations_cleaned[span_key_of_metamap]

        # 是否存在一个人工标注，与metamap的标注完全一致或部分一致 
        # 实体类型一样的前提下，区间标注完全一致或部分一致(重叠)
        strict_match_of_interval = False
        relax_match_of_interval  = False

        # 该区间概念对应的实体类型
        pred_type = ""
        # '[sosy]'
        if concept.semtypes[1:-1] in semantic_group_of_disorder :
            pred_type = 'Phenotype'
        elif concept.semtypes[1:-1] in semantic_group_of_anatomy :
            pred_type = 'FindingSite'
        
        # 统计预测
        if pred_type == 'Phenotype':
            total_pred_entity_counts[0] +=1
        elif pred_type == 'FindingSite':
            total_pred_entity_counts[1] +=1

        # 生成区间；由于不连续实体的存在，可能有多段区间
        list_of_span_intervals_by_metamap = []

        # 
        if ';' in span_key_of_metamap:
            for pos_info in span_key_of_metamap.split(';'):
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                # 生成span区间，便于比较
                list_of_span_intervals_by_metamap.append( Interval(spos, epos) )
        else:
            pos_info = span_key_of_metamap
            #
            spos, epos = pos_info.split(' ')
            spos = int(spos)
            epos = int(epos)
            # 生成span区间，便于比较
            list_of_span_intervals_by_metamap.append( Interval(spos, epos) )     

        # list_of_span_intervals_by_metamap
        # 观察这一区间是否存在于人工标注的区间中 (且实体类型一致)
        for span_key_of_expert in dict_of_brat_annotations:
            # 实体类型
            real_type = dict_of_brat_annotations[span_key_of_expert][0] 

            # span区间化
            list_of_span_intervals_by_expert = []

            # 解析span_key
            if ';' in span_key_of_expert:
                # 这里出错啦 span_key_of_expert mistaked as span_key_of_metamap
                for pos_info in span_key_of_expert.split(';'):
                    spos, epos = pos_info.split(' ')
                    spos = int(spos)
                    epos = int(epos)
                    # 生成span区间，便于比较
                    list_of_span_intervals_by_expert.append( Interval(spos, epos) )
            else:
                pos_info = span_key_of_expert
                #
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                # 生成span区间，便于比较
                list_of_span_intervals_by_expert.append( Interval(spos, epos) )      

            # 在实体类型一致的基础上，进行区间一致性比较 
            if real_type == pred_type:
                # 区间一致性比较
                # 完全一致
                if set( list_of_span_intervals_by_expert ) == set( list_of_span_intervals_by_metamap ):
                    # print("full match", list_of_span_intervals_by_expert, list_of_span_intervals_by_metamap )
                    strict_match_of_interval = True
                    break
                # 如果两者区间不相等
                else:
                    # 但如果存在重叠关系，设定relax_match = True 
                    for _span_interval_by_metamap in list_of_span_intervals_by_metamap:
                        if relax_match_of_interval == True:
                            break
                        else:
                            for _span_interval_by_expert in list_of_span_intervals_by_expert:
                                if _span_interval_by_metamap.overlaps( _span_interval_by_expert ):
                                    relax_match_of_interval = True
                                    break

        # 计数完全一致的预测和部分一致的预测
        # 并根据类型分别记录表型和部位实体预测的一致性
        if strict_match_of_interval == True:
            # print( "full match" +'\t' + span_key_of_metamap +'\t' + concept.trigger + '\t' )
            if pred_type == 'Phenotype':
                full_matched_pred_entity_counts[0] +=1
            elif pred_type == 'FindingSite':
                full_matched_pred_entity_counts[1] +=1      
        elif relax_match_of_interval == True:
            # print( "partial match" +'\t' + span_key_of_metamap +'\t' + concept.trigger + '\t' )
            if pred_type == 'Phenotype':
                partial_matched_pred_entity_counts[0] +=1
            elif pred_type == 'FindingSite':
                partial_matched_pred_entity_counts[1] +=1                                 

In [80]:
# 预测的准确率(冗余)较大，是需要通过规则进行过滤
print(full_matched_pred_entity_counts)
print(partial_matched_pred_entity_counts)
print(total_pred_entity_counts)

[631, 0]
[171, 0]
[1266, 0]


In [81]:
# 统计人工标注的概念能被机器标注所覆盖的情况
# 对于每一份疾病描述文本
# 观察人工标注的概念，并观察人工标注的概念是否能被机器标注所覆盖
# recall = coverred annotations/total annotations

# 预测正确的表型实体或部位实体
full_coverred_real_entity_counts    = [0,0]
partial_coverred_real_entity_counts = [0,0]

# 所有预测 
total_real_entity_counts = [0,0]

# for disease_name in diseases_used_for_training:
for disease_name in diseases_used_for_test:
    # 疾病百科原文
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read()  
    raw_text = raw_text.replace('\n',',')

    # 获取该疾病文本的人工标注
    # 原来是这里出错了，将 disease_name 设置为了 "Acinetobacter infections"
    file_of_brat_annotation  = os.path.join( corpus_path, disease_name + '.ann' )
    dict_of_brat_annotations = read_brat_annotation_file(file_of_brat_annotation) 

    # 获取该疾病文本的机器标注
    dict_of_meta_annotations = read_metamap_annotation_result( raw_text, dict_of_concepts_in_texts[disease_name] )
    # dict_of_meta_annotations_cleaned = clean_metamap_annotations_in_text( dict_of_meta_annotations, \
    #                                         list_of_excluded_cuis, semantic_group_of_disorder, semantic_group_of_anatomy )
    dict_of_meta_annotations_cleaned = clean_metamap_annotations_with_hpo( dict_of_meta_annotations, \
                                            list_of_excluded_cuis_in_hpo, semantic_group_of_disorder, semantic_group_of_anatomy )        

    # 比较机器标注与人工标注
    # 对于一个人工标注
    for span_key_of_expert in dict_of_brat_annotations:
        # 是否存在一个机器标注，能完全或部分的覆盖该人工标注？
        strict_match_of_interval = False
        relax_match_of_interval  = False

        # 实体类型
        real_type = dict_of_brat_annotations[span_key_of_expert][0] 

        # 统计标注
        if real_type == 'Phenotype':
            total_real_entity_counts[0] +=1
        elif real_type == 'FindingSite':
            total_real_entity_counts[1] +=1

        # span区间化
        list_of_span_intervals_by_expert = []

        # 解析span_key
        if ';' in span_key_of_expert:
            # 这里出错啦，span_key_of_expert mistaken for span_key_of_metamap
            for pos_info in span_key_of_expert.split(';'):
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                # 生成span区间，便于比较
                list_of_span_intervals_by_expert.append( Interval(spos, epos) )
        else:
            pos_info = span_key_of_expert
            #
            spos, epos = pos_info.split(' ')
            spos = int(spos)
            epos = int(epos)
            # 生成span区间，便于比较
            list_of_span_intervals_by_expert.append( Interval(spos, epos) )          

        # 看机器标注的区间能否覆盖 list_of_span_intervals_by_expert = []
        for span_key_of_metamap in dict_of_meta_annotations_cleaned:
            # metamap注释的概念
            concept = dict_of_meta_annotations_cleaned[span_key_of_metamap]

            # 该区间概念对应的实体类型
            pred_type = ""
            # '[sosy]'
            if concept.semtypes[1:-1] in semantic_group_of_disorder :
                pred_type = 'Phenotype'
            elif concept.semtypes[1:-1] in semantic_group_of_anatomy :
                pred_type = 'FindingSite'
            

            # 生成区间；由于不连续实体的存在，可能有多段区间
            list_of_span_intervals_by_metamap = []

            # 
            if ';' in span_key_of_metamap:
                for pos_info in span_key_of_metamap.split(';'):
                    spos, epos = pos_info.split(' ')
                    spos = int(spos)
                    epos = int(epos)
                    # 生成span区间，便于比较
                    list_of_span_intervals_by_metamap.append( Interval(spos, epos) )
            else:
                pos_info = span_key_of_metamap
                #
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                # 生成span区间，便于比较
                list_of_span_intervals_by_metamap.append( Interval(spos, epos) )    

            # list_of_span_intervals_by_metamap 
            # 在实体类型一致的基础上，进行区间一致性比较 
            if real_type == pred_type:
                # 区间一致性比较
                # 完全一致
                if set( list_of_span_intervals_by_expert ) == set( list_of_span_intervals_by_metamap ):
                    strict_match_of_interval = True
                    break
                # 如果两者区间不相等
                else:
                    # 但如果存在重叠关系，设定relax_match = True 
                    for _span_interval_by_expert in list_of_span_intervals_by_expert:
                        if relax_match_of_interval == True:
                            break
                        else:
                            for _span_interval_by_metamap in list_of_span_intervals_by_metamap:
                                if _span_interval_by_expert.overlaps( _span_interval_by_metamap ):
                                    relax_match_of_interval = True
                                    break   

        # strict_match_of_interval                           
        # 计数完全一致的预测和部分一致的预测
        # 并根据类型分别记录表型和部位实体预测的一致性
        if strict_match_of_interval == True:
            if real_type == 'Phenotype':
                full_coverred_real_entity_counts[0] +=1
            elif real_type == 'FindingSite':
                full_coverred_real_entity_counts[1] +=1      
        elif relax_match_of_interval == True:
            if real_type == 'Phenotype':
                partial_coverred_real_entity_counts[0] +=1
            elif real_type == 'FindingSite':
                partial_coverred_real_entity_counts[1] +=1                                        

In [82]:
print(full_coverred_real_entity_counts)
print(partial_coverred_real_entity_counts)
print(total_real_entity_counts)


[631, 0]
[182, 0]
[1104, 126]


In [61]:
# 由于在训练集中标注过的术语都算是已知的数据，如果将训练集中的数据加入到MetaMap未识别到的部分 (string-based Method)，观测测试集所能达到的效果

In [83]:
# 统计在训练集中出现过的表型术语

In [92]:
# dict_of_brat_annotations.setdefault( pos_info, ('FindingSite', ent_name, ent_id) )  
# 只收集表型术语
terms_occurred_in_training_set = set()

for disease_name in diseases_used_for_training:
    # 获取该疾病文本的人工标注
    # 原来是这里出错了，将 disease_name 设置为了 "Acinetobacter infections"
    file_of_brat_annotation  = os.path.join( corpus_path, disease_name + '.ann' )
    dict_of_brat_annotations = read_brat_annotation_file(file_of_brat_annotation) 
    #
    for span_key in dict_of_brat_annotations:
        ent_type = dict_of_brat_annotations[span_key][0]
        ent_name = dict_of_brat_annotations[span_key][1]
        if ent_type == 'Phenotype':
            terms_occurred_in_training_set.add( ent_name )

In [93]:
len(terms_occurred_in_training_set)

1658

In [94]:
list(terms_occurred_in_training_set)[0:6]

['diarrhea with blood',
 'Congenital rubella syndrome',
 'collection of fluid in the heart covering',
 'eroding lesions',
 'Bleeding from mucous membranes',
 'disturbances in taste']

In [100]:
# 这是MetaMap的预测
# MetaMap没有覆盖到的人工标注，检查它是否是一个出现过的术语，这一部分也要纳入统计
# 用列表记录MetaMap未覆盖的术语 [修改覆盖度计算代码]
terms_not_covered_by_MetaMap = []

for disease_name in diseases_used_for_test:
    # 疾病百科原文
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read()  
    raw_text = raw_text.replace('\n',',')

    # 获取该疾病文本的人工标注
    # 原来是这里出错了，将 disease_name 设置为了 "Acinetobacter infections"
    file_of_brat_annotation  = os.path.join( corpus_path, disease_name + '.ann' )
    dict_of_brat_annotations = read_brat_annotation_file(file_of_brat_annotation) 

    # 获取该疾病文本的机器标注
    dict_of_meta_annotations = read_metamap_annotation_result( raw_text, dict_of_concepts_in_texts[disease_name] )
    # dict_of_meta_annotations_cleaned = clean_metamap_annotations_in_text( dict_of_meta_annotations, \
    #                                         list_of_excluded_cuis, semantic_group_of_disorder, semantic_group_of_anatomy )
    dict_of_meta_annotations_cleaned = clean_metamap_annotations_with_hpo( dict_of_meta_annotations, \
                                            list_of_excluded_cuis_in_hpo, semantic_group_of_disorder, semantic_group_of_anatomy )        

    # 比较机器标注与人工标注
    # 对于一个人工标注
    for span_key_of_expert in dict_of_brat_annotations:
        # 是否存在一个机器标注，能完全或部分的覆盖该人工标注？
        strict_match_of_interval = False
        relax_match_of_interval  = False

        # 实体类型 ent_type
        real_type = dict_of_brat_annotations[span_key_of_expert][0] 
        # 实体名称 ent_name
        real_name = dict_of_brat_annotations[span_key_of_expert][1] 

        # 统计标注
        if real_type == 'Phenotype':
            total_real_entity_counts[0] +=1
        elif real_type == 'FindingSite':
            total_real_entity_counts[1] +=1

        # span区间化
        list_of_span_intervals_by_expert = []

        # 解析span_key
        if ';' in span_key_of_expert:
            # 这里出错啦，span_key_of_expert mistaken for span_key_of_metamap
            for pos_info in span_key_of_expert.split(';'):
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                # 生成span区间，便于比较
                list_of_span_intervals_by_expert.append( Interval(spos, epos) )
        else:
            pos_info = span_key_of_expert
            #
            spos, epos = pos_info.split(' ')
            spos = int(spos)
            epos = int(epos)
            # 生成span区间，便于比较
            list_of_span_intervals_by_expert.append( Interval(spos, epos) )          

        # 看机器标注的区间能否覆盖 list_of_span_intervals_by_expert = []
        for span_key_of_metamap in dict_of_meta_annotations_cleaned:
            # metamap注释的概念
            concept = dict_of_meta_annotations_cleaned[span_key_of_metamap]

            # 该区间概念对应的实体类型
            pred_type = ""
            # '[sosy]'
            if concept.semtypes[1:-1] in semantic_group_of_disorder :
                pred_type = 'Phenotype'
            elif concept.semtypes[1:-1] in semantic_group_of_anatomy :
                pred_type = 'FindingSite'
            

            # 生成区间；由于不连续实体的存在，可能有多段区间
            list_of_span_intervals_by_metamap = []

            # 
            if ';' in span_key_of_metamap:
                for pos_info in span_key_of_metamap.split(';'):
                    spos, epos = pos_info.split(' ')
                    spos = int(spos)
                    epos = int(epos)
                    # 生成span区间，便于比较
                    list_of_span_intervals_by_metamap.append( Interval(spos, epos) )
            else:
                pos_info = span_key_of_metamap
                #
                spos, epos = pos_info.split(' ')
                spos = int(spos)
                epos = int(epos)
                # 生成span区间，便于比较
                list_of_span_intervals_by_metamap.append( Interval(spos, epos) )    

            # list_of_span_intervals_by_metamap 
            # 在实体类型一致的基础上，进行区间一致性比较 
            if real_type == pred_type:
                # 区间一致性比较
                # 完全一致
                if set( list_of_span_intervals_by_expert ) == set( list_of_span_intervals_by_metamap ):
                    strict_match_of_interval = True
                    break
                # 如果两者区间不相等
                else:
                    # 但如果存在重叠关系，设定relax_match = True 
                    for _span_interval_by_expert in list_of_span_intervals_by_expert:
                        if relax_match_of_interval == True:
                            break
                        else:
                            for _span_interval_by_metamap in list_of_span_intervals_by_metamap:
                                if _span_interval_by_expert.overlaps( _span_interval_by_metamap ):
                                    relax_match_of_interval = True
                                    break   

        # strict_match_of_interval                           
        # 计数完全一致的预测和部分一致的预测
        # 并根据类型分别记录表型和部位实体预测的一致性
        if strict_match_of_interval == True:
            if real_type == 'Phenotype':
                full_coverred_real_entity_counts[0] +=1
            elif real_type == 'FindingSite':
                full_coverred_real_entity_counts[1] +=1      
        elif relax_match_of_interval == True:
            if real_type == 'Phenotype':
                partial_coverred_real_entity_counts[0] +=1
            elif real_type == 'FindingSite':
                partial_coverred_real_entity_counts[1] +=1 

        # 如果该术语没有被MetaMap覆盖，记录这一未被MetaMap覆盖的实体
        if strict_match_of_interval == False and relax_match_of_interval == False:
            # 统计标注
            if real_type == 'Phenotype':
                terms_not_covered_by_MetaMap.append( real_name )
                                

In [101]:
len( terms_not_covered_by_MetaMap )

291

In [102]:
terms_not_covered_by_MetaMap[0:5]

['irritability',
 'muscles weak',
 'muscles paralyzed',
 'changes in mental status',
 'paralytic disease']

In [103]:
# 观测未被覆盖的术语有多少已在训练集中标注过
count = 0
for term in terms_not_covered_by_MetaMap:
    if term in terms_occurred_in_training_set:
        count += 1
count        

97

In [105]:
# 表型属性模板的定义方法
# 给定语料，提取包含有表型的句子，统计表型上下文中属性的出现情况
# 根据属性的出现情况筛选属性

In [106]:
# concepts = dict_of_concepts_in_texts[diseases_name]
# concepts = mm.extract_concepts(text_sents, list(sents_idx)
# concept.index --> text_sents[concept.index]
# concept.pos_info 句子级别的位置 
# finding attribute in the context around the concept
diseases_name = "Actinomycosis"
dict_of_concepts_in_texts[diseases_name]

[ConceptMMI(index='0', mm='MMI', score='3.75', preferred_name='Pneumonia', cui='C0032285', semtypes='[dsyn]', trigger='["Pneumonia"-tx-1-"lungs"-noun-0]', location='TX', pos_info='102/5', tree_codes=''),
 ConceptMMI(index='0', mm='MMI', score='3.55', preferred_name='Abscess', cui='C0000833', semtypes='[dsyn]', trigger='["Abscess"-tx-1-"abscesses"-noun-0]', location='TX', pos_info='78/9', tree_codes=''),
 ConceptMMI(index='0', mm='MMI', score='3.55', preferred_name='Pain', cui='C0030193', semtypes='[sosy]', trigger='["Pain"-tx-1-"painful"-adj-0]', location='TX', pos_info='70/7', tree_codes=''),
 ConceptMMI(index='1', mm='MMI', score='3.68', preferred_name='Abscess', cui='C0000833', semtypes='[dsyn]', trigger='["Abscess"-tx-1-"abscesses"-noun-0]', location='TX', pos_info='19/9', tree_codes=''),
 ConceptMMI(index='2', mm='MMI', score='3.50', preferred_name='Severe (severity modifier)', cui='C0205082', semtypes='[fndg]', trigger='["Severe"-tx-1-"severe"-adj-0]', location='TX', pos_info='4/

In [107]:
from flashtext import KeywordProcessor

In [111]:
# 纳入全部的疾病
sents_with_phenotypes = set()

for disease_name in list_of_diseases:
    # 该疾病文档的句子划分
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read() 
    raw_text = raw_text.replace('\n',',')     

    text_sents = []
    text_sents_with_spans = []
    for start, end in PunktSentenceTokenizer().span_tokenize(raw_text):
        text_sents_with_spans.append( (start,end, raw_text[start:end]) )
        text_sents.append( raw_text[start:end] )


    # 获取该疾病文本的机器标注
    dict_of_meta_annotations = read_metamap_annotation_result( raw_text, dict_of_concepts_in_texts[disease_name] )
    # 只保留其中HPO表型异常的部分
    dict_of_meta_annotations_cleaned = clean_metamap_annotations_with_hpo( dict_of_meta_annotations, \
                                            list_of_excluded_cuis_in_hpo, semantic_group_of_disorder, semantic_group_of_anatomy )    


    # 
    for span_key_of_metamap in dict_of_meta_annotations_cleaned:
        # metamap注释的概念
        concept = dict_of_meta_annotations_cleaned[span_key_of_metamap]
        # 定位该表型概念所在的句子索引
        sent_idx = int( concept.index )
        # 记录该句子
        sents_with_phenotypes.add( text_sents[sent_idx] ) 


In [112]:
# 合计得到1688个句子
len( sents_with_phenotypes )

1688

In [116]:
# list(sents_with_phenotypes)[1]

'Signs and symptoms,Signs and symptoms of PCP include fever, nonproductive cough (because sputum is too viscous to become productive), shortness of breath (especially on exertion), weight loss, and night sweats.'

In [113]:
# 载入属性库

In [118]:
attribute_lib_path = "/home/denglizong/SSUMiner/corpus/AttributeLibrary"
snomed_attributes_file = os.path.join( attribute_lib_path, "snomed_attributes_lib_filted.json" )
hpo_attributes_file    = os.path.join( attribute_lib_path, "hpo_attributes_lib_filted.json" )
icd_attributes_file    = os.path.join( attribute_lib_path, "icd_attributes_lib.json" )
cem_attributes_file    = os.path.join( attribute_lib_path, "cem_attributes_lib.json" )

In [119]:
# 读入属性库
dict_of_snomed_attributes = json.load( open(snomed_attributes_file,'r',encoding='utf-8') )
# dict_of_snomed_attributes
dict_of_hpo_attributes = json.load( open(hpo_attributes_file,'r',encoding='utf-8') )
#
dict_of_icd_attributes = json.load( open(icd_attributes_file,'r',encoding='utf-8') )
#
dict_of_cem_attributes = json.load( open(cem_attributes_file,'r',encoding='utf-8') )

In [121]:
# dict_of_cem_attributes

In [122]:
# 合并属性库
dict_of_attributes = {}
dict_of_attributes.update( dict_of_snomed_attributes )
dict_of_attributes.update( dict_of_hpo_attributes )
dict_of_attributes.update( dict_of_icd_attributes )
dict_of_attributes.update( dict_of_cem_attributes )
len(dict_of_attributes)

334

In [123]:
dict_of_attributes["272141005::Severities"]

['Fatal',
 'Life threatening severity',
 'Mild',
 'Mild to moderate',
 'Moderate',
 'Moderate to severe',
 'Not severe',
 'Severe']

In [125]:
# 属性库中有342个属性，应该挑选哪一些属性及属性槽来配置属性模板呢？ 

In [176]:
# 肯定不纳入属性模板的属性 
excluded_attributes = ['272068007::Negative integer',"272070003::Ordinal number","272150007::Openness",
                        "272126005::Order values","272127001::Event orders","272146000::Uniformities", "57615005::Definite time",
                        "310886004::Absolute times - hours", "314804007::Cardiovascular site descriptor","272147009::Velocities",
                        "303109001::Pathogeneses","314808005::Oral site descriptor","314809002::Urinary site descriptor",
                        "314806009::Respiratory site descriptor","314805008::Digestive site descriptor"]

In [177]:
# 先从属性库中删除这些属性
for key in excluded_attributes:
    if key in dict_of_attributes:
        del dict_of_attributes[key]

len(dict_of_attributes)

319

In [178]:
# "314806009::Respiratory site descriptor" in dict_of_attributes

In [203]:
# 机器初筛：在至少两个不同的句子中出现，且出现的属性取值是不相同的。

# 符合条件的候选属性个数
count_of_candidate_attributes = 0

# 符合条件的候选属性信息 list of tmpdict
info_of_candidate_attributes = []


for name_of_attribute in dict_of_attributes:
    #
    if name_of_attribute in excluded_attributes:
        continue

    # values 
    values_of_attribute = dict_of_attributes[name_of_attribute]
    # 如果属性的取值列表过长(超过30个), 跳过
    if len(values_of_attribute) >=25:
        continue

    # creat a keyword_processor for this attribute
    keyword_processor = KeywordProcessor()

    # keyword_processor.add_keyword(<unclean name/替换起点>, <standardised name/替换终点>)
    for value in values_of_attribute:
        # severe --> 【severe】
        keyword_processor.add_keyword( value, '【' + value + '】' ) 

    # occurences of values in sentences
    # 1. 统计该属性出现在了多少句子中 (general)
    occurence_of_attribute_in_sents = 0
    # 2. 统计该属性的取值的出现次数 (在一个句子中出现多次仅算一次，以便与属性的统计统一)
    occurence_of_values_in_sents = {}
    for value in values_of_attribute:
        occurence_of_values_in_sents.setdefault( value.lower() , 0)
    # 3. 记录n(10)个包含该属性某一取值的句子，记录该句子包含的该属性的1个取值
    sentences_containing_attribute = []
    # 注意sent不是一个string
    for sent in sents_with_phenotypes:
        # 搜索句子中出现的属性取值
        # keyword_processor.add_keyword('is', 'IS')
        # keyword_processor.extract_keywords('This is a problem')
        # ['IS']
        # keyword_processor.replace_keywords('This is a problem')
        # 'This IS a problem'
        hits = keyword_processor.extract_keywords(sent)
        if len(hits) > 0:
            # 如果搜索到了属性取值，那么该属性的出现次数+1
            occurence_of_attribute_in_sents += 1
            # 记录出现了该属性的句子，及其中出现的属性取值
            if len(sentences_containing_attribute) <= 10:
                # sentences_containing_attribute.setdefault( hits[0], sent )
                # sent marked up with value
                marked_sent = keyword_processor.replace_keywords(sent)
                sentences_containing_attribute.append(marked_sent)
            # 如果某一取值存在，统计数+1
            # hits ['【Early stage】']  hit[1:-1]
            lower_hits = [hit[1:-1].lower() for hit in hits]
            for occurred_value in set(lower_hits):
                if occurred_value in occurence_of_values_in_sents:
                    occurence_of_values_in_sents[occurred_value] += 1
    # 属性筛选
    # 至少出现在两个句子中；至少有两个取值出现。
    # 为了避免假阳性/偶然性，一个有效的取值至少应该出现在2个句子中
    # 一个属性至少具有2个这样的取值
    keep_flag = False
    number_of_qualified_value = 0
    for occurred_value in occurence_of_values_in_sents:
        # 为了避免假阳性/偶然性，一个有效的取值至少应该出现在2个句子中
        if occurence_of_values_in_sents[occurred_value] >= 2:
            number_of_qualified_value += 1
    # 如果一个属性至少具有2个这样的取值，那这一属性存在的可能性非常大了
    if number_of_qualified_value >=2 :
        keep_flag = True

    # output observing results
    if keep_flag:
        # print("候选属性名称: ", name_of_attribute)
        # print("该属性在句子中的出现次数：", occurence_of_attribute_in_sents)
        # print("该属性的取值在语料中的分布：", occurence_of_values_in_sents)
        # print("包含该属性的句子证据:")
        # for index, sent in enumerate(sentences_containing_attribute) :
        #     print(index, sent)
        # break
        count_of_candidate_attributes += 1
        # info of candidate attribute
        tmpdict = {}
        tmpdict.setdefault("name", name_of_attribute )
        tmpdict.setdefault("occurrences", occurence_of_attribute_in_sents )
        # 取值分布按出现次数排列 (精简下显示模式) [ ("covered",5), ("unaided",2)]
        distribution_of_values_sorted = sorted(occurence_of_values_in_sents.items(), key=lambda item:item[1], reverse=True)
        number_of_occurred_values = 0
        dict_of_distribution_of_values = {}
        for _value, _count in distribution_of_values_sorted:
            dict_of_distribution_of_values.setdefault(_value, _count)
            if _count >= 1:
                number_of_occurred_values +=1
        tmpdict.setdefault("distributions", dict_of_distribution_of_values )
        # 在语料中出现过的属性取值占所有取值数目的百分比
        percentage_of_occurred_values = round(number_of_occurred_values/len(distribution_of_values_sorted), 2)
        tmpdict.setdefault("percentage", percentage_of_occurred_values )
        # evidence of sentences
        tmpdict.setdefault("evidences", sentences_containing_attribute )
        #
        info_of_candidate_attributes.append( tmpdict )
        
print(count_of_candidate_attributes)


37


In [204]:
info_of_candidate_attributes[0]

{'name': '106240007::General clinical stage for disease AND/OR neoplasm',
 'occurrences': 5,
 'distributions': {'early stage': 3,
  'late stage': 2,
  'end-stage': 0,
  'histologic grading differentiation and/or behavior': 0,
  'midstage': 0,
  'stage level 1': 0,
  'stage level 2': 0,
  'stage level 3': 0,
  'stage level 4': 0,
  'stage level 5': 0},
 'percentage': 0.2,
 'evidences': ['[20],,Early complications,Additional problems may occur in the 【Early stage】 of the illness.',
  'Patients in the 【Late stage】 had significant thrombocytopenia, and deficiency of coagulation factors was less severe than in the early form.',
  'Very low blood pressure may occur at an 【Early stage】, especially but not exclusively in meningococcal meningitis; this may lead to insufficient blood supply to other organs.',
  '[20],,,,Early complications,Additional problems may occur in the 【Early stage】 of the illness.',
  'Most people with the 【Late stage】 form died within 8 to 12 days of illness.']}

In [205]:
# 根据属性的出现次数进行排序
sorted_occurrence_of_attributes = []
for infodict in info_of_candidate_attributes:
    sorted_occurrence_of_attributes.append( (infodict["occurrences"], infodict["name"]) )

sorted_occurrence_of_attributes.sort(reverse=True)
sorted_occurrence_of_attributes[0:5]


[(189, 'HP:0011008::Temporal pattern'),
 (164, '272141005::Severities'),
 (158, 'ICD::Course'),
 (152, 'ICD::Mild Moderate Severe Scale Value'),
 (152, 'HP:0012824::Severity')]

In [206]:
# 根据occurence排序后的结果文件
sorted_info_of_candidate_attributes = []

for (count, name_of_attribute) in sorted_occurrence_of_attributes:
    # 
    for infodict in info_of_candidate_attributes:
        if infodict["name"] == name_of_attribute:
            sorted_info_of_candidate_attributes.append( infodict )
    

In [207]:
os.getcwd()

'/home/denglizong/SSUMiner'

In [208]:
# 保存 info_of_candidate_attributes 看看
file_of_candidate_attributes = os.path.join( attribute_lib_path, 'results_of_candidate_attributes.json' )

json.dump(sorted_info_of_candidate_attributes,
          open(file_of_candidate_attributes,'w', encoding='utf-8'), ensure_ascii=False, indent=2 )