In [1]:
# 构建、测试并调试基于特征工程和机器学习的实体属性预测模型
# 使用SVM多分类模型
# 特征工程使用 1. 触发词是否出现；2. 触发词相对位置； 3. 是否有语义分割 三类特征

In [8]:
# 改进方向: 小样本复制一下上采样 (表征基本上是一样的) 
# 完善触发词规则，将数量型触发信号加入进来看看

In [3]:
import os
import re
import json
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk import pos_tag
from nltk.util import ngrams

In [5]:
from flashtext import KeywordProcessor
from interval import Interval
from collections import Counter
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier

In [9]:
# 词形还原器
wordnet_lemmatizer = WordNetLemmatizer()

In [10]:
# 相关数据文件路径
data_dir = "/home/denglizong/SSUMiner/corpus/"

In [11]:
# 疾病描述文本文件路径
corpus_path = '/home/denglizong/SSUMiner/corpus/WikiPediaR5'
# os.listdir( corpus_path )[0:3]

In [12]:
# 疾病名称列表
list_of_diseases = [ filename.replace('.txt',"") 
                    for filename in os.listdir( corpus_path ) if filename.endswith(".txt")]
#
list_of_diseases[0:3]                    

['Acinetobacter infections', 'Kawasaki disease', 'Paragonimiasis']

In [13]:
# 读入训练集和测试集的疾病名称索引
diseases_used_for_training = []
diseases_used_for_test = []

# 用于训练模型或调优规则的疾病文档
file_of_diseases_for_training = os.path.join( data_dir+"TrainTestSplit", "diseases_for_training.txt" )
with open( file_of_diseases_for_training, 'r', encoding='utf-8' ) as f:
    for line in f.readlines():
        if line.strip() != "":
            diseases_used_for_training.append( line.strip()  )

# 用于测试模型或测试规则的疾病文档
file_of_diseases_for_test = os.path.join( data_dir+"TrainTestSplit", "diseases_for_test.txt" )
with open( file_of_diseases_for_test, 'r', encoding='utf-8' ) as f:
    for line in f.readlines():
        if line.strip() != "":
            diseases_used_for_test.append( line.strip()  )

#
len( diseases_used_for_training ), len( diseases_used_for_test )            

(133, 60)

In [14]:
# 读取brat标注文件的函数
# 获取brat对应的标注文件中的实体span,名称及类型
# dict_of_brat_annotations.setdefault( ent_id, ('FindingSite', ent_name, pos_info) )    
# 将ent_id 和 pos_info的位置置换一下   
def read_brat_annotation_file( file_of_brat_annotation ):
    # 求解目标
    dict_of_brat_annotations = {}
    # 原文字符串
    text_string = ""
    with open(file_of_brat_annotation,'r',encoding='utf-8') as f:
        text_string = f.read()    
    # 注释文件内容
    ann_lines = []
    with open(file_of_brat_annotation,'r',encoding='utf-8') as f:
        ann_lines = f.readlines()
    # 解析注释文件中的内容
    for line in ann_lines:
        # 
        if line.startswith('T'): 
            # 表型实体
            if re.search('Phenotype',line,re.I):
                # T1	Phenotype 69 86	painful abscesses
                ent_id, ent_info, ent_name = line.strip().split('\t')
                # 位置信息 153 157;180 194
                pos_info = ent_info.replace('Phenotype ','')
                # 
                # dict_of_brat_annotations.setdefault( ent_id, ('Phenotype', ent_name, pos_info) )   
                dict_of_brat_annotations.setdefault( pos_info, ('Phenotype', ent_name, ent_id) )
            # 部位实体
            elif re.search('FindingSite',line,re.I):
                # T4	FindingSite 114 120	breast
                ent_id, ent_info, ent_name = line.strip().split('\t')
                # 位置信息 114 120
                pos_info = ent_info.replace('FindingSite ','')   
                #
                # dict_of_brat_annotations.setdefault( ent_id, ('FindingSite', ent_name, pos_info) )   
                dict_of_brat_annotations.setdefault( pos_info, ('FindingSite', ent_name, ent_id) )     
    #
    return dict_of_brat_annotations   
    

In [15]:
# 获取通过brat标注产生的表型术语和部位术语 (需要移除的对象)(避免干扰)
# dict_of_brat_annotations.setdefault( pos_info, ('Phenotype', ent_name, ent_id) )
# dict_of_brat_annotations.setdefault( pos_info, ('FindingSite', ent_name, ent_id) )

# expert_annotated_terms = set()
expert_annotated_terms = {}

for disease_name in diseases_used_for_training:
    # 疾病百科原文
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read() 
    # 莫非是 raw_text 没有替换'\n'的问题？(因为标注的时候有替换)
    # 还真的是这个问题
    raw_text = raw_text.replace('\n',',')

    # 获取该疾病文本的人工标注
    # 原来是这里出错了，将 disease_name 设置为了 "Acinetobacter infections"
    file_of_brat_annotation  = os.path.join( corpus_path, disease_name + '.ann' )
    dict_of_brat_annotations = read_brat_annotation_file(file_of_brat_annotation) 

    # 
    for span_key in dict_of_brat_annotations:
        # expert_annotated_terms.add( dict_of_brat_annotations[span_key][1] )
        annotated_term = dict_of_brat_annotations[span_key][1]
        annotated_type = dict_of_brat_annotations[span_key][0]
        #
        expert_annotated_terms.setdefault( annotated_term, annotated_type )


In [16]:
# 属性名称列表
# list_of_attributes = ['Assertion','Severity','Temporal','Sensation','Color','Magnitudes','Shape','Age','Gender',
#                      'Frequency','Relevance','Laterality','Quadrant','Distribution',
#                       'Stage','Phase','Type','SOI','Complication']
# 待预测的属性名称
# list_of_attributes = ['Assertion','Severity','Temporal','Sensation','Color','Age','Gender',
#                      'Frequency','Laterality','Quadrant','Distribution','SOI']       
# 不预测 'Laterality','Quadrant'                                 
list_of_attributes = ['Assertion','Severity','Temporal','Sensation','Color','Age','Gender',
                     'Frequency','Distribution','SOI']  

In [17]:
# 观察 某一具体属性(wanted_attr_name)在疾病文档(disease_names)中取值分布的函数
# {value 1: count 1, value 2: count 2}
# occurences_of_attribute_values = {}
def stat_occurences_of_attribute_values( wanted_attr_name, disease_names ):
    # 
    dict_of_attribute_value_distribution = {}

    #
    for disease_name in disease_names:
        # 这一疾病对应的文本文件和标注文件
        # textfilepath = os.path.join( filepath, disease_names+'.txt')
        annfilepath  = os.path.join( corpus_path, disease_name+'.ann')
        
        # 读入标注文件
        with open(annfilepath,'r',encoding='utf-8') as f:
            # 载入标注文件的行
            lines = f.readlines()     
            # 找到属性行
            for line in lines:
                # 解析属性行，
                if line.startswith('A'):
                    # 解析属性行中的属性名称
                    # A2	Assertion T6 Possible
                    info_a, info_b = line.strip().split('\t')
                    att_name, ent_id, att_value = info_b.split(' ')                     
                    # 如果是需要统计的属性
                    if att_name == wanted_attr_name:
                        # 统计属性值
                        if att_value not in dict_of_attribute_value_distribution:
                            dict_of_attribute_value_distribution.setdefault(att_value, 1)
                        else:
                            dict_of_attribute_value_distribution[att_value] +=1
    #
    return dict_of_attribute_value_distribution                                

In [18]:
# 
# 目标变量: annotated_phenotypes_info_of_diseases.setdefault(disease_name, info_of_annotated_phenotypes )
# list of info_of_annotated_phenotypes 以表型的id为key,记录表型的标注信息
# info_of_annotated_phenotypes.setdefault(pheno_id, tmpdict)
# 以表型为单位，包含以下信息记录到tmpdict
# tmpdict.setdefault("pheno_name",pheno_name)
# tmpdict.setdefault("pheno_pos",pheno_pos)
# tmpdict.setdefault("associated_sites", ';'.join(associated_finding_sites) )
# tmpdict.setdefault("associated_attributes", ';'.join(associated_attributes) )      
annotated_phenotypes_info_of_diseases = {}

for disease_name in list_of_diseases:
    # 疾病百科原文
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read() 
    # 莫非是 raw_text 没有替换'\n'的问题？(因为标注的时候有替换)
    # 还真的是这个问题
    raw_text = raw_text.replace('\n',' ')

    # 获取该疾病文本的人工标注
    # 原来是这里出错了，将 disease_name 设置为了 "Acinetobacter infections"
    file_of_brat_annotation  = os.path.join( corpus_path, disease_name + '.ann' )

    # 解析其中标注的表型信息
    info_of_annotated_phenotypes = {}  

    with open(file_of_brat_annotation,'r',encoding='utf-8') as f:

        # 载入标注文件的行
        lines = f.readlines()
        # 搜索标注的表型实体
        # 获取该文本中出现过的表型实体编号及其名称   ent_id, ent_name 
        dict_of_phenotypic_id_and_name = {}
        # 获取该文本中出现过的部位实体编号及其名称   ent_id, ent_name 
        dict_of_findingsite_id_and_name = {}
        # 记录表型实体与部位实体之间的关联          ent1_id, [ent2_id] 可能会关联多个部位
        dict_of_phenotypes_related_sites = {}   
        # 记录表型实体具有的属性信息                ent_id, [(att_name,att_value)] 可能会关联多个属性
        dict_of_phenotypes_related_attributes = {}         
        #
        for line in lines:
            # 表型实体，不包括部位实体
            if line.startswith('T'): 
                if re.search('Phenotype',line,re.I):
                    # T1	Phenotype 69 86	painful abscesses
                    ent_id, ent_info, ent_name = line.strip().split('\t')
                    # 位置信息 153 157;180 194
                    ent_pos = ent_info.replace('Phenotype ','')
                    # 
                    dict_of_phenotypic_id_and_name.setdefault( ent_id, (ent_name,ent_pos) )   
                elif re.search('FindingSite',line,re.I):
                    # T4	FindingSite 114 120	breast
                    ent_id, ent_info, ent_name = line.strip().split('\t')
                    dict_of_findingsite_id_and_name.setdefault( ent_id, ent_name )                
            # 找到关系 ，记录标注有部位的表型
            elif line.startswith('R'):
                if re.search('locate',line,re.I):
                    # R1	locate Arg1:T1 Arg2:T2	
                    rel_id, rel_info, tmp_str = line.split('\t')
                    # locate Arg1:T1 Arg2:T2
                    part_a, part_b, part_c = rel_info.split(' ')
                    # 
                    ent1_id = part_b.split(':')[1]
                    ent2_id = part_c.split(':')[1]
                    #
                    if ent1_id not in dict_of_phenotypes_related_sites:
                        dict_of_phenotypes_related_sites.setdefault( ent1_id, [ent2_id] )     
                    else:
                        dict_of_phenotypes_related_sites[ent1_id].append( ent2_id )
            # 属性行，关联到对应的表型实体
            elif line.startswith('A'):
                # A2	Assertion T6 Possible
                info_a, info_b = line.strip().split('\t')
                att_name, ent_id, att_value = info_b.split(' ') 
                # 除外'AgreeAnn','EqualRaw'属性
                if att_name not in ['AgreeAnn','EqualRaw']:
                    # 表型id, 属性名称，属性值
                    if ent_id not in dict_of_phenotypes_related_attributes:
                        dict_of_phenotypes_related_attributes.setdefault( ent_id, [(att_name,att_value)]  )
                    else:
                        dict_of_phenotypes_related_attributes[ent_id].append( (att_name,att_value) ) 
        #
        for pheno_id in dict_of_phenotypic_id_and_name:
            # 文档名称
            doc_name = disease_name
            # 表型名称  (ent_name,end_start,end_end)
            pheno_name = dict_of_phenotypic_id_and_name[pheno_id][0]
            # 表型位置
            pheno_pos = dict_of_phenotypic_id_and_name[pheno_id][1]
            # 关联部位名称 (分号隔开)
            associated_finding_sites = []
            if pheno_id in dict_of_phenotypes_related_sites:
                site_ids = dict_of_phenotypes_related_sites[pheno_id]
                for site_id in site_ids:
                    if site_id in dict_of_findingsite_id_and_name:
                        associated_finding_sites.append( dict_of_findingsite_id_and_name[site_id] )
            # 关联属性
            associated_attributes = []
            if pheno_id in dict_of_phenotypes_related_attributes:
                list_of_attrs = dict_of_phenotypes_related_attributes[pheno_id]
                for (_attr, _value) in list_of_attrs:
                    associated_attributes.append( _attr+":"+_value) 
            # 记录
            tmpdict = {}
            tmpdict.setdefault("pheno_name",pheno_name)
            tmpdict.setdefault("pheno_pos",pheno_pos)
            tmpdict.setdefault("associated_sites", ';'.join(associated_finding_sites) )
            tmpdict.setdefault("associated_attributes", ';'.join(associated_attributes) )                       
            # 
            info_of_annotated_phenotypes.setdefault(pheno_id, tmpdict)
            annotated_phenotypes_info_of_diseases.setdefault(disease_name, info_of_annotated_phenotypes )  

In [19]:
# annotated_phenotypes_info_of_diseases = {}
# annotated_phenotypes_info_of_diseases.setdefault(disease_name, info_of_annotated_phenotypes )  
# 类似S5A_基于规则预测实体属性，在其内增加一个算法预测的属性模块

In [20]:
# 给定一份疾病文档，输出专家标注的表型信息
annotated_phenotypes_info_of_diseases['Actinomycosis']

{'T1': {'pheno_name': 'painful abscesses',
  'pheno_pos': '69 86',
  'associated_sites': 'mouth;lungs;breast;gastrointestinal tract',
  'associated_attributes': 'Assertion:Present;Relevance:Distinctive_finding'},
 'T6': {'pheno_name': 'pus',
  'pheno_pos': '366 369',
  'associated_sites': 'skin',
  'associated_attributes': 'Assertion:Possible;SOI:Severe_problem'},
 'T8': {'pheno_name': 'sulfur granules',
  'pheno_pos': '417 432',
  'associated_sites': '',
  'associated_attributes': 'Assertion:Possible;Frequency:Frequent;Relevance:Distinctive_finding;SOI:Severe_problem'}}

In [21]:
# 进一步的，对于每一个标注的表型，记录表型所在的句子文本信息
# 将之增添到 annotated_phenotypes_info_of_diseases 变量中
for disease_name in list_of_diseases:
    # for the disease_name 

    # 疾病百科原文
    file_of_original_text  = os.path.join( corpus_path, disease_name + '.txt' )
    raw_text = open( file_of_original_text, 'r', encoding='utf-8').read() 
    # 莫非是 raw_text 没有替换'\n'的问题？(因为标注的时候有替换)
    # 还真的是这个问题 
    # 在S1A 使用MetaMap注释语料的过程中, 替换其中的'\n'为','
    raw_text = raw_text.replace('\n',',')

    # 读入文本文件，获取句子信息
    # text_sents = []
    for start, end in PunktSentenceTokenizer().span_tokenize(raw_text):
        # text_sents.append( (start,end, raw_text[start:end]) ) 
        sent_text = raw_text[start:end]
        
        # 遍历标注的表型
        for pheno_id in annotated_phenotypes_info_of_diseases[disease_name]:
            # 解析表型的起止位置
            pheno_info = annotated_phenotypes_info_of_diseases[disease_name][pheno_id]
            pheno_spos = int( pheno_info['pheno_pos'].split(' ')[0] )
            pheno_epos = int( pheno_info['pheno_pos'].split(' ')[-1] )

            # 进一步定位表型所在的句子,
            if pheno_spos >= start and pheno_epos<= end:  
                # 记录表型所在的句子
                if 'pheno_context' not in annotated_phenotypes_info_of_diseases[disease_name][pheno_id]:
                    annotated_phenotypes_info_of_diseases[disease_name][pheno_id].setdefault( 'pheno_context', sent_text )
                else:
                    annotated_phenotypes_info_of_diseases[disease_name][pheno_id]['pheno_context'] = sent_text
                # 记录表型在句子中的起始位置
                pheno_spos_in_sent = pheno_spos - start
                pheno_epos_in_sent = pheno_epos - start
                pos_info = (pheno_spos_in_sent, pheno_epos_in_sent)
                if pos_info not in annotated_phenotypes_info_of_diseases[disease_name][pheno_id]:
                    annotated_phenotypes_info_of_diseases[disease_name][pheno_id].setdefault( 'pheno_pos_in_sent', pos_info )

In [22]:
# 再看看现在记录到变量中的信息
annotated_phenotypes_info_of_diseases['Actinomycosis']

{'T1': {'pheno_name': 'painful abscesses',
  'pheno_pos': '69 86',
  'associated_sites': 'mouth;lungs;breast;gastrointestinal tract',
  'associated_attributes': 'Assertion:Present;Relevance:Distinctive_finding',
  'pheno_context': 'Signs and symptoms,,The disease is characterised by the formation of painful abscesses in the mouth, lungs,[3][4] breast,[5] or gastrointestinal tract.',
  'pheno_pos_in_sent': (69, 86)},
 'T6': {'pheno_name': 'pus',
  'pheno_pos': '366 369',
  'associated_sites': 'skin',
  'associated_attributes': 'Assertion:Possible;SOI:Severe_problem',
  'pheno_context': 'In severe cases, they may penetrate the surrounding bone and muscle to the skin, where they break open and leak large amounts of pus, which often contains characteristic granules (sulfur granules) filled with progeny bacteria.',
  'pheno_pos_in_sent': (129, 132)},
 'T8': {'pheno_name': 'sulfur granules',
  'pheno_pos': '417 432',
  'associated_sites': '',
  'associated_attributes': 'Assertion:Possible;Fr

In [23]:
# 给定 属性:取值(wanted_attribute), 并指定疾病文档(wanted_diseases), 输出这些疾病文档中包含有指定属性取值的表型信息
def get_phenotypes_with_wanted_attribute_value( annotated_phenotypes_info_of_diseases, wanted_diseases, wanted_attribute_value ):
    #
    list_of_phenotypes_with_wanted_attribute_value = []
    list_of_phenotypes_without_wanted_attribute_value = []

    # 遍历标注的表型
    for disease_name in wanted_diseases:
        for pheno_id in annotated_phenotypes_info_of_diseases[disease_name]:
            # infodict of annotated phenotypes
            pheno_info = annotated_phenotypes_info_of_diseases[disease_name][pheno_id]  

            if "pheno_context" not in pheno_info:
                continue

            # 观察是否该表型是否具有指定的属性值
            pheno_values = pheno_info["associated_attributes"].split(';')
            # Severity:Severe
            if wanted_attribute_value in pheno_values:
                list_of_phenotypes_with_wanted_attribute_value.append( pheno_info )
            else:
                list_of_phenotypes_without_wanted_attribute_value.append( pheno_info )
    
    #
    return list_of_phenotypes_with_wanted_attribute_value

In [24]:
# 给定 属性 (wanted_attribute), 并指定疾病文档(wanted_diseases), 输出这些疾病文档中没有选择该属性取值的表型信息
def get_phenotypes_without_wanted_attribute( annotated_phenotypes_info_of_diseases, wanted_diseases, wanted_attribute ):
    #
    list_of_phenotypes_with_wanted_attribute = []
    list_of_phenotypes_without_wanted_attribute = []

    # 遍历标注的表型
    for disease_name in wanted_diseases:
        for pheno_id in annotated_phenotypes_info_of_diseases[disease_name]:
            # infodict of annotated phenotypes
            pheno_info = annotated_phenotypes_info_of_diseases[disease_name][pheno_id]  

            if 'pheno_context' not in pheno_info:
                continue

            # 观察是否该表型是否具有指定的属性值
            # "Assertion:Present;Severity:Severe" 
            pheno_values = pheno_info["associated_attributes"]
            # "Severity:" in "Assertion:Present;Severity:Severe" 
            if wanted_attribute+':' in pheno_values:
                list_of_phenotypes_with_wanted_attribute.append( pheno_info )
            else:
                list_of_phenotypes_without_wanted_attribute.append( pheno_info )
    
    #
    return list_of_phenotypes_without_wanted_attribute

In [25]:
# 读入实体属性标准取值的触发词表知识库
trigger_kb_file = "/home/denglizong/SSUMiner/corpus/TriggerWords/attribute_triggers.xlsx"

In [26]:
df_of_attribute_triggers = pd.read_excel(trigger_kb_file, dtype=str).fillna('0')
df_of_attribute_triggers.head()

Unnamed: 0,属性中文名称,属性英文名称,标准取值,样本数,训练集,测试集,触发词,触发范围
0,表型存在情况,Assertion,Present,1229,862,367,0,long
1,表型存在情况,Assertion,Possible,2692,1976,716,may;might;can;could;possible;possibly;likely;i...,long
2,表型存在情况,Assertion,Conditional,76,62,14,if;while;when;in case of;without treatment,long
3,表型存在情况,Assertion,Hypothetical,4,4,0,0,long
4,表型存在情况,Assertion,Absent,20,13,7,no;without;never developed;never had;no compla...,short


In [27]:
df_of_attribute_triggers[ df_of_attribute_triggers['属性英文名称'] == 'Severity' ]

Unnamed: 0,属性中文名称,属性英文名称,标准取值,样本数,训练集,测试集,触发词,触发范围
5,表型严重程度,Severity,Mild,52,31,21,mild;low-grade;low grade;subtle;mildly;a littl...,short
6,表型严重程度,Severity,Mild_to_Moderate,2,2,0,mild to moderate;mild or moderate,short
7,表型严重程度,Severity,Moderate_to_Severe,9,8,1,moderate or very high;moderate to severe,short
8,表型严重程度,Severity,Severe,128,89,39,severe;high;extreme;intense;profound;strong;si...,short
9,表型严重程度,Severity,Life_threatening_severity,6,5,1,life-threatening;dangerously;dangerous,short


In [28]:
# svm多分类示例

In [29]:
X = np.array([
                [10, 10],
                [8, 10],
                [-5, 5.5],
                [-5.4, 5.5],
                [-20, -20],
                [-15, -20]
            ])

In [30]:
y = np.array([0, 0, 1, 1, 2, 2])

In [31]:
clf = OneVsRestClassifier(SVC()).fit(X, y)

In [32]:
clf.predict([[-19, -20], [9, 9], [-5, 5]])

array([2, 0, 1])

In [33]:
# X [ [表征]     ]
# y [ [标签]     ]

In [34]:
# 构建实体属性值预测的多分类模型demo

In [35]:
# 第一阶段 求解目标: 
# trained classifier for each of attributes
# dict_of_trained_classifiers = {}
# dict_of_trained_classifiers.sefdefault( attribute_name, clf )

In [36]:
dict_of_trained_classifiers = {}

In [37]:
dict_of_trained_classifiers

{}

In [38]:
# 数据量不够 Laterality; Quadrant;
print(list_of_attributes)

['Assertion', 'Severity', 'Temporal', 'Sensation', 'Color', 'Age', 'Gender', 'Frequency', 'Distribution', 'SOI']


In [299]:
# 回到这里
# 目标属性名称
target_attribute = 'SOI'

In [300]:
# 对目标属性的取值进行数字编码
codes_of_attribute_values = {}

# default的属性取值设置为0
if target_attribute == 'Assertion':
    codes_of_attribute_values.setdefault( "Present", 0 )
else:
    codes_of_attribute_values.setdefault( "None", 0 )  


# 观察目标属性在语料集中的出现情况
# 观察目标属性在训练集中的出现情况
present_of_attribute_values = stat_occurences_of_attribute_values(target_attribute, diseases_used_for_training)
# 该属性在训练集中出现的类别数 len( present_of_attribute_values )

# 对出现过的目标属性取值进行数字编码
for attribute_value in present_of_attribute_values:
    if attribute_value != 'Present':
        # 赋予编号
        # 只有当该属性值具有的样本数大于类别数时才考虑
        count_of_attribute_value = present_of_attribute_values[ attribute_value ] 
        # 有样本即可
        if count_of_attribute_value >= 0 :
            code_of_attribute_value = len( codes_of_attribute_values )
            codes_of_attribute_values.setdefault( attribute_value , code_of_attribute_value )  

# 符合样本数条件的属性类
print(codes_of_attribute_values)

{'None': 0, 'Severe_problem': 1, 'Mild_problem': 2}


In [301]:
# 提取训练集中具有目标属性取值的表型信息
# {"Severity:Severe":[], }
# phenotypes_with_target_attribute.setdefault( attribute_value, list_of_phenoinfo )
phenotypes_with_target_attribute = {}

# 对于在语料中出现过的属性取值 [只需要考虑符合样本数条件的属性类]
for attribute_value in codes_of_attribute_values:
    if attribute_value == 'None':
        continue
    # 获取它们所依附的表型信息
    list_of_phenoinfo = get_phenotypes_with_wanted_attribute_value(annotated_phenotypes_info_of_diseases, 
                        diseases_used_for_training, target_attribute+ ':'+ attribute_value)
    # 保存
    phenotypes_with_target_attribute.setdefault( attribute_value, list_of_phenoinfo )
    

# 如果是可选属性(Assertion之外的属性)，还要考虑这些属性有'None'的取值
if target_attribute != 'Assertion':
    # 获取不具有该属性的表型标注
    list_of_phenoinfo = get_phenotypes_without_wanted_attribute(annotated_phenotypes_info_of_diseases, 
                                                                    diseases_used_for_training, target_attribute)
    # 保存
    phenotypes_with_target_attribute.setdefault( "None", list_of_phenoinfo )  

In [302]:
for attribute_value in phenotypes_with_target_attribute:
    print( attribute_value, len( phenotypes_with_target_attribute[attribute_value] ) ) 

Severe_problem 52
Mild_problem 9
None 2846


In [303]:
# 阴性数据和阳性数据的确非常不平衡，可以考虑对阴性数据进行下采样 [使得阴阳数据平衡]
# 阳性数据数目
count_of_positive_samples = 0

for attribute_value in phenotypes_with_target_attribute:
    if attribute_value != 'None':
        count_of_positive_samples += len( phenotypes_with_target_attribute[attribute_value] )

# print( count_of_positive_samples, len(phenotypes_with_target_attribute['None']) )

# 如果阴性数据的数目大于了阳性数据
# 对None阴性数据进行下采样 (随机下采样即可，因为它们的表征在理论上，都应该是0 0 0)
if 'None' in phenotypes_with_target_attribute:
    resampled_negative_samples = []
    resampled_negative_samples = random.sample( phenotypes_with_target_attribute['None'], count_of_positive_samples )
    # 更新 phenotypes_with_target_attribute 中的 None 样本列表
    phenotypes_with_target_attribute['None'] = resampled_negative_samples    

In [304]:
for attribute_value in phenotypes_with_target_attribute:
    print( attribute_value, len( phenotypes_with_target_attribute[attribute_value] ) ) 

Severe_problem 52
Mild_problem 9
None 61


In [305]:
# 获取该属性不同属性取值的触发词
# 不考虑default的属性取值
triggers_of_attribute_values = {}

for index, row in df_of_attribute_triggers[ df_of_attribute_triggers['属性英文名称'] == target_attribute ].iterrows():
    attribute_name = row['属性英文名称']
    attribute_value = row['标准取值']
    attribute_triggers = row['触发词']

    # 跳过必选属性的默认取值Present
    if attribute_value == 'Present':
        continue

    # 写入属性取值-触发词变量
    if attribute_triggers != '0':
        triggers_of_attribute_values.setdefault( attribute_value, attribute_triggers.split(';') )

# add
# 需要考虑 PseudoTrigger
for index, row in df_of_attribute_triggers[ df_of_attribute_triggers['属性英文名称'] == 'PT' ].iterrows():
    attribute_name = row['属性英文名称']
    attribute_value = row['标准取值']
    attribute_triggers = row['触发词']

    # 写入属性取值-触发词变量
    if attribute_triggers != '0':
        triggers_of_attribute_values.setdefault( attribute_value, attribute_triggers.split(';') )


# 如果是Assertion:Possible属性，需要考虑Frequency的触发词并入到Possible里
if target_attribute == 'Assertion':
    # 
    for index, row in df_of_attribute_triggers[ df_of_attribute_triggers['属性英文名称'] == 'Frequency' ].iterrows():
        attribute_name = row['属性英文名称']
        attribute_value = row['标准取值']
        attribute_triggers = row['触发词']

        # 写入属性取值-触发词变量
        if attribute_triggers != '0' and attribute_value != "Obligate":
            triggers_of_attribute_values['Possible'] += attribute_triggers.split(';')

# 如果是Severity的属性，需要把Severe Case作为虚假触发信号
if target_attribute == 'Severity':       
    # 
    for index, row in df_of_attribute_triggers[ df_of_attribute_triggers['属性英文名称'] == 'SOI' ].iterrows():
        attribute_name = row['属性英文名称']
        attribute_value = row['标准取值']
        attribute_triggers = row['触发词']

        # 写入属性取值-触发词变量
        if attribute_triggers != '0':
            triggers_of_attribute_values['PseudoTrigger'] += attribute_triggers.split(';')    

print(triggers_of_attribute_values)


{'Mild_problem': ['mild illness', 'mild case', 'mild infection', 'mild symtom', 'mild one'], 'Severe_problem': ['severe form', 'severe case', 'severe infection', 'severe disease', 'severe illness', 'severe case', 'severe infection', 'severe one', 'when severe', 'severe acute disease', 'serious one', 'serious form', 'severe symptoms'], 'PseudoTrigger': ['no increase', 'no significant change', 'not only', 'not drain', 'not certain whether', 'when urinating', 'when symptomatic', 'urinate frequently', 'most notably', 'most important', 'a few days', 'common cold', 'most common cause', 'most severe', 'most serious', 'common name', 'half of the body', 'a few months', 'difficulty becoming pregnant', 'complications of pregnancy', 'ectopic (tubal) pregnancy', 'adult worms', 'common cold', 'female worms', 'men and women', 'other than pregnant women', 'high numbers', 'many other', 'general ill', 'as many as', 'general abdominal pain', 'often referred to', 'without significant', 'heavy breathing', 

In [306]:
# 将 triggers_of_attribute_values 转换为 trigger 到 标准取值的形式，便于处理
mapping_btw_trigger_and_std_value = {}

for attribute_value in triggers_of_attribute_values:
    for trigger_word in triggers_of_attribute_values[attribute_value]:
        mapping_btw_trigger_and_std_value.setdefault( trigger_word, attribute_value )

In [307]:
print(codes_of_attribute_values)

{'None': 0, 'Severe_problem': 1, 'Mild_problem': 2}


In [308]:
# mapping_btw_trigger_and_std_value

In [309]:
# 训练集表征
# 尝试以表型为单位，生成该表型的特征矢量(X), 并记录对应的分类标签(y)
# 由于是有监督学习，用于训练的样本量还是要有一定保障，
# 要求在训练集中一个类别至少要有五个以上的样本 (5个样本以下就是小样本学习了)
# 对于小样本，尝试使用基于规则的方法来预测其属性，而非用机器学习
# 然后合并生成 y_pred 与 y_true进行比较
# 
# 对于Severity而言, 当表型对象是Fever的时候，摄氏度也要算作触发信号
# 如果某一个标准取值有多个触发信号，选择(保留)离核心词最近的那个
# 如果触发词超过了50个token, 忽略
features_of_phenotypes = []
labels_of_phenotypes   = []

# 对于Assertion属性的每一个具有足够样本量的属性取值
# {'Present': 0, 'Possible': 1, 'Conditional': 2, 'Absent': 3}
for attribute_value in codes_of_attribute_values:
    # 自带了限制，该属性取值的样本数不能少于该属性具有的取值分类数目
    # if attribute_value != 'None':
    #     # 该属性取值的样本数 < 该属性具有的分类数
    #     if present_of_attribute_values[ attribute_value ] < len(present_of_attribute_values):
    #         continue

    # 观测具有该属性取值的表型信息
    for pheno_info in phenotypes_with_target_attribute[attribute_value]:
        # 表型名称
        pheno_name = pheno_info['pheno_name']
        # 表型所在的句子
        pheno_text = pheno_info['pheno_context']
        # 表型在句子中的位置
        pheno_spos, pheno_epos = pheno_info['pheno_pos_in_sent']
        pheno_epos = pheno_epos-1

        # 表型属性取值的类别标签
        # value_code = codes_of_attribute_values[attribute_value]

        # 逗号替换为空格，避免它依附在单词上带来干扰 (有TreebankWordTokenizer就不必了)
        # pheno_text = pheno_text.replace(',',' ')

        # 对句子进行带位置信息的tokenize操作
        span_generator = TreebankWordTokenizer().span_tokenize(pheno_text)  

        # 对句子的token进行小写化处理和词形还原处理
        text_words_cleaned_with_pos = []
        text_words_cleaned = []

        for span in span_generator:
            # token原始形式
            raw_word = pheno_text[ span[0]:span[1] ]
            # token处理后形式
            cleaned_word = wordnet_lemmatizer.lemmatize( raw_word.lower() )
            # 记录token原始形式的位置和token处理后形式
            text_words_cleaned_with_pos.append( (span[0], span[1], cleaned_word) ) 
            # 记录token处理后形式
            text_words_cleaned.append( cleaned_word )


        # 特征矢量构建
        # 1. 不同属性取值的触发词的出现情况  [] 维度等于属性取值的数目，排序按code排列
        # 2. 如果出现，相对于中心词的位置 (token数)    
        # 3. 如果出现，与中心词之间是否存在语义分割?
        # 句中的触发信号 出现 1/未出现 0
        vector_of_trigger_signal    = [0]*len( codes_of_attribute_values )
        # 触发信号到核心词的距离 未出现 对应 -1; 出现 距离的绝对值/100；超过100个token设置为1
        # 不能设置为0，因为0是有特殊意义的
        vector_of_relative_distance = [-1]*len( codes_of_attribute_values )
        # 触发信号与核心词之间有无标点符号  (-)1 触发词未出现  (0)(无标点符号)  (1)(有标点符号)
        vector_of_semantic_split    = [0]*len( codes_of_attribute_values )


        # 扫描表型上下文中的触发词 (主要是看是哪一个属性取值的触发词)
        # 只要找到一个即可？还是所有的触发词都要找到？根据双向扫描的经验，应该是都要找到
        present_of_triggers = {}

        for raw_trigger_word in mapping_btw_trigger_and_std_value:
            # trigger_word小写处理
            trigger_word = raw_trigger_word.lower()

            # 避免错写的那种异常情况
            if trigger_word in ['',' ']:
                continue     

            # 如果触发词是unigram，那么在 text_words_cleaned 寻找其存在
            if ' ' not in trigger_word:
                # 观察该triger_word是否在text_words_cleaned(小写化和词形还原处理后)的列表中存在
                # 因为可能有多次出现 用 trigger word in/index只能找到1个
                for text_word_idx, text_word in enumerate( text_words_cleaned ):                    
                    # 避免一种异常情况
                    if ',' in text_word:
                        text_word = text_word.replace(',','')
                    #
                    if trigger_word == text_word:
                        # 提取 trigger_word 在句子中出现的位置
                        trigger_spos = text_words_cleaned_with_pos[ text_word_idx ][0]
                        trigger_epos = text_words_cleaned_with_pos[ text_word_idx ][1] -1
                        # 将之区间化，并记录到 present_of_triggers 中
                        trigger_interval = Interval( trigger_spos, trigger_epos )       
                        # 
                        if trigger_interval not in present_of_triggers:
                            # 该触发词触发的标准值
                            trigged_value = mapping_btw_trigger_and_std_value[trigger_word]
                            # 记录该区间存在的触发词及其触发的标准值
                            present_of_triggers.setdefault(  trigger_interval, (trigger_word, trigged_value) )                  
            # 如果触发词不是unigram, 直接在句子层面匹配(因为这种情况下产生冗余的可能性很小)
            else:
                # ' ' in trigger_word
                # 为了进一步排除干扰，还可以加一个空格 (由多个token组成的，就没有必要加空格了)
                # 不然 severe case 遇到severe cases就歇菜了
                if trigger_word in pheno_text:
                    # 提取 trigger_word 在句子中出现的位置
                    trigger_spos = pheno_text.index( trigger_word )
                    trigger_epos = trigger_spos + len(trigger_word) - 1  
                    # 将之区间化，并记录到 present_of_triggers 中
                    trigger_interval = Interval( trigger_spos, trigger_epos ) 
                    # 
                    if trigger_interval not in present_of_triggers:
                        # 该触发词触发的标准值
                        trigged_value = mapping_btw_trigger_and_std_value[trigger_word]
                        # 记录该区间存在的触发词及其触发的标准值
                        present_of_triggers.setdefault(  trigger_interval, (trigger_word, trigged_value) )                                               

        # 数值型触发信号
        # 对于Frequency属性，数量型触发词的判断
        # 数量型触发词 , 用 of, people, case的存在减少假阳性
        # 频率数字生效的条件同 present_of_trigger_words (的确可以考虑合并进去)
        # 一个问题是 XXX in 26%, XXX ... 这种情况 (还好，长程触发信号无此问题)(没有进行逗号的替换)
        # 26% of case has XXX, XXX 
        # present_of_triggers.setdefault(  trigger_interval, (trigger_word, trigged_value) )        
        if re.search('\sof\s|\sin\s|\speople|\scase|\spatient|\sperson', pheno_text):
            # 搜索数值型触发信号
            for match_pt in re.finditer( '\d+(\.\d+)?%', pheno_text ):
                # 观察该 match_pt去除%后的match_num 能否转化为数字
                match_num = match_pt.group().replace('%','')
                if match_num.replace( '.','',1 ).isdigit():
                    # 拿到频率数字
                    freq_num = float( match_num )
                    #
                    # 频率数字的位置
                    trigger_spos = pheno_text.index( match_pt.group() )
                    trigger_epos = trigger_spos + len( match_pt.group() ) -1    
                    # 将之区间化，并记录到 present_of_triggers 中
                    trigger_interval = Interval( trigger_spos, trigger_epos ) 
                    # 触发词是百分比
                    trigger_word = match_pt.group()                     
                    #                                      
                    # 根据频率数字判断频率取值
                    trigged_value = ""
                    # 
                    if freq_num == 100:
                        trigged_value = "Frequency:Obligate"
                    elif freq_num >=80 and freq_num <100:
                        trigged_value = "Frequency:Very_frequent"
                    elif freq_num >=30 and freq_num <80:
                        trigged_value = "Frequency:Frequent"
                    elif freq_num >=5 and freq_num <30:
                        trigged_value = "Frequency:Occasional"
                    elif freq_num >=1 and freq_num <5:
                        trigged_value = "Frequency:Very_rare"
                    # 记录该百分比触发信号及其对应触发的属性标准值
                    if trigged_value != "":
                        present_of_triggers.setdefault(  trigger_interval, (trigger_word, trigged_value) ) 


        # 初始触发信号扫描结果 present_of_triggers = {}
        # 对找到的触发信号进行去冗余
        # present_of_triggers = {}    
        # trigger_interval, (trigger_word, std_value)
        excluded_trigger_keys = set()

        for trigger_interval_A in present_of_triggers:
            for trigger_interval_B in present_of_triggers:
                if trigger_interval_A != trigger_interval_B:
                    # 看区间A或区间B是否被包含?
                    if trigger_interval_A in trigger_interval_B:
                        excluded_trigger_keys.add( trigger_interval_A ) 
                    elif trigger_interval_B in trigger_interval_A:
                        excluded_trigger_keys.add( trigger_interval_B ) 
        
        # 
        for excluded_trigger_key in excluded_trigger_keys:
            del present_of_triggers[excluded_trigger_key]


        # 如果存在该属性的属性值的触发信号
        if len( present_of_triggers ) != 0 :    
            # 对触发信号根据位置进行排序
            sorted_trigger_positions = []
            for trigger_interval in present_of_triggers:
                sorted_trigger_positions.append( (trigger_interval.lower_bound, trigger_interval.upper_bound) )
            sorted_trigger_positions.sort()

            # 还原成Interval
            sorted_trigger_interval = [ Interval(trigger_pos[0], trigger_pos[1]) for trigger_pos in sorted_trigger_positions]
                    

            # 从左到右阅读trigger信号
            for trigger_interval in sorted_trigger_interval:
                # 触发词及其触发的属性标准值
                trigger_word, trigged_value = present_of_triggers[trigger_interval]

                # 如果是虚假触发信号，跳过    
                if 'PseudoTrigger' in trigged_value:
                    continue

                # 触发信号的起止位置
                trigger_spos = trigger_interval.lower_bound
                trigger_epos = trigger_interval.upper_bound

                # 计算触发信号与核心词之间的距离，以及与核心词之间是否存在语义分割
                token_based_distance = -1     # 触发词未出现
                with_semantic_split  = False  # 没有语义分割

                # 若触发词在表型的左边
                if trigger_epos < pheno_spos :
                    # 相对距离计算
                    # 提取两者之间的文本
                    text_btw   = pheno_text[ trigger_epos+1:pheno_spos ]
                    # 计数两者之间的token数目 (不包括标点符号在内)
                    tokens_btw = word_tokenize(text_btw)
                    tokens_btw = [token for token in tokens_btw if token.isalpha()]
                    # left 
                    token_based_distance = len(tokens_btw)
                    # 距离归一化
                    if token_based_distance <=100:
                        token_based_distance = token_based_distance/100
                    else:
                        token_based_distance = 1
                    #
                    # 两者之间是否存在语义分割？
                    if re.search(',|;|:', text_btw):
                        with_semantic_split = True
                # 若触发词在表型的右边
                elif trigger_spos > pheno_epos:
                    # 相对距离计算
                    # 提取两者之间的文本
                    text_btw   = pheno_text[ pheno_epos+1:trigger_spos ]
                    # 计数两者之间的token数目 (不包括标点符号在内)
                    tokens_btw = word_tokenize(text_btw)
                    tokens_btw = [token for token in tokens_btw if token.isalpha()]
                    # right + 
                    token_based_distance = len(tokens_btw)
                    # 距离归一化
                    if token_based_distance <=100:
                        token_based_distance = token_based_distance/100     
                    else:
                        token_based_distance = 1                                               
                    #
                    # 两者之间是否存在语义分割？
                    if re.search(',|;|:', text_btw):
                        with_semantic_split = True
                # 若触发词就位于表型概念中
                elif trigger_spos >= pheno_spos and trigger_epos <= pheno_epos:
                    # 相对距离设置为0                                                
                    token_based_distance = 0
                    # 没有语义分割
                    with_semantic_split  = False
                #
                # 触发信号存在，触发信号距离
                # token_based_distance = -1     # 触发词未出现
                # with_semantic_split  = False  # 没有语义分割                    
                # 基于上述计算更新该属性触发信号的存在情况
                # 更新特征矢量，同一标准取值有多个特征矢量仅更新最近的那一个信号
                # 
                # 首先找到触发词对应的属性标准取值对应的数字编码
                # 只考虑需要预测的属性类型的触发信号(Nocturnal不在其中)
                if trigged_value in codes_of_attribute_values:
                    # 触发词触发的属性标准值对应的数字编码
                    trigged_value_idx = codes_of_attribute_values[ trigged_value ]                    
                    # 如果现在还没有该属性值的触发信号
                    if vector_of_trigger_signal[ trigged_value_idx ] == 0:
                        # 触发信号出现
                        vector_of_trigger_signal[ trigged_value_idx ] = 1                    
                        # 相对距离
                        vector_of_relative_distance[ trigged_value_idx ] = token_based_distance
                    else:
                        if token_based_distance < vector_of_relative_distance[ trigged_value_idx ]:
                            # 触发信号出现
                            vector_of_trigger_signal[ trigged_value_idx ] = 1                    
                            # 相对距离
                            vector_of_relative_distance[ trigged_value_idx ] = token_based_distance                                
                

        # 特征矢量构建
        # vector_of_trigger_signal    = [0]*len( codes_of_attribute_values )
        # vector_of_relative_distance = [0]*len( codes_of_attribute_values )
        # vector_of_semantic_split    = [0]*len( codes_of_attribute_values )

        # 如果在句子中没有找到触发信号，而属性的取值又不是None或Present，这样的数据不要写入
        if vector_of_trigger_signal == [0]*len( codes_of_attribute_values ):
            # 是阳性样本，而不是None，句子中没有找到触发信号，跳过 
            if not re.search('None|Present', attribute_value) :
                continue
            else:
                # 是None样本，写入
                # 记录该表型的特征矢量
                # features_of_phenotypes.append( vector_of_trigger_signal + vector_of_relative_distance  )
                features_of_phenotypes.append( vector_of_trigger_signal + vector_of_relative_distance )
                # 记录该表型属性取值的类别标签
                labels_of_phenotypes.append( codes_of_attribute_values[attribute_value] )
        else:
            # 记录该表型的特征矢量
            # features_of_phenotypes.append( vector_of_trigger_signal + vector_of_relative_distance  )
            features_of_phenotypes.append( vector_of_trigger_signal + vector_of_relative_distance )
            # 记录该表型属性取值的类别标签
            labels_of_phenotypes.append( codes_of_attribute_values[attribute_value] )                

In [310]:
len( features_of_phenotypes), len( labels_of_phenotypes )

(111, 111)

In [311]:
Counter( labels_of_phenotypes )

Counter({0: 61, 1: 45, 2: 5})

In [312]:
print(codes_of_attribute_values)

{'None': 0, 'Severe_problem': 1, 'Mild_problem': 2}


In [313]:
mapping_btw_codes_and_value = {}

for attribute_value in codes_of_attribute_values:
    mapping_btw_codes_and_value.setdefault( codes_of_attribute_values[attribute_value], attribute_value )

print(mapping_btw_codes_and_value)

{0: 'None', 1: 'Severe_problem', 2: 'Mild_problem'}


In [314]:
features_of_phenotypes[1], labels_of_phenotypes[1]

([0, 0, 0, -1, -1, -1], 0)

In [315]:
# 对类别中的小类进行上采样，直接复制features_of_phenotypes中的数据即可

In [316]:
added_features_of_phenotypes = []
added_labels_of_phenotypes = []

# Counter({0: 811, 1: 1844, 2: 48, 3: 13, 4: 4})
for value_code in Counter( labels_of_phenotypes ):
    value_count = Counter( labels_of_phenotypes )[value_code]
    # print(value_code, value_count)
    # 如果该属性取值的value_count过少，复制一些数据出来
    if value_count <= 20:
        # 找到 labels_of_phenotypes 对应的 value_code
        if value_code in labels_of_phenotypes:
            idx = labels_of_phenotypes.index(value_code)
            # 提取特征
            vec = features_of_phenotypes[idx]
            # 复制20份加入进去
            copy_num = 20 - value_count
            for i in range(copy_num):
                added_features_of_phenotypes.append( vec )
                added_labels_of_phenotypes.append( value_code )

# added_labels_of_phenotypes

In [317]:
# 添加到训练集中
features_of_phenotypes += added_features_of_phenotypes
labels_of_phenotypes   += added_labels_of_phenotypes

Counter( labels_of_phenotypes )

Counter({0: 61, 1: 45, 2: 20})

In [318]:
# 这种上采样方式应该问题不大，因为从特征层面的抽象角度讲，就是该都差不多的。

In [319]:
# 构建机器学习模型

In [320]:
# 参数空间
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf']}

In [321]:
%%time
grid = GridSearchCV(SVC(),param_grid,refit=True)
grid.fit(features_of_phenotypes, labels_of_phenotypes)

CPU times: user 264 ms, sys: 5.34 ms, total: 270 ms
Wall time: 267 ms


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf']})

In [322]:
grid.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

In [323]:
# 记录该模型
# 注意 setsetdefault不会改变默认值
if target_attribute not in dict_of_trained_classifiers:
    dict_of_trained_classifiers.setdefault( target_attribute, grid )
else:
    dict_of_trained_classifiers[target_attribute] = grid 

In [324]:
# dict_of_trained_classifiers[target_attribute].best_params_
# 训练集上的预测效果 [还是倾向于大类，不过没问题]
Counter( list(dict_of_trained_classifiers[target_attribute].predict( features_of_phenotypes )) ) 

Counter({0: 59, 1: 45, 2: 22})

In [325]:
print(codes_of_attribute_values)

{'None': 0, 'Severe_problem': 1, 'Mild_problem': 2}


In [326]:
for attribute_name in dict_of_trained_classifiers:
    print( attribute_name, dict_of_trained_classifiers[attribute_name].best_params_ )

Assertion {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
Severity {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
Temporal {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Sensation {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
Color {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Age {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
Gender {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
Frequency {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
Distribution {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
SOI {'C': 1, 'gamma': 1, 'kernel': 'rbf'}


In [69]:
# 测试集
# 对以上属性进行预测, 将预测结果解码后记录下来
# 这个流程是低效的，但作为评估流程尽快拿到数据，能用就行
# 应效仿我在深度学习上的方法，那个方法快

In [327]:
diseases_used_for_test[0]

'Poliomyelitis'

In [329]:
%%time
for disease_name in list( annotated_phenotypes_info_of_diseases.keys() ):
    # 仅看训练集中的疾病
    if disease_name not in diseases_used_for_test:
        continue

    # disease_name = 'Poliomyelitis'

    # 
    # 该疾病文档中标注的表型及其关联属性
    info_of_annotated_phenotypes = annotated_phenotypes_info_of_diseases[disease_name]

    # 对于该疾病文档中标注的每一个表型
    for pheno_id in info_of_annotated_phenotypes:
        # 获取该变量中存储的表型相关信息
        pheno_info = info_of_annotated_phenotypes[pheno_id]


        # 设置该表型属性槽的默认值 (同时记录判断依据)
        default_values_of_attributes = {}

        # 这里其实没有触发信号，但还是这样保留着吧
        for attribute_name in list_of_attributes:
            if attribute_name == 'Assertion':
                default_values_of_attributes.setdefault( attribute_name, ['Present',''] )
            else:
                default_values_of_attributes.setdefault( attribute_name, ['None',''] )   

        #  
        # 之前的一个问题 (这种表型的属性槽设置为全部默认即可) 数量不多，9个
        if 'pheno_context' not in pheno_info:
            # print(disease_name)
            if 'pred_attribute_values' not in annotated_phenotypes_info_of_diseases[disease_name][pheno_id]:
                annotated_phenotypes_info_of_diseases[disease_name][pheno_id].setdefault( 'pred_attribute_values', 
                                                                                        default_values_of_attributes )
            else:
                annotated_phenotypes_info_of_diseases[disease_name][pheno_id]['pred_attribute_values'] =        default_values_of_attributes               
            # count += 1
            continue     


        # 表型的名称
        pheno_name = pheno_info['pheno_name']
        # 表型所在的句子 (有的表型没有"pheno_context") )
        pheno_text = pheno_info['pheno_context']
        # 表型在句子中的位置 (注意pheno_epos-1)才是表型最后一个字符串的位置
        pheno_spos, pheno_epos = pheno_info['pheno_pos_in_sent']     
        pheno_epos = pheno_epos-1
        # 表型的关联属性
        pheno_labels = pheno_info['associated_attributes']  

        # 使用机器学习模型根据表型的上下文表征预测属性 (更新default_attribute_values字典)
        # 生成该表型的用于机器学习的表征，以便调用模型进行预测
        # 注意有两个属性没有机器学习模型
        for target_attribute in dict_of_trained_classifiers:
            # pheno_features = [] --> pred_label
            # 1
            # 对目标属性的取值进行数字编码
            codes_of_attribute_values = {}

            # default的属性取值设置为0
            if target_attribute == 'Assertion':
                codes_of_attribute_values.setdefault( "Present", 0 )
            else:
                codes_of_attribute_values.setdefault( "None", 0 )  


            # 观察目标属性在语料集中的出现情况
            # 观察目标属性在训练集中的出现情况
            present_of_attribute_values = stat_occurences_of_attribute_values(target_attribute, diseases_used_for_training)
            # 该属性在训练集中出现的类别数 len( present_of_attribute_values )

            # 对出现过的目标属性取值进行数字编码
            for attribute_value in present_of_attribute_values:
                if attribute_value != 'Present':
                    # 赋予编号
                    # 只有当该属性值具有的样本数大于类别数时才考虑
                    count_of_attribute_value = present_of_attribute_values[ attribute_value ] 
                    # 不设限制
                    if count_of_attribute_value >= 0 :
                        code_of_attribute_value = len( codes_of_attribute_values )
                        codes_of_attribute_values.setdefault( attribute_value , code_of_attribute_value )  

            # 
            # codes_of_attribute_values = {}
            mapping_codes_to_values = {}
            for attribute_value in codes_of_attribute_values:
                mapping_codes_to_values.setdefault( codes_of_attribute_values[attribute_value], attribute_value )


            # 2
            # 获取该属性不同属性取值的触发词
            # 不考虑default的属性取值
            triggers_of_attribute_values = {}

            for index, row in df_of_attribute_triggers[ df_of_attribute_triggers['属性英文名称'] == target_attribute ].iterrows():
                attribute_name = row['属性英文名称']
                attribute_value = row['标准取值']
                attribute_triggers = row['触发词']

                # 跳过必选属性的默认取值Present
                if attribute_value == 'Present':
                    continue

                # 写入属性取值-触发词变量
                if attribute_triggers != '0':
                    triggers_of_attribute_values.setdefault( attribute_value, attribute_triggers.split(';') )    


            # 3
            # 将 triggers_of_attribute_values 转换为 trigger 到 标准取值的形式，便于处理
            mapping_btw_trigger_and_std_value = {}

            for attribute_value in triggers_of_attribute_values:
                for trigger_word in triggers_of_attribute_values[attribute_value]:
                    mapping_btw_trigger_and_std_value.setdefault( trigger_word, attribute_value )

            # 4
            # 对pheno_text进行处理，生成其表征
            pheno_features = []

            # 对句子进行带位置信息的tokenize操作
            span_generator = TreebankWordTokenizer().span_tokenize(pheno_text)  

            # 对句子的token进行小写化处理和词形还原处理
            text_words_cleaned_with_pos = []
            text_words_cleaned = []

            for span in span_generator:
                # token原始形式
                raw_word = pheno_text[ span[0]:span[1] ]
                # token处理后形式
                cleaned_word = wordnet_lemmatizer.lemmatize( raw_word.lower() )
                # 记录token原始形式的位置和token处理后形式
                text_words_cleaned_with_pos.append( (span[0], span[1], cleaned_word) ) 
                # 记录token处理后形式
                text_words_cleaned.append( cleaned_word )


            # 各分量特征矢量构建
            # 1. 不同属性取值的触发词的出现情况  [] 维度等于属性取值的数目，排序按code排列
            # 2. 如果出现，相对于中心词的位置 (token数)    
            # 3. 如果出现，与中心词之间是否存在语义分割?
            # 句中的触发信号 出现 1/未出现 0
            vector_of_trigger_signal    = [0]*len( codes_of_attribute_values )
            # 触发信号到核心词的距离 未出现 对应 -1; 出现 距离的绝对值/100；超过100个token设置为1
            # 不能设置为0，因为0是有特殊意义的
            vector_of_relative_distance = [-1]*len( codes_of_attribute_values )
            # 触发信号与核心词之间有无标点符号  (-)1 触发词未出现  (0)(无标点符号)  (1)(有标点符号)
            vector_of_semantic_split    = [0]*len( codes_of_attribute_values )


            # 扫描表型上下文中的触发词 (主要是看是哪一个属性取值的触发词)
            # 只要找到一个即可？还是所有的触发词都要找到？根据双向扫描的经验，应该是都要找到
            present_of_triggers = {}

            for raw_trigger_word in mapping_btw_trigger_and_std_value:
                # trigger_word小写处理
                trigger_word = raw_trigger_word.lower()

                # 避免错写的那种异常情况
                if trigger_word in ['',' ']:
                    continue     

                # 如果触发词是unigram，那么在 text_words_cleaned 寻找其存在
                if ' ' not in trigger_word:
                    # 观察该triger_word是否在text_words_cleaned(小写化和词形还原处理后)的列表中存在
                    # 因为可能有多次出现 用 trigger word in/index只能找到1个
                    for text_word_idx, text_word in enumerate( text_words_cleaned ):                    
                        # 避免一种异常情况
                        if ',' in text_word:
                            text_word = text_word.replace(',','')
                        #
                        if trigger_word == text_word:
                            # 提取 trigger_word 在句子中出现的位置
                            trigger_spos = text_words_cleaned_with_pos[ text_word_idx ][0]
                            trigger_epos = text_words_cleaned_with_pos[ text_word_idx ][1] -1
                            # 将之区间化，并记录到 present_of_triggers 中
                            trigger_interval = Interval( trigger_spos, trigger_epos )       
                            # 
                            if trigger_interval not in present_of_triggers:
                                # 该触发词触发的标准值
                                trigged_value = mapping_btw_trigger_and_std_value[trigger_word]
                                # 记录该区间存在的触发词及其触发的标准值
                                present_of_triggers.setdefault(  trigger_interval, (trigger_word, trigged_value) )                  
                # 如果触发词不是unigram, 直接在句子层面匹配(因为这种情况下产生冗余的可能性很小)
                else:
                    # ' ' in trigger_word
                    # 为了进一步排除干扰，还可以加一个空格 (由多个token组成的，就没有必要加空格了)
                    # 不然 severe case 遇到severe cases就歇菜了
                    if trigger_word in pheno_text:
                        # 提取 trigger_word 在句子中出现的位置
                        trigger_spos = pheno_text.index( trigger_word )
                        trigger_epos = trigger_spos + len(trigger_word) - 1  
                        # 将之区间化，并记录到 present_of_triggers 中
                        trigger_interval = Interval( trigger_spos, trigger_epos ) 
                        # 
                        if trigger_interval not in present_of_triggers:
                            # 该触发词触发的标准值
                            trigged_value = mapping_btw_trigger_and_std_value[trigger_word]
                            # 记录该区间存在的触发词及其触发的标准值
                            present_of_triggers.setdefault(  trigger_interval, (trigger_word, trigged_value) )                                               

            # 数量型触发词 , 用 of, people, case的存在减少假阳性
            # 频率数字生效的条件同 present_of_trigger_words (的确可以考虑合并进去)
            # 一个问题是 XXX in 26%, XXX ... 这种情况 (还好，长程触发信号无此问题)(没有进行逗号的替换)
            # 26% of case has XXX, XXX 
            # present_of_triggers.setdefault(  trigger_interval, (trigger_word, trigged_value) )        
            if re.search('\sof\s|\sin\s|\speople|\scase|\spatient|\sperson', pheno_text):
                # 搜索数值型触发信号
                for match_pt in re.finditer( '\d+(\.\d+)?%', pheno_text ):
                    # 观察该 match_pt去除%后的match_num 能否转化为数字
                    match_num = match_pt.group().replace('%','')
                    if match_num.replace( '.','',1 ).isdigit():
                        # 拿到频率数字
                        freq_num = float( match_num )
                        #
                        # 频率数字的位置
                        trigger_spos = pheno_text.index( match_pt.group() )
                        trigger_epos = trigger_spos + len( match_pt.group() ) -1    
                        # 将之区间化，并记录到 present_of_triggers 中
                        trigger_interval = Interval( trigger_spos, trigger_epos ) 
                        # 触发词是百分比
                        trigger_word = match_pt.group()                     
                        #                                      
                        # 根据频率数字判断频率取值
                        trigged_value = ""
                        # 
                        if freq_num == 100:
                            trigged_value = "Frequency:Obligate"
                        elif freq_num >=80 and freq_num <100:
                            trigged_value = "Frequency:Very_frequent"
                        elif freq_num >=30 and freq_num <80:
                            trigged_value = "Frequency:Frequent"
                        elif freq_num >=5 and freq_num <30:
                            trigged_value = "Frequency:Occasional"
                        elif freq_num >=1 and freq_num <5:
                            trigged_value = "Frequency:Very_rare"
                        # 记录该百分比触发信号及其对应触发的属性标准值
                        if trigged_value != "":
                            present_of_triggers.setdefault(  trigger_interval, (trigger_word, trigged_value) ) 



            # 初始触发信号扫描结果 present_of_triggers = {}
            # 对找到的触发信号进行去冗余
            # present_of_triggers = {}    
            # trigger_interval, (trigger_word, std_value)
            excluded_trigger_keys = set()

            for trigger_interval_A in present_of_triggers:
                for trigger_interval_B in present_of_triggers:
                    if trigger_interval_A != trigger_interval_B:
                        # 看区间A或区间B是否被包含?
                        if trigger_interval_A in trigger_interval_B:
                            excluded_trigger_keys.add( trigger_interval_A ) 
                        elif trigger_interval_B in trigger_interval_A:
                            excluded_trigger_keys.add( trigger_interval_B ) 
            
            # 
            for excluded_trigger_key in excluded_trigger_keys:
                del present_of_triggers[excluded_trigger_key]


            # 如果存在该属性的属性值的触发信号
            if len( present_of_triggers ) != 0 :    
                # 对触发信号根据位置进行排序
                sorted_trigger_positions = []
                for trigger_interval in present_of_triggers:
                    sorted_trigger_positions.append( (trigger_interval.lower_bound, trigger_interval.upper_bound) )
                sorted_trigger_positions.sort()

                # 还原成Interval
                sorted_trigger_interval = [ Interval(trigger_pos[0], trigger_pos[1]) for trigger_pos in sorted_trigger_positions]
                        

                # 从左到右阅读trigger信号
                for trigger_interval in sorted_trigger_interval:
                    # 触发词及其触发的属性标准值
                    trigger_word, trigged_value = present_of_triggers[trigger_interval]

                    # 如果是虚假触发信号，跳过    
                    if 'PseudoTrigger' in trigged_value:
                        continue

                    # 触发信号的起止位置
                    trigger_spos = trigger_interval.lower_bound
                    trigger_epos = trigger_interval.upper_bound

                    # 计算触发信号与核心词之间的距离，以及与核心词之间是否存在语义分割
                    token_based_distance = -1     # 触发词未出现
                    with_semantic_split  = False  # 没有语义分割

                    # 若触发词在表型的左边
                    if trigger_epos < pheno_spos :
                        # 相对距离计算
                        # 提取两者之间的文本
                        text_btw   = pheno_text[ trigger_epos+1:pheno_spos ]
                        # 计数两者之间的token数目 (不包括标点符号在内)
                        tokens_btw = word_tokenize(text_btw)
                        tokens_btw = [token for token in tokens_btw if token.isalpha()]
                        # left 
                        token_based_distance = len(tokens_btw)
                        # 距离归一化
                        if token_based_distance <=100:
                            token_based_distance = token_based_distance/100
                        else:
                            token_based_distance = 1
                        #
                        # 两者之间是否存在语义分割？
                        if re.search(',|;|:', text_btw):
                            with_semantic_split = True
                    # 若触发词在表型的右边
                    elif trigger_spos > pheno_epos:
                        # 相对距离计算
                        # 提取两者之间的文本
                        text_btw   = pheno_text[ pheno_epos+1:trigger_spos ]
                        # 计数两者之间的token数目 (不包括标点符号在内)
                        tokens_btw = word_tokenize(text_btw)
                        tokens_btw = [token for token in tokens_btw if token.isalpha()]
                        # right + 
                        token_based_distance = len(tokens_btw)
                        # 距离归一化
                        if token_based_distance <=100:
                            token_based_distance = token_based_distance/100     
                        else:
                            token_based_distance = 1                                               
                        #
                        # 两者之间是否存在语义分割？
                        if re.search(',|;|:', text_btw):
                            with_semantic_split = True
                    # 若触发词就位于表型概念中
                    elif trigger_spos >= pheno_spos and trigger_epos <= pheno_epos:
                        # 相对距离设置为0                                                
                        token_based_distance = 0
                        # 没有语义分割
                        with_semantic_split  = False
                    #
                    # 触发信号存在，触发信号距离
                    # token_based_distance = -1     # 触发词未出现
                    # with_semantic_split  = False  # 没有语义分割                    
                    # 基于上述计算更新该属性触发信号的存在情况
                    # 更新特征矢量，同一标准取值有多个特征矢量仅更新最近的那一个信号
                    # 
                    # 首先找到触发词对应的属性标准取值对应的数字编码
                    # 只考虑需要预测的属性类型的触发信号(Nocturnal不在其中)
                    if trigged_value in codes_of_attribute_values:
                        # 触发词触发的属性标准值对应的数字编码
                        trigged_value_idx = codes_of_attribute_values[ trigged_value ]                    
                        # 如果现在还没有该属性值的触发信号
                        if vector_of_trigger_signal[ trigged_value_idx ] == 0:
                            # 触发信号出现
                            vector_of_trigger_signal[ trigged_value_idx ] = 1                    
                            # 相对距离
                            vector_of_relative_distance[ trigged_value_idx ] = token_based_distance
                        else:
                            if token_based_distance < vector_of_relative_distance[ trigged_value_idx ]:
                                # 触发信号出现
                                vector_of_trigger_signal[ trigged_value_idx ] = 1                    
                                # 相对距离
                                vector_of_relative_distance[ trigged_value_idx ] = token_based_distance

            # 4
            # 特征矢量拼接
            # vector_of_trigger_signal    = [0]*len( codes_of_attribute_values )
            # vector_of_relative_distance = [0]*len( codes_of_attribute_values )
            # vector_of_semantic_split    = [0]*len( codes_of_attribute_values )
            pheno_features = vector_of_trigger_signal + vector_of_relative_distance
            

            # 5
            # 根据特征矢量进行预测
            pred_value_code = dict_of_trained_classifiers[target_attribute].predict( [pheno_features] )[0]
            # print( target_attribute, pheno_features, pred_value_code)

            # 
            pred_value_name =  mapping_codes_to_values[pred_value_code]

            # 更新 default 属性槽
            default_values_of_attributes[target_attribute][0] = pred_value_name

        # 默认属性槽全部更新完后
        # 记录更改后的 default_values_of_attributes 到该表型的信息种
        # 为 annotated_phenotypes_info_of_diseases 增加一个 pred_attribute_values key
        if 'pred_attribute_values' not in annotated_phenotypes_info_of_diseases[disease_name][pheno_id]:
            annotated_phenotypes_info_of_diseases[disease_name][pheno_id].setdefault( 'pred_attribute_values', 
                                                                                    default_values_of_attributes )
        else:
            annotated_phenotypes_info_of_diseases[disease_name][pheno_id]['pred_attribute_values'] = default_values_of_attributes                           

CPU times: user 3min 30s, sys: 16.1 s, total: 3min 46s
Wall time: 3min 46s


In [439]:
# annotated_phenotypes_info_of_diseases[disease_name]

In [440]:
# 测试集评估

In [330]:
# 所有/单个属性的预测准确率 (accuracy)
# 所有属性带权重的预测准确率 (weighted accuracy) -- 这里没有单个的说法了

# 对于不同的属性
# {"Severity":[表型的属性列表]}
y_preds = {}
y_trues = {}

for attribute in list_of_attributes:
    y_preds.setdefault(attribute,[])
    y_trues.setdefault(attribute,[])

# 
for disease_name in list( annotated_phenotypes_info_of_diseases.keys() ):
    # 仅看测试集中的疾病
    if disease_name not in diseases_used_for_test:
        continue

    # print(disease_name)

    # 该疾病文档中标注的表型及其关联属性
    info_of_annotated_phenotypes = annotated_phenotypes_info_of_diseases[disease_name]

    # 对于该疾病文档中标注的每一个表型
    for pheno_id in info_of_annotated_phenotypes:
        # 获取该变量中存储的表型相关信息
        pheno_info = info_of_annotated_phenotypes[pheno_id]

        # 该表型人工标注的属性
        string_of_true_attribute_values = pheno_info['associated_attributes']   
        # 如果人工标注的属性中有Stage:Stage_1, 去除
        true_attribute_values = []
        
        for attribute_value in string_of_true_attribute_values.split(';'):
            if 'Stage:Stage_' not in attribute_value:
                true_attribute_values.append( attribute_value )


        # 写入到 y_trues 
        for attribute_name in y_trues:
            # 这里只有可选属性的时候会触发
            if attribute_name not in string_of_true_attribute_values:
                # 设为 attribute_name +':' + 'None'
                y_trues[attribute_name].append('None')
            else:
                # 如果 attribute_name 存在, 直接使用 true_attribute_values 中的值
                for attribute_value in true_attribute_values:
                    selected_attribute, selected_value = attribute_value.split(':')
                    #
                    if attribute_name == selected_attribute:
                        y_trues[attribute_name].append( selected_value )


        # 对应的，该表型算法预测的属性
        pred_attribute_values = pheno_info['pred_attribute_values']     

        # 写入到 y_preds 
        for attribute_name in y_preds:
            # 这条规则是不会触发的
            if attribute_name not in pred_attribute_values:
                # 设为 attribute_name +':' + 'None'
                y_preds[attribute_name].append('None')    
            else:
                # 如果 attribute_name 存在
                for selected_attribute in pred_attribute_values:
                    selected_value = pred_attribute_values[selected_attribute][0]
                    #
                    if attribute_name == selected_attribute:
                        y_preds[attribute_name].append( selected_value )                

In [331]:
# 根据 y_preds 和 y_trues 计算各个属性预测的准确率
# 不同属性预测正确的数目

for attribute_name in y_trues:
    # 该属性预测正确的计数
    count_correct_pred_all = 0
    # 该属性各分类预测正确的计数
    count_correct_pred_eachs = {}

    true_labels = y_trues[attribute_name]
    pred_labels = y_preds[attribute_name]

    # 计算一致性
    for true_label, pred_label in zip( true_labels, pred_labels ):
        if true_label == pred_label:
            count_correct_pred_all +=1
            # 
            if true_label not in count_correct_pred_eachs:
                count_correct_pred_eachs.setdefault( true_label, 1 )
            else:
                count_correct_pred_eachs[true_label] +=1
    
    # 
    print("\n*******")
    print("属性名称:", attribute_name )
    print("预测正确率:", len(true_labels), count_correct_pred_all, round( count_correct_pred_all/len(true_labels), 3) )
    print( '%.3f'%( count_correct_pred_all/len(true_labels) ), 
            '(' + str(count_correct_pred_all) + '/' + str(len(true_labels) ) + ')' )    
    # 需要知道各分类的数目, 用Counter来统计各分类
    # print("各分类预测正确率:")
    # result = Counter(true_labels)
    # for label_name in result:
    #     label_count = result[label_name]
    #     correct_pred = count_correct_pred_eachs[label_name]
    #     print( label_count, correct_pred, round( correct_pred/label_count, 2) )



*******
属性名称: Assertion
预测正确率: 1104 727 0.659
0.659 (727/1104)

*******
属性名称: Severity
预测正确率: 1104 930 0.842
0.842 (930/1104)

*******
属性名称: Temporal
预测正确率: 1104 957 0.867
0.867 (957/1104)

*******
属性名称: Sensation
预测正确率: 1104 1036 0.938
0.938 (1036/1104)

*******
属性名称: Color
预测正确率: 1104 985 0.892
0.892 (985/1104)

*******
属性名称: Age
预测正确率: 1104 1062 0.962
0.962 (1062/1104)

*******
属性名称: Gender
预测正确率: 1104 1090 0.987
0.987 (1090/1104)

*******
属性名称: Frequency
预测正确率: 1104 856 0.775
0.775 (856/1104)

*******
属性名称: Distribution
预测正确率: 1104 1049 0.95
0.950 (1049/1104)

*******
属性名称: SOI
预测正确率: 1104 1089 0.986
0.986 (1089/1104)


In [332]:
# 计算测试集上各属性预测的weighted accuracy (它比宏平均更好)
# 核心指标
# overall evaluation metrics for task 2a
# accuracy
# weighted accuracy
# per-slot weighted accuracy

In [333]:
# 维基语料集中总的标注的表型实体数量
total_annotated_phenotypes = 0

counts_for_assertion_values  = stat_occurences_of_attribute_values('Assertion', list_of_diseases)
for assertion_value in counts_for_assertion_values:
    total_annotated_phenotypes += counts_for_assertion_values[assertion_value]

total_annotated_phenotypes

4019

In [334]:
# 计算每一个属性取值的prevalence
prevalences_of_attribute_values = {}

for attribute_name in list_of_attributes:
    counts_for_attribute_values = stat_occurences_of_attribute_values(attribute_name, list_of_diseases)

    tmpdict = {}

    sum_of_values = 0 
    for attribute_value in counts_for_attribute_values:
        #
        count_of_value = counts_for_attribute_values[attribute_value]
        sum_of_values += count_of_value
        #
        value_prevalence = count_of_value/total_annotated_phenotypes
        tmpdict.setdefault( attribute_value, value_prevalence )
    
    # 对于None这种情况 (可选属性都有None)
    if attribute_name != 'Assertion':
        count_of_none = total_annotated_phenotypes - sum_of_values
        value_prevalence = count_of_none/total_annotated_phenotypes
        tmpdict.setdefault( 'None', value_prevalence )
    
    #
    prevalences_of_attribute_values.setdefault( attribute_name, tmpdict )

# prevalences_of_attribute_values


In [335]:
# 
# overall_accuracy  = 0
# overall_weighted_acc = 0

list_of_unweighted_accuracy_for_phenotypes = []
list_of_weighted_accuracy_for_phenotypes   = []

# 每一个属性槽的weighted acc
dict_of_weighted_accuracy_for_slots = {}

# 对于4020个表型，每一个属性，其取值有一个权重，记录为[]; 这一取值是否正确，记录一个 weighted identify
# 然后分别求和，然后可计算
for attribute_name in list_of_attributes:
    dict_of_weighted_accuracy_for_slots.setdefault( attribute_name, ([],[]) )


for disease_name in list( annotated_phenotypes_info_of_diseases.keys() ):
    # 仅看测试集中的疾病
    if disease_name not in diseases_used_for_test:
        continue

    # print(disease_name)

    # 该疾病文档中标注的表型及其关联属性
    info_of_annotated_phenotypes = annotated_phenotypes_info_of_diseases[disease_name]

    # 对于该疾病文档中标注的每一个表型
    for pheno_id in info_of_annotated_phenotypes:
        # 获取该变量中存储的表型相关信息
        pheno_info = info_of_annotated_phenotypes[pheno_id]

        # 
        # 该表型人工标注的属性
        string_of_true_attribute_values = pheno_info['associated_attributes']   

        # 仅考虑 list_of_attributes 中的属性 (12个)
        # 'Assertion:Possible'
        true_attribute_values = {}
        
        # 对于这12个属性
        for attribute_name in list_of_attributes:
            # 观察它们是否存在于人工标注中
            # 如果存在，直接赋予人工标注的值
            if attribute_name + ':' in string_of_true_attribute_values:
                # 将这个值找出来
                for selected_attribute_value in string_of_true_attribute_values.split(';'):
                    selected_attribute, selected_value = selected_attribute_value.split(':')
                    if selected_attribute == attribute_name:
                        true_attribute_values.setdefault(attribute_name, selected_value)
                        break
            # 如果不存在，赋予默认值 (Present or None)
            elif attribute_name + ':' not in string_of_true_attribute_values:
                if attribute_name == 'Assertion':
                    true_attribute_values.setdefault(attribute_name, 'Present')
                else:
                    true_attribute_values.setdefault(attribute_name, 'None')


        #
        # 该表型算法预测的属性
        # 原始的pred_attribute_values包含了 detected cue word , 可以去掉
        pred_attribute_values = {}

        for attribute_name in pheno_info['pred_attribute_values']:
            predicted_value = pheno_info['pred_attribute_values'][attribute_name][0]
            pred_attribute_values.setdefault( attribute_name, predicted_value )

        
        # 
        # true_attribute_values = {}
        # pred_attribute_values = {}

        # 
        # 先考虑 per-phenotype unweighted accuracy
        # per-phenotype
        # K : number of slots
        number_of_slots = len( list_of_attributes )

        per_phenotype_unweighted_accuracy = 0
        for attribute_name in list_of_attributes:
            # identify function : I(true_for_slot_k, pred_for_slot_k)
            if true_attribute_values[attribute_name] == pred_attribute_values[attribute_name]:
                per_phenotype_unweighted_accuracy += 1
            else:
                per_phenotype_unweighted_accuracy += 0
        
        # 记录该表型的 identify score
        list_of_unweighted_accuracy_for_phenotypes.append( per_phenotype_unweighted_accuracy/number_of_slots )

        
        #
        # 再考虑 per-phenotype unweighted accuracy
        # 就是 Identify 打分要带上该属性值的权重了
        # 12个属性槽上取值的权重之和 
        per_phenotype_weighted_identify = 0
        sum_of_weights_on_slots = 0

        for attribute_name in list_of_attributes:
            # 该属性对应的取值
            true_selected_value = true_attribute_values[attribute_name]
            # 该属性取值的权重
            weight_of_attribute_value = 1 - prevalences_of_attribute_values[attribute_name][true_selected_value]
            # 这些属性取值对应的权重之和
            sum_of_weights_on_slots += weight_of_attribute_value

            # identify function : I(true_for_slot_k, pred_for_slot_k)
            if true_attribute_values[attribute_name] == pred_attribute_values[attribute_name]:
                # 引入该属性值的权重
                per_phenotype_weighted_identify += 1*weight_of_attribute_value
            else:
                per_phenotype_weighted_identify += 0*weight_of_attribute_value    

        # 记录该表型的 weighted identify score
        list_of_weighted_accuracy_for_phenotypes.append( per_phenotype_weighted_identify/sum_of_weights_on_slots )


        #
        # for per-slot accuracy
        # 对于该表型的每一个属性槽
        for attribute_name in list_of_attributes:
            # 该属性对应的取值
            true_selected_value = true_attribute_values[attribute_name]
            # 该属性取值的权重
            weight_of_attribute_value = 1 - prevalences_of_attribute_values[attribute_name][true_selected_value]
            # 该属性取值的identify score
            identify_of_attribute_value = 0

            # identify function : I(true_for_slot_k, pred_for_slot_k)
            if true_attribute_values[attribute_name] == pred_attribute_values[attribute_name]:
                # 引入该属性值的权重
                identify_of_attribute_value = 1*weight_of_attribute_value
            else:
                identify_of_attribute_value = 0*weight_of_attribute_value  

            # 记录该表型每一个属性槽上的加权得分
            # identify_of_attribute_value += 1*weight_of_attribute_value 其实就是1和0
            # 属性取值的权重
            dict_of_weighted_accuracy_for_slots[attribute_name][1].append( weight_of_attribute_value )
            # 取值正确或错误的加权打分
            dict_of_weighted_accuracy_for_slots[attribute_name][0].append( identify_of_attribute_value )


In [336]:
len( list_of_weighted_accuracy_for_phenotypes ), len( list_of_unweighted_accuracy_for_phenotypes )

(1104, 1104)

In [337]:
# unweighted acc
sum( list_of_unweighted_accuracy_for_phenotypes )/len( list_of_unweighted_accuracy_for_phenotypes )

0.8859601449275284

In [338]:
# weighted acc
sum( list_of_weighted_accuracy_for_phenotypes )/len( list_of_weighted_accuracy_for_phenotypes )

0.7745842908604559

In [339]:
# per-slot weighted accuracy
for attribute_name in list_of_attributes:
    # dict_of_weighted_accuracy_for_slots[attribute_name]
    list_of_identity, list_of_weights = dict_of_weighted_accuracy_for_slots[attribute_name]
    print( attribute_name, '%.3f'%(sum(list_of_identity)/sum(list_of_weights)) )

Assertion 0.676
Severity 0.891
Temporal 0.759
Sensation 0.888
Color 0.794
Age 0.823
Gender 0.964
Frequency 0.765
Distribution 0.909
SOI 0.788
