## 字典树提取命名实体

### 定义工具

In [57]:
include("../src/readfiles.jl")
# include("../src/rectify.jl")
# include("../src/prefixtree.jl")
# include("../src/xlsx.jl")

导入成功，内容概要：
    | 变量名 | 说明 |
    | ---- | ---- |
    | zh_triples/en_triples | 三元组 |
    | zh_subs/en_subs | 头实体 |
    | zh_objs/en_objs | 尾实体 |
    | zh_rels/en_rels | 关系 |
    | zh_entity/en_entity | 实体（头和尾） |
    | ILLs | 英文 => 中文对齐 |
    | ILLs_zh_en | 中文 => 英文对齐|
    | train_data | 训练集 |
    | MT_en/zh(_rev) | 三元组翻译 |


### 初始化字典树

In [58]:
# 字典数据
en_words = unique!(vcat(en_subs, collect(keys(ILLs))))
en_words = [replace(word, '_'=>' ') for word in en_words]
zh_words = unique!(vcat(zh_subs, collect(values(ILLs))))
zh_words = [replace(word, '_'=>' ') for word in zh_words]
words = unique!(vcat(en_words, zh_words))
words = [replace(word, '_'=>' ') for word in words]

# 生成字典树
dict, dict_en, dict_zh = PrefixTree(), PrefixTree(), PrefixTree()
for word in words
    add_node!(dict, word)
end
for word in en_words
    add_node!(dict_en, word)
end
for word in zh_words
    add_node!(dict_zh, word)
end

### 抽取命名实体

In [59]:
# NER 的主要函数
# 抽取实体仅出现在头实体上
# 很可能是单向的，下一步 check 利于简化关系抽取
function get_subject(que)
    # 从所有头实体以及 ILLs 中匹配
    subs = remove_subcase(search_valid_word(dict, que))
    length(subs) == 1 && return subs
    # 剩下的大部分是中文问题，带英文关键字
    # 从英文实体以及 ILLs 英文实体中匹配
    subs = remove_subcase(search_valid_word(dict_en, que))
    length(subs) == 1 && return subs
    # 从中文实体以及 ILLs 中文实体中匹配
    subs = remove_subcase(search_valid_word(dict_zh, que))
    length(subs) == 1 && return subs
    return remove_subcase(search_valid_word(dict, que))
end
# 实体带有空格，用字典树进行处理，复杂度 O(实体长度*问题长度)

get_subject (generic function with 1 method)

#### 训练集

In [60]:
# 测试数据
txts = rstrip(read(open("NER_data/valid_ques.txt", "r"), String))
que_and_sub = Dict{String,String}(split(txt, '\t') for txt in split(txts,'\n'))
for (que, sub) in que_and_sub
    que_and_sub[que] = replace(sub, '_'=>' ')
end

LoadError: BoundsError: attempt to access 1-element Vector{SubString{String}} at index [2]

In [33]:
# 在训练集上测试，valids 为存在唯一解的情况，fails 为多解情况
valids, fails = Tuple{String, String}[], String[]
for (que, _) in que_and_sub
    subs = get_subject(que)
    length(subs) == 1 ? push!(valids, (que, first(subs))) : push!(fails, que)
end
println(length(valids))
count(que_and_sub[que] == sub for (que, sub) in valids)

13187


13185

In [7]:
# 检验唯一解情况的提取结果是否正确 
# 只有两个错误
for (que, sub) in valids
    que_and_sub[que] == sub || println(que)
end

Quel est le fabricant de Choses liées aux Microsoft Lumia 950 XL, un smartphone mobile Windows 10 développé par Microsoft?
黑莓有限公司推出的基于android的slider智能手机BlackBerry Priv的相关产品（作品）属于什么类型


In [18]:
que1 = "Quel est le fabricant de Choses liées aux Microsoft Lumia 950 XL, un smartphone mobile Windows 10 développé par Microsoft?"
println(que_and_sub[que1])
remove_subcase(search_valid_word(dict, que1))

Microsoft Lumia 950 XL


2-element Vector{String}:
 "Windows 10"
 "Microsoft Lumia 950 XL"

In [20]:
que2 = "黑莓有限公司推出的基于android的slider智能手机BlackBerry Priv的相关产品（作品）属于什么类型"
remove_subcase(search_valid_word(dict_zh, que2))
# triple_zh: BlackBerry_Priv	related	BlackBerry_Classic

2-element Vector{String}:
 "智能手机"
 "BlackBerry Priv"

In [12]:
get_subject("加拿大安大略省的一个政党安大略進步保守黨持有席位的议会所属的党派是什么")

2-element Vector{String}:
 "加拿大"
 "安大略進步保守黨"

In [35]:
fails

299-element Vector{String}:
 "日本皇室附属分支的成员華頂博信的父亲的出生地是哪里"
 "非洲南部的一个王国斯威士兰的最大城市的信息来源于谁"
 "受英国天体物理学家约瑟琳·贝尔·伯奈尔影响的人在哪里上班"
 "古埃及女王美麗奈茨的子女的前一任是谁"
 "加拿大安大略省的一个政党安大略進步保守黨持有席位的议会所属的党派是什么"
 "加拿大第23任总理；皮埃尔·特鲁多的长子賈斯汀·杜魯多的父母的父母是谁"
 "英国多塞特的一个城镇和平民教区比明斯特的郡东部是什么地方"
 "美国职业篮球队休斯敦火箭的教练是谁的上一任"
 "英国牛津郡白马谷的一个集镇和平民教区旺塔奇的郡西部是什么地方"
 "荷兰的一个省弗莱福兰省的最大城市的电话号码是哪个国家的"
 "古埃及帝国和迦南叛军之间的一场战斗米吉多战役的将领的爸爸是谁"
 "美国传教士羅啻的国籍所属的区域西北部是什么地方"
 "罗马皇后加拉·普拉西提阿的后裔是哪个帝国的朝代"
 ⋮
 "澳大利亚维多利亚州的一座城市謝珀頓的郡东南边的地方叫什么"
 "乌兹别克斯坦足球运动员法浩特·塔德吉耶夫的国家队的位置属于哪个区划"
 "英格兰林肯郡的一个集镇和平民教区斯利福德的郡西南边的地方叫什么"
 "西班牙的国家公园卡夫雷拉島國家公園的所属群岛的邮政编码类型是什么"
 "谁是捷克足球运动员伊万·哈谢克的出生国家的皇帝"
 "日本王子（1902-1953）秩父宮雍仁親王的父亲的信仰宗教是什么"
 "阿尔巴尼亚足球运动员鲁迪·瓦塔的国家队的举办地被谁租用"
 "新罗女王善德女王的父亲的配偶是哪位"
 "法国国王（1544-1560）弗朗索瓦二世 (法兰西)的母亲的后裔的上一代是什么"
 "越南统治者阮福淍的父亲的下一任是谁"
 "罗马帝国的皇后，君士坦丁大帝的母亲（250-330）聖海倫納的后裔属于哪个帝国"
 "丹麦独立视频游戏开发商和出版商Playdead所在国家有什么代表"

In [26]:
# fails 放在 multi_sols.txt
# 可行策略是取多个匹配中字符长度最大的一个

# 训练集准确率
13183/14077

0.9364921503161185

注记：
   - 加上多匹配 299 个问题，比率还能再提高
   - 加上错位的问题，比率还有进一步提升

#### 测试集

In [2]:
# 测试集结果
txts = rstrip(read(open("../data/valid_data.txt", "r"), String))
ques = String[strip(last(split(txt, '\t'))) for txt in split(txts, '\n')]
ques = rectify_que.(ques)

1500-element Vector{String}:
 "where is the constituency of th" ⋯ 34 bytes ⋯ ", a hong kong politician, from?"
 "who is the successor of the par" ⋯ 36 bytes ⋯ "Tavistock, an irish politician?"
 "who is the leader of the administrative region to which 热那亚总督 belongs?"
 "who preceded the parent of 富兰克林·德拉诺·罗斯福三世, an american economist?"
 "which draft team does the autho" ⋯ 80 bytes ⋯ "ariq and peter gunz, belong to?"
 "who is before the parent of 贾瓦哈" ⋯ 71 bytes ⋯ " minister of india (1889-1964)?"
 "where does the location provinc" ⋯ 21 bytes ⋯ " a mine in mongolia, belong to?"
 "who does the origin of Aryabhata (crater) influence's name?"
 "what is the genre of the author" ⋯ 33 bytes ⋯ "ng performed by carlos santana?"
 "which title does the parent of " ⋯ 44 bytes ⋯ " american economist, belong to?"
 "what is the parent moutain of the client of Ira Goldstein?"
 "who did a job before the parent of Francis Russell, Marquess of Tavistock?"
 "what is the type of the people," ⋯ 32 bytes ⋯ " n

In [12]:
methods(rectify_que)

In [13]:
# 统计
valids, fails = Tuple{String, String}[], String[]
for que in ques
    subs = get_subject(que)
    length(subs) == 1 ? push!(valids, (que, first(subs))) : push!(fails, que)
end
filter(i->isempty(get_subject(i)), fails)

4-element Vector{String}:
 "who is the father of G-related event related personsurjara-Pratihara?"
 "which national team does S's coacheattle Sounders FC 2 belong to?"
 "what is the club that S's head coacheattle Sounders FC 2 belongs to?"
 "which sport does 2's coach014–15 SC Freiburg season play?"

#### 机器翻译

In [20]:
valids, fails = Tuple{String, String}[], String[]
for que in ques
    subs = get_subject(que)
    !isempty(subs) ? push!(valids, (que, argmax(length, subs))) : push!(fails, que)
end
filter(i->isempty(get_subject(i)), fails)

4-element Vector{String}:
 "who is the father of G-related event related personsurjara-Pratihara?"
 "which national team does S's coacheattle Sounders FC 2 belong to?"
 "what is the club that S's head coacheattle Sounders FC 2 belongs to?"
 "which sport does 2's coach014–15 SC Freiburg season play?"

In [27]:
valids_ner = [replace(que, sub => "NER") for (que, sub) in valids]
french_ques = filter(isfrench, valids_ner)
chinese_ques = filter(!isfrench, valids_ner)

501-element Vector{String}:
 "whose ancestor is the father of" ⋯ 29 bytes ⋯ "o dynasty of korea (955-981),？?"
 "美国犹他州达吉特县的一个城镇NER南部的地点西南部的地点属于哪个区划"
 "和NER名称一样的人或事物属于什么类型"
 "与NER有相同名称的人、事、物对谁产生了影响"
 "NER的所在城市南部的地点的区划是什么"
 "NER的所在城市西北部的地点的所属区域是什么"
 "NER的选举获胜者的继任者是哪位"
 "NER的位置北部的地点的区划是什么"
 "NER的头衔的首任是谁的上一任"
 "NER的头衔的第一人在什么地方出生的"
 "美国犹他州顶峰县的一个城镇NER西南部的地点北部的地点的所属区域是什么"
 "NER的所在省份北部的地点所属的国家区域是哪里"
 "NER，1991年版的国际足联女足世界杯，its的举办地东北部的地点领导人的头衔是什么"
 ⋮
 "NER的总部地址南部的地点的所属区域是什么"
 "足球队的一个赛季NER的位置的上一代领导人的头衔有哪些"
 "NER的所属地点的行政中心的领导人叫什么"
 "美国犹他州桑皮特县的一个城镇NER南部的地点东部的地点的时区是什么"
 "1854年至1859年帕尔马和皮亚琴察的公爵NER的后裔的最后任有哪些后代"
 "NER的举办地南部的地点属于哪个时区"
 "美国犹他州米勒德县的一个城镇NER东南部的地点北部的地点属于哪个时区"
 "NER，一项体育赛事，其所在城市西部的城市是哪个区的"
 "NER的头衔的第一人的君王是谁"
 "NER北部的地点西部的地点属于哪个时区"
 "NER的举办地西部的地点的省会是哪里"
 "NER的举办地的继任者的上一代是什么"

In [33]:
save_xlsx("../data/translate/french_valid.xlsx", french_ques)

995×1 Matrix{String}:
 "where is the constituency of th" ⋯ 26 bytes ⋯ ", a hong kong politician, from?"
 "who is the successor of the parent of NER, an irish politician?"
 "who is the leader of the administrative region to which NER belongs?"
 "who preceded the parent of NER, an american economist?"
 "which draft team does the autho" ⋯ 60 bytes ⋯ "ariq and peter gunz, belong to?"
 "who is before the parent of NER" ⋯ 54 bytes ⋯ " minister of india (1889-1964)?"
 "where does the location province of NER, a mine in mongolia, belong to?"
 "who does the origin of NER influence's name?"
 "what is the genre of the author of NER, a song performed by carlos santana?"
 "which title does the parent of NER, an american economist, belong to?"
 "what is the parent moutain of the client of NER?"
 "who did a job before the parent of NER?"
 "what is the type of the people, things and things with the same name as NER?"
 ⋮
 "Savez-vous quel fuseau horaire " ⋯ 73 bytes ⋯ " community of catalonia, spain?"


In [34]:
save_xlsx("../data/translate/chinese_valid.xlsx", chinese_ques)

501×1 Matrix{String}:
 "whose ancestor is the father of" ⋯ 29 bytes ⋯ "o dynasty of korea (955-981),？?"
 "美国犹他州达吉特县的一个城镇NER南部的地点西南部的地点属于哪个区划"
 "和NER名称一样的人或事物属于什么类型"
 "与NER有相同名称的人、事、物对谁产生了影响"
 "NER的所在城市南部的地点的区划是什么"
 "NER的所在城市西北部的地点的所属区域是什么"
 "NER的选举获胜者的继任者是哪位"
 "NER的位置北部的地点的区划是什么"
 "NER的头衔的首任是谁的上一任"
 "NER的头衔的第一人在什么地方出生的"
 "美国犹他州顶峰县的一个城镇NER西南部的地点北部的地点的所属区域是什么"
 "NER的所在省份北部的地点所属的国家区域是哪里"
 "NER，1991年版的国际足联女足世界杯，its的举办地东北部的地点领导人的头衔是什么"
 ⋮
 "NER的总部地址南部的地点的所属区域是什么"
 "足球队的一个赛季NER的位置的上一代领导人的头衔有哪些"
 "NER的所属地点的行政中心的领导人叫什么"
 "美国犹他州桑皮特县的一个城镇NER南部的地点东部的地点的时区是什么"
 "1854年至1859年帕尔马和皮亚琴察的公爵NER的后裔的最后任有哪些后代"
 "NER的举办地南部的地点属于哪个时区"
 "美国犹他州米勒德县的一个城镇NER东南部的地点北部的地点属于哪个时区"
 "NER，一项体育赛事，其所在城市西部的城市是哪个区的"
 "NER的头衔的第一人的君王是谁"
 "NER北部的地点西部的地点属于哪个时区"
 "NER的举办地西部的地点的省会是哪里"
 "NER的举办地的继任者的上一代是什么"

In [39]:
lower(st) = replace(st, 
    "What "=>"what ", 
    "Which "=>"which ", 
    "Where " => "where ",
    "Do " => "do ",
    "In which " => "in which ",
    "Who " => "who ",
    "How " => "how "
)

lower (generic function with 1 method)

In [40]:
chinese_en_ques = lower.(read_xlsx("../data/translate/MT_chinese_valid.xlsx"))
french_en_ques = lower.(read_xlsx("../data/translate/MT_french_valid.xlsx"))

995-element Vector{String}:
 "where is the constituency of th" ⋯ 26 bytes ⋯ ", a hong kong politician, from?"
 "who is the successor of the parent of NER, an Irish politician?"
 "who is the leader of the administrative region to which NER belongs?"
 "who preceded the parent of NER, an american economist?"
 "which draft team does the autho" ⋯ 60 bytes ⋯ "ariq and peter gunz, belong to?"
 "who is before the parent of NER" ⋯ 54 bytes ⋯ " minister of india (1889-1964)?"
 "where does the location province of NER, a mine in mongolia, belong to?"
 "who does the origin of NER influence's name?"
 "what is the genre of the author of NER, a song performed by carlos santana?"
 "which title does the parent of NER, an american economist, belong to?"
 "what is the parent mountain of the client of NER?"
 "who did a job before the parent of NER?"
 "what is the type of the people, things and things with the same name as NER?"
 ⋮
 "do you know what time zone is i" ⋯ 66 bytes ⋯ " community of catalonia, s

In [41]:
open("../data/translate/valid.txt", "w") do io
    for que in vcat(chinese_en_ques, french_en_ques)
        println(io, que)
    end
end

In [42]:
valid_ques = vcat(chinese_en_ques, french_en_ques)

1496-element Vector{String}:
 "whose ancestor is the father of" ⋯ 30 bytes ⋯ " dynasty of korea (955-981),? ?"
 "A town in Dudgett County, Utah," ⋯ 75 bytes ⋯ "west, which division belongs to"
 "what type of person or thing with the same name as NER"
 "who is affected by the person, thing, or thing with the same name as NER?"
 "what is the zoning of the locat" ⋯ 19 bytes ⋯ " the city where NER is located?"
 "what is the area of the locatio" ⋯ 21 bytes ⋯ " the city where NER is located?"
 "who is the successor of NER's election winner?"
 "what is the zoning of the location in the north of NER?"
 "who is the last of the title of NER?"
 "where was the first person with the title of NER born?"
 "A town in Pinnacle County, Utah" ⋯ 62 bytes ⋯ " and the location in the north?"
 "what is the country region of t" ⋯ 39 bytes ⋯ " province where NER is located?"
 "NER, the 1991 edition of the FI" ⋯ 57 bytes ⋯ "er in the northeast of its host"
 ⋮
 "do you know what time zone is i" ⋯ 66 bytes ⋯ " co

In [47]:
txts = rstrip(read(open("../data/translate/train_data.txt", "r"), String))
ques = String[strip(first(split(txt, '\t'))) for txt in split(txts, '\n')]
ques = rectify_que.(ques)

13755-element Vector{String}:
 "which dynasty does the legislature of NER's country belong to?"
 "what is the nationality of the leader of NER?"
 "what is the location of NER? what constituency is the location in the west?"
 "who is the president of NER's alma mater"
 "NER, a Chinese politician (1913-2010) who nominated his title"
 "awhat division does the archipelago of the city NER in Cuba belong to?"
 "who is the agent of the person in charge of NER?"
 "which party did the Japanese politician NER play belong to?"
 "which country system is used fo" ⋯ 25 bytes ⋯ "hed by the location of the NER?"
 "what is the predecessor of the previous generation of the descendants of NER"
 "where is the capital (or provin" ⋯ 27 bytes ⋯ "ies and regions affected by NER"
 "what is the governing body of the leaders of NER?"
 "what is the name of the ocean to the northeast of NER's largest city?"
 ⋮
 "which sport does the tenant of the location of NER play?"
 "who is the monarch of the one that is after

In [50]:
lowercase.(ques)

13755-element Vector{String}:
 "which dynasty does the legislature of ner's country belong to?"
 "what is the nationality of the leader of ner?"
 "what is the location of ner? what constituency is the location in the west?"
 "who is the president of ner's alma mater"
 "ner, a chinese politician (1913-2010) who nominated his title"
 "awhat division does the archipelago of the city ner in cuba belong to?"
 "who is the agent of the person in charge of ner?"
 "which party did the japanese politician ner play belong to?"
 "which country system is used fo" ⋯ 25 bytes ⋯ "hed by the location of the ner?"
 "what is the predecessor of the previous generation of the descendants of ner"
 "where is the capital (or provin" ⋯ 27 bytes ⋯ "ies and regions affected by ner"
 "what is the governing body of the leaders of ner?"
 "what is the name of the ocean to the northeast of ner's largest city?"
 ⋮
 "which sport does the tenant of the location of ner play?"
 "who is the monarch of the one that is after

In [51]:
unique_ques = Set(lowercase.(ques))
count(in(unique_ques), lowercase.(valid_ques))

514