## 前缀树

### 工具

#### 定义对象和方法

In [1]:
"""
    PrefixTree(; isend=false, children=Dict{Char,PrefixTree}())

Node of a character-level prefix tree (trie).

# Fields
- `isend::Bool`: `true` when the path leading to this node spells a stored word.
- `children::Dict{Char,PrefixTree}`: child nodes, keyed by the next character.
"""
Base.@kwdef mutable struct PrefixTree
    isend::Bool = false
    # Concretely typed so that walking the tree through `children` is type-stable
    # (an untyped field is stored as `Any` and forces dynamic dispatch on every step).
    children::Dict{Char,PrefixTree} = Dict{Char,PrefixTree}()
end

"""
    add_node!(node::PrefixTree, word::AbstractString) -> Nothing

Insert `word` into the prefix tree rooted at `node`.

Missing nodes along the path of `word`'s characters are created, and the node reached
after the last character is flagged with `isend = true`. For an empty `word` the loop
does not run, so `node` itself is flagged.

# Arguments
- `node`: root of the tree to extend (modified in place).
- `word`: the word to store; any `AbstractString` (e.g. a `SubString`) is accepted.
"""
function add_node!(node::PrefixTree, word::AbstractString)::Nothing
    for c in word
        # Fetch the child for `c`, creating it on first use, with a single hash lookup.
        node = get!(node.children, c) do
            PrefixTree()
        end
    end
    node.isend = true
    return nothing
end

"""
    search_valid_word(node::PrefixTree, word::AbstractString) -> Vector{String}

Return every substring of `word` that is stored in the prefix tree rooted at `node`.

Start positions are scanned from left to right, and for each start the matches are
reported in order of increasing length. Nested and overlapping matches are all kept,
and a stored word that occurs several times in `word` is reported once per occurrence.

# Arguments
- `node`: root of the prefix tree holding the dictionary.
- `word`: the text to scan; any `AbstractString` is accepted.
"""
function search_valid_word(node::PrefixTree, word::AbstractString)
    # Index by character position: a character vector gives O(1) access, which
    # byte-indexed strings do not for non-ASCII text.
    chars = collect(word)
    res = String[]
    for i in eachindex(chars)
        # Walk the tree along chars[i:end] for as long as a path exists.
        cur = node
        for j in i:lastindex(chars)
            next = get(cur.children, chars[j], nothing)
            next === nothing && break  # no stored word continues with this character
            cur = next
            # A view avoids copying the slice before it is joined into a String.
            cur.isend && push!(res, join(view(chars, i:j)))
        end
    end
    return res
end

search_valid_word

In [2]:
"""
    remove_subcase(subs) -> Vector{String}

Return the entries of `subs` that are not contained in any other entry.

The entries are ordered by length; an entry is dropped when it occurs inside an entry
that comes later in that order. Identical entries therefore collapse to a single one.
The result is ordered by increasing length. `subs` itself is left untouched.

# Arguments
- `subs`: vector of candidate strings.
"""
function remove_subcase(subs)
    # Sort a copy: the function name carries no `!`, so callers do not expect their
    # vector to be reordered.
    sorted = sort(subs; by=length)
    n = length(sorted)
    res = String[]
    for (i, sub) in enumerate(sorted)
        # Only entries at least as long as `sub` (i.e. later ones) can contain it.
        any(j -> occursin(sub, sorted[j]), (i + 1):n) || push!(res, sub)
    end
    return res
end

remove_subcase

#### 测试

In [3]:
# Small hand-written vocabulary used to try out the prefix tree.
words = [
    "伊格尔·茨维塔诺维奇",
    "克罗地亚",
    "伊万·哈谢克",
    "伊莎貝拉_(帕爾馬郡主)",
    "克里斯·麦克尼尔利",
    "克罗地亚王国",
    "克罗泽群岛",
    "伊格内修斯·库图·阿昌庞",
];
# Store every vocabulary entry in a fresh tree.
dict_zh = PrefixTree()
foreach(entry -> add_node!(dict_zh, entry), words)

In [4]:
# Sample question; list every vocabulary entry from `dict_zh` that occurs in it.
question = "克罗地亚足球运动员伊格尔·茨维塔诺维奇的出生国家西南边的地方叫什么"
search_valid_word(dict_zh, question)

2-element Vector{String}:
 "克罗地亚"
 "伊格尔·茨维塔诺维奇"

### 读取数据

In [5]:
;cd ../data

/home/rex/work_space/7 others/ccks/CCKS-mKGQA/data


In [6]:
# Chinese triples (tab-separated, one per line), their distinct subjects and relations.
# `read(path, String)` opens and closes the file itself, so no handle is left open.
txts = rstrip(read("extract/triple_zh.txt", String))
zh_triples = [NTuple{3,String}(split(txt, '\t')) for txt in split(txts, '\n')]
zh_subs = unique(first.(zh_triples))
zh_rels = unique(triple[2] for triple in zh_triples)

# English triples, their distinct subjects and relations.
txts = rstrip(read("extract/triple_en.txt", String))
en_triples = [NTuple{3,String}(split(txt, '\t')) for txt in split(txts, '\n')]
en_subs = unique(first.(en_triples))
en_rels = unique(triple[2] for triple in en_triples)

# ILLs alignment file: one tab-separated pair per line (first column => second column).
# NOTE(review): which column holds which language is not visible here — confirm.
txts = rstrip(read("extract/ILLs(zh-en).txt", String))
ILLs = Dict{String,String}(split(txt, '\t') for txt in split(txts, '\n'))

# Labelled questions: question => subject, with underscores in the subject turned
# into spaces.
txts = rstrip(read("NER_data/valid_ques.txt", String))
que_and_sub = Dict{String,String}(split(txt, '\t') for txt in split(txts, '\n'))
# Iterate over a snapshot of the keys so the dict is not assigned to while it is
# being iterated.
for que in collect(keys(que_and_sub))
    que_and_sub[que] = replace(que_and_sub[que], '_' => ' ')
end

### 字典树初始化及测试

In [71]:
# Vocabulary for the tries. Underscores are normalised to spaces *before*
# de-duplicating, so that names differing only in `_` vs. space collapse to one entry.
# NOTE(review): the English list takes the keys of `ILLs` and the Chinese list its
# values; the alignment file is named `ILLs(zh-en)` — confirm the column order.
en_words = [replace(word, '_' => ' ') for word in vcat(en_subs, collect(keys(ILLs)))]
unique!(en_words)
zh_words = [replace(word, '_' => ' ') for word in vcat(zh_subs, collect(values(ILLs)))]
unique!(zh_words)
# Both lists are already normalised, so the combined list only needs de-duplicating.
words = unique!(vcat(en_words, zh_words))

137468-element Vector{String}:
 "Government of the Republic of China in Guangzhou"
 "Zhang Xiaoya"
 "Random Thoughts (Faye Wong album)"
 "Republic of China presidential election, 1913"
 "French frigate Descartes"
 "French submarine Archimède"
 "Charles Simonyi"
 "HSwMS Thordön"
 "Iraqi parliamentary election, 1984"
 "Silulu A'etonu"
 "Zhou dynasty (690–705)"
 "Lakas–CMD"
 "USRC James Madison (1807)"
 ⋮
 "鲁昂"
 "任天堂"
 "靜岡市"
 "国防部 (以色列)"
 "道格拉斯飞行器公司"
 "奥斯陆"
 "馬頭角"
 "費爾菲爾德 (康乃狄克州)"
 "加兹温"
 "普爾城足球會"
 "孙文盛"
 "视频文件格式"

In [73]:
# Prefix tree over the combined vocabulary.
dict = PrefixTree()
foreach(entry -> add_node!(dict, entry), words)

# Prefix tree over the English vocabulary only.
dict_en = PrefixTree()
foreach(entry -> add_node!(dict_en, entry), en_words)

# Prefix tree over the Chinese vocabulary only.
dict_zh = PrefixTree()
foreach(entry -> add_node!(dict_zh, entry), zh_words)

In [83]:
# "出生" (birth) comes from a tail entity, e.g. the triple:
# 馬紹爾城	settlementType	出生
# Find all vocabulary entries in the question, then drop those contained in another.
question = "克罗地亚足球运动员伊格尔·茨维塔诺维奇的出生国家西南边的地方叫什么"
subs = search_valid_word(dict, question)
remove_subcase(subs)

2-element Vector{String}:
 "克罗地亚"
 "伊格尔·茨维塔诺维奇"

In [82]:
# English question that mentions a Chinese entity name; match against the combined tree
# and drop matches contained in another match.
subs = search_valid_word(dict, "which format does the cause of 夜櫻作戰, a japanese biological warfare plan during world war ii, belong to?")
remove_subcase(subs)

1-element Vector{String}:
 "夜櫻作戰"

In [81]:
# French question that mentions an English entity name; match against the combined tree
# and drop matches contained in another match.
question = "Savez-vous qui est l’héritier de Le chef de Second Czechoslovak Republic, une république de 1938-1939 en europe centrale/orientale?"
subs = search_valid_word(dict, question)
remove_subcase(subs)

1-element Vector{String}:
 "Second Czechoslovak Republic"

### 实战

In [84]:
"""
    get_subject(que) -> Vector{String}

Extract candidate subject entities from the question `que`.

The question is matched against the global tries `dict` (combined vocabulary),
`dict_en` and `dict_zh`, in that order, each time discarding matches that are contained
in another match. The first result holding exactly one candidate is returned. If none
of the three is unique, the candidates from the combined tree are returned (possibly
none, possibly several).
"""
function get_subject(que)
    # Keep the combined-tree result: it is also the fallback, so it is computed once.
    combined = remove_subcase(search_valid_word(dict, que))
    length(combined) == 1 && return combined
    for tree in (dict_en, dict_zh)
        subs = remove_subcase(search_valid_word(tree, que))
        length(subs) == 1 && return subs
    end
    return combined
end

get_subject (generic function with 1 method)

In [86]:
## Questions for which exactly one candidate subject is found
valids = Tuple{String, String}[]
fails = String[]
for que in keys(que_and_sub)
    subs = get_subject(que)
    if length(subs) == 1
        push!(valids, (que, first(subs)))
    else
        push!(fails, que)
    end
end
# Number of unique candidates that agree with the labelled subject.
count(((que, sub),) -> que_and_sub[que] == sub, valids)

13185

In [56]:
## For the remaining questions, look for a unique candidate among the English entities
en_valids = Tuple{String, String}[]
en_fails = String[]
for que in fails
    subs = remove_subcase(search_valid_word(dict_en, que))
    if length(subs) == 1
        push!(en_valids, (que, first(subs)))
    else
        push!(en_fails, que)
    end
end

In [67]:
# `cor` is initialised here but never updated in this cell.
cor = 0
# Print every question whose extracted subject differs from the labelled one.
for (que, sub) in en_valids
    if que_and_sub[que] != sub
        println(que)
    end
end

黑莓有限公司推出的基于android的slider智能手机BlackBerry Priv的相关产品（作品）属于什么类型


In [66]:
# Inspect all combined-tree matches for the question printed as a mismatch above.
search_valid_word(dict, "黑莓有限公司推出的基于android的slider智能手机BlackBerry Priv的相关产品（作品）属于什么类型")

3-element Vector{String}:
 "智能手机"
 "BlackBerry"
 "BlackBerry Priv"

In [70]:
# Dump the questions without a unique English candidate, one per line.
open("NER_data/multi_en_sols.txt", "w") do io
    foreach(que -> println(io, que), en_fails)
end

In [41]:
# Check whether the sample question ended up among the questions without a unique candidate.
"克罗地亚足球运动员伊格尔·茨维塔诺维奇的出生国家西南边的地方叫什么" in fails

false

In [43]:
# Question mixing Chinese text with a Latin-script name; list all combined-tree matches.
que = "土耳其政治家Veli Ağbaba的领导人的首相是谁"
search_valid_word(dict, que)

2-element Vector{String}:
 "土耳其"
 "Veli Ağbaba"

In [44]:
# Dump the questions without a unique candidate, one per line.
open("NER_data/multi_sol.txt", "w") do io
    foreach(que -> println(io, que), fails)
end

In [49]:
# `maximum` on a vector of strings returns the greatest one under string ordering,
# not the longest match.
# NOTE(review): if the longest matched entity was intended, a length-based selection
# is needed instead — confirm the intent.
maximum(search_valid_word(dict, "2010 FIFA World Cup qualification – AFC Fourth Round，一项国际足球比赛，其举办地的继任者的继任者是哪位"))

"国际"