## 字典树提取命名实体

### 定义工具

In [1]:
"前缀树"
Base.@kwdef mutable struct PrefixTree
    # True when the path from the root to this node spells a complete dictionary word.
    isend::Bool = false
    # Child nodes keyed by the next character. The field is concretely typed so that
    # lookups are not performed on an `Any`-typed value.
    children::Dict{Char,PrefixTree} = Dict{Char,PrefixTree}()
end

"给前缀树增加单词"
function add_node!(node::PrefixTree, word::AbstractString)::Nothing
    # Insert `word` into the tree rooted at `node`, creating any missing child nodes,
    # and mark the node reached by the last character as the end of a word.
    # Any `AbstractString` is accepted, so `SubString`s produced by `split` work too.
    current = node
    for c in word
        # `get!` fetches the child or creates and stores it, with a single lookup.
        current = get!(current.children, c) do
            PrefixTree()
        end
    end
    current.isend = true
    return nothing
end

"在字符串里匹配字典单词"
function search_valid_word(node::PrefixTree, word::AbstractString)
    # Return, as a `Vector{String}`, every substring of `word` that is stored in the
    # tree rooted at `node`. Matches are listed by start position, then by length.
    # The characters are collected once so positions can be indexed directly; the
    # argument itself is not rebound to a different type.
    chars = collect(word)
    matches = String[]
    for start in eachindex(chars)
        # Walk the tree along chars[start:end].
        current = node
        for stop in start:lastindex(chars)
            child = get(current.children, chars[stop], nothing)
            child === nothing && break  # no path continues with this character
            current = child
            current.isend && push!(matches, join(view(chars, start:stop)))
        end
    end
    return matches
end

"删除子串"
function remove_subcase(subs)
    # Drop every candidate that occurs inside another candidate and return the
    # survivors as a `Vector{String}` ordered by increasing length.
    # A sorted copy is used so the caller's collection is left untouched.
    ordered = sort(subs; by=length)
    kept = String[]
    for (i, sub) in enumerate(ordered)
        # After sorting, only later (equal-length or longer) entries can contain `sub`.
        # An exact duplicate counts as containing it, so only its last copy survives.
        later = @view ordered[(i + 1):end]
        any(other -> occursin(sub, other), later) || push!(kept, sub)
    end
    return kept
end

remove_subcase

### 初始化字典树

In [3]:
;cd ../data

/home/rex/work_space/7 others/ccks/CCKS-mKGQA/data


In [5]:
# Head entities of the Chinese triples: the first tab-separated field of each line.
# `read(path, String)` opens and closes the file itself, so no handle is left open.
txts = rstrip(read("extract/triple_zh.txt", String))
zh_subs = unique!([first(split(txt, '\t')) for txt in split(txts, '\n')])

# Head entities of the English triples, extracted the same way.
txts = rstrip(read("extract/triple_en.txt", String))
en_subs = unique!([first(split(txt, '\t')) for txt in split(txts, '\n')])

# ILLs alignment file: each line is a tab-separated pair stored as key => value.
txts = rstrip(read("extract/ILLs(zh-en).txt", String))
ILLs = Dict{String,String}(split(txt, '\t') for txt in split(txts, '\n'))

# Labelled questions: each line is a tab-separated pair stored as question => entity.
txts = rstrip(read("NER_data/valid_ques.txt", String))
que_and_sub = Dict{String,String}(split(txt, '\t') for txt in split(txts, '\n'))
# Replace underscores in the labels with spaces, as is done for the dictionary words.
for (que, sub) in que_and_sub
    que_and_sub[que] = replace(sub, '_' => ' ')
end

In [7]:
# Dictionary vocabularies. Underscores are replaced with spaces *before* deduplicating,
# so variants that differ only in '_' versus ' ' collapse into a single entry.
en_words = unique!(String[
    replace(word, '_' => ' ') for word in vcat(en_subs, collect(keys(ILLs)))
])
zh_words = unique!(String[
    replace(word, '_' => ' ') for word in vcat(zh_subs, collect(values(ILLs)))
])
# Both lists are already normalised, so the combined list only needs deduplicating.
words = unique!(vcat(en_words, zh_words))

# Build one prefix tree per vocabulary: combined, English-only and Chinese-only.
dict, dict_en, dict_zh = PrefixTree(), PrefixTree(), PrefixTree()
for (tree, vocab) in ((dict, words), (dict_en, en_words), (dict_zh, zh_words))
    for word in vocab
        add_node!(tree, word)
    end
end

### 抽取命名实体

In [8]:
function get_subject(que)
    # Extract entity candidates from the question `que`.
    # The combined dictionary is tried first, then the English-only and Chinese-only
    # dictionaries; the first search that leaves exactly one candidate wins.
    # If none does, the combined-dictionary candidates are returned, which may be
    # empty or hold several entries.
    combined = remove_subcase(search_valid_word(dict, que))
    length(combined) == 1 && return combined
    for tree in (dict_en, dict_zh)
        subs = remove_subcase(search_valid_word(tree, que))
        length(subs) == 1 && return subs
    end
    # Reuse the first result instead of searching the combined dictionary again.
    return combined
end

get_subject (generic function with 1 method)

In [27]:
# Evaluate on the labelled questions: keep the questions that yield exactly one
# candidate in `valids`, and collect all the others in `fails`.
valids = Tuple{String,String}[]
fails = String[]
for que in keys(que_and_sub)
    subs = get_subject(que)
    if length(subs) == 1
        push!(valids, (que, first(subs)))
    else
        push!(fails, que)
    end
end
# Number of single-candidate extractions that equal the labelled entity.
count(((que, sub),) -> que_and_sub[que] == sub, valids)

13185

In [28]:
# Print every question whose single extracted entity differs from its label.
for (que, sub) in valids
    if que_and_sub[que] != sub
        println(que)
    end
end

Quel est le fabricant de Choses liées aux Microsoft Lumia 950 XL, un smartphone mobile Windows 10 développé par Microsoft?
黑莓有限公司推出的基于android的slider智能手机BlackBerry Priv的相关产品（作品）属于什么类型


In [29]:
# Inspect one of the mismatching questions printed above: list every dictionary word
# that the combined prefix tree finds in it, before sub-string removal.
que = "Quel est le fabricant de Choses liées aux Microsoft Lumia 950 XL, un smartphone mobile Windows 10 développé par Microsoft?"
search_valid_word(dict, que)

6-element Vector{String}:
 "Microsoft"
 "Microsoft Lumia"
 "Microsoft Lumia 950"
 "Microsoft Lumia 950 XL"
 "Windows 10"
 "Microsoft"

In [25]:
# 训练集结果
13187/14077

0.9367763017688427

In [31]:
# Evaluation questions: the last tab-separated field of each line of the file.
# `read(path, String)` closes the file once its contents have been read.
txts = read("valid_data.txt", String)
ques = String[rstrip(last(split(txt, '\t'))) for txt in split(rstrip(txts), '\n')]

1500-element Vector{String}:
 "where is the constituency of the one who is alongside Felix Chung, a hong kong politician, from?"
 "who is the successor of the parent of Francis Russell, Marquess of Tavistock, an irish politician?"
 "who is the leader of the administrative region to which 热那亚总督 belongs?"
 "who preceded the parent of 富兰克林·德拉诺·罗斯福三世, an american economist?"
 "which draft team does the author of The Way It's Goin' Down, a 1998 single by shaquille o'neal, dj quik, lord tariq and peter gunz, belong to?"
 "who is before the parent of 贾瓦哈拉尔·尼赫鲁, an indian lawyer, statesman, and writer, first prime minister of india (1889-1964)?"
 "where does the location province of Khuut coal mine, a mine in mongolia, belong to?"
 "who does the origin of A's nameryabhata (crater) influence?"
 "what is the genre of the author of They All Went to Mexico, a song performed by carlos santana?"
 "which title does the parent of 富兰克林·德拉诺·罗斯福三世, an american economist, belong to?"
 "what is the parent 

In [32]:
# Run the extractor over the evaluation questions: questions with exactly one
# candidate go to `valids` together with that candidate, the rest go to `fails`.
valids = Tuple{String,String}[]
fails = String[]
for que in ques
    subs = get_subject(que)
    if length(subs) == 1
        push!(valids, (que, first(subs)))
    else
        push!(fails, que)
    end
end

In [33]:
length(valids)

1463

In [24]:
# Share of evaluation questions that produced exactly one candidate; the denominator
# is taken from the data instead of a hard-coded question count.
length(valids) / length(ques)

0.9753333333333334