## 路径还原

注：存在 86 个重复三元组，提交不知道会不会遇到这些情形
- 比如假设 triple1 既为 en 也为 zh，那么
- en -> zh -> en 和 zh -> zh -> en 都将作为答案

In [2]:
include("../src/CCKS-mKGQA.jl")
include("../src/extractdata.jl")
include("../src/translatedata.jl")
include("../src/ettalign.jl")
include("../src/distance.jl")
include("../src/submitpath.jl")
mkpath("submit")

"submit"

### 初始化

In [47]:
# Use the first result.
filepath = "ILLs_1"
ILLs1 = readtuples("EA_data/$filepath/ETT_Pairs.txt"; size=2)
newILLs = ILLs1  # same object as ILLs1, so the in-place filter below affects both names
# Drop every pair whose first entry is "Gregoire_Kayibanda".
filter!(pair -> pair[1] != "Gregoire_Kayibanda", newILLs);

In [55]:
# Use the second result.
filepath = "ILLs_2"
ILLs2 = readtuples("EA_data/$filepath/ILLs_predict-7.6.txt"; size=2)
newILLs = ILLs2  # same object as ILLs2, so the in-place filter below affects both names
# Drop every pair whose first entry is "Gregoire_Kayibanda".
filter!(pair -> pair[1] != "Gregoire_Kayibanda", newILLs);

In [17]:
# Use the third result.
filepath = "ILLs_3"
ILLs3 = readtuples("EA_data/$filepath/ILLs_predict_zh_en.txt"; size=2)
newILLs = ILLs3  # same object as ILLs3, so the in-place filter below affects both names
# Keep only the pairs whose two entries both belong to `wordset`.
filter!(pair -> in(pair[1], wordset) && in(pair[2], wordset), newILLs);

In [2]:
# Use the official solution: `newILLs` is bound to the already loaded `ILLs`.
# NOTE(review): unlike the "ILLs_1"/"ILLs_2"/"ILLs_3" cells, this `filepath` ends with a slash.
filepath = "ILLs_official/"
newILLs = ILLs;

In [3]:
# Use wiki_ILLs: `newILLs` is bound to the already loaded `wiki_ILLs`.
# NOTE(review): unlike the "ILLs_1"/"ILLs_2"/"ILLs_3" cells, this `filepath` ends with a slash.
filepath = "ILLs_wiki/"
newILLs = wiki_ILLs;

### 翻译和构建

In [4]:
# Word-level translation of the ILLs: every element of every pair is looked up in `dict_words`.
mt_newILLs = [getindex.(Ref(dict_words), pair) for pair in newILLs]
# De-duplicate the pairs in place, then drop pairs whose two sides are equal after translation.
filter!(i->i[1]!=i[2], unique!(mt_newILLs))
dict_ILLs = ill2dict(mt_newILLs)
# Map the last field of each `mt_valid_ques_ner` entry through `dict_ILLs`;
# values without an entry are kept unchanged.
ill_valid_ners = [get(dict_ILLs, ner, ner) for ner in last.(mt_valid_ques_ner)]

# Triple-level translation through the ILLs.
# Single-argument form: splat a triple into the three-argument method.
triple_byills(triple) = triple_byills(triple...)
# Subject and object are replaced by their `dict_ILLs` entry when one exists; the relation is kept.
triple_byills(sub, rel, obj) = (get(dict_ILLs, sub, sub), rel, get(dict_ILLs, obj, obj))
# Raw triple -> `triple_byMT` -> `triple_byills`.
triple_raw2ill(triple...) = triple_byills(triple_byMT(triple...))
ill_triples = triple_raw2ill.(triples)

# Build the edge set: subject => list of (relation, object).
edges = DefaultDict{String, Vector{Tuple}}(Vector{Tuple})
for (sub, rel, obj) in ill_triples
    push!(edges[sub], (rel, obj))
end

# Reverse lookup for triples.
# `f` skips the first element of a raw triple (the "zh"/"en" tag added below) before translating.
f(raw_triple) = String.(triple_raw2ill(raw_triple[2:end]))
# presumably `tuplejoin` prepends the language tag to each triple — TODO confirm against its definition
raw_triples = vcat(tuplejoin.(Ref(("zh",)), zh_triples), tuplejoin.(Ref(("en",)), en_triples))
dict_triples_rev = triple2dict(f, raw_triples)
# The printed label reads "information about duplicated triples"; the next line tallies the
# lengths of the values stored in `dict_triples_rev`.
println("重复元组的信息")
sort_count(length.(values(dict_triples_rev)))

重复元组的信息


OrderedDict{Int64, Int64} with 4 entries:
  1 => 255733
  2 => 5823
  3 => 30
  4 => 4

### 精确匹配

In [49]:
submit_id = 17
# One prediction per line, tab separated: field 1 is the question, fields 3:end the relations.
# `read(path, String)` opens and closes the file itself; the earlier
# `read(open(path, "r"), String)` left the handle it opened unclosed.
sols = split.(strip.(split(strip(read("predict/predict_data_2.txt", String)), '\n')), '\t')
# Placeholder path written when no answer is found.
noans = "<http://dbpedia.org/resource/Francis_Russell,_Marquess_of_Tavistock>#<http://dbpedia.org/property/parents>#<http://dbpedia.org/resource/John_Russell,_4th_Duke_of_Bedford>#<http://zh.dbpedia.org/resource/第四代贝德福德公爵约翰·罗素>#<http://zh.dbpedia.org/property/successor>#<http://zh.dbpedia.org/resource/丹尼尔·芬奇，第八代温奇尔西伯爵>";

fails = String[]  # questions for which `precise_submit` returned nothing
notgood = []      # (index, raw NER, translated NER, paths) where no path contains the raw NER
open("submit/submit_$(submit_id).txt", "w") do io
    println(io, "id\tans_path")
    for (i, sol) in enumerate(sols)
        que, rels = sol[1], sol[3:end]
        ner, rawner = ill_valid_ners[i], valid_ques_ner[i][2]
        res = precise_submit(ner, rels, edges, dict_triples_rev)
        if isempty(res)
            push!(fails, que)
            println(io, i-1, '\t', noans)
        else
            # Prefer the paths that contain the raw NER; otherwise keep all of them.
            betterres = filter(str->occursin(rawner, str), res)
            isempty(betterres) ? push!(notgood, (i, rawner, ner, res)) : (res = betterres)
            # Ids in the submission file are zero-based.
            println(io, i-1, '\t', first(res))
        end
    end
end
length(fails), length(notgood)

(667, 1)

### 模糊匹配

In [7]:
submit_id = "28"
# One prediction per line, tab separated: field 1 is the question, fields 3:end the relations.
# `read(path, String)` opens and closes the file itself; the earlier
# `read(open(path, "r"), String)` left the handle it opened unclosed.
sols = split.(strip.(split(strip(read("predict/predict_data_0708.txt", String)), '\n')), '\t')
# Placeholder path written when no answer is found.
noans = "<http://dbpedia.org/resource/Francis_Russell,_Marquess_of_Tavistock>#<http://dbpedia.org/property/parents>#<http://dbpedia.org/resource/John_Russell,_4th_Duke_of_Bedford>#<http://zh.dbpedia.org/resource/第四代贝德福德公爵约翰·罗素>#<http://zh.dbpedia.org/property/successor>#<http://zh.dbpedia.org/resource/丹尼尔·芬奇，第八代温奇尔西伯爵>";

fails = []        # [question, relations] for which `precise_submit` returned nothing
notgood = []      # (index, raw NER, translated NER, paths) where no path contains the raw NER
none = String[]   # questions for which `vague_submit` found nothing either
open("submit/submit_$(submit_id).txt", "w") do io
    println(io, "id\tans_path")
    for (i, sol) in enumerate(sols)
        que, rels = sol[1], sol[3:end]
        ner, rawner = ill_valid_ners[i], valid_ques_ner[i][2]
        res = precise_submit(ner, rels, edges, dict_triples_rev)
        if isempty(res) # no exact answer
            push!(fails, [que, rels])
            # Search for a fuzzy path, trying `best` = 1, 2, …, 8 and stopping at the
            # first value that yields a result.
            for best in 1:8
                res = vague_submit(ner, rels, ill_triples, dict_triples_rev; best=best)
                isempty(res) || break
            end
            isempty(res) && (push!(none, que); res = [noans])
        else
            # Prefer the paths that contain the raw NER; otherwise keep all of them.
            betterres = filter(str->occursin(rawner, str), res)
            isempty(betterres) ? push!(notgood, (i, rawner, ner, res)) : (res = betterres)
        end
        # Ids in the submission file are zero-based.
        println(io, i-1, '\t', first(res))
    end
end
# Columns: no exact answer / solved by the fuzzy search / no answer at all.
println("无解\t路径推理可解\t完全无解")
println([length(fails), length(fails)-length(none), length(none)])

[187, 34, 153]


提交结果分析

 | ILLs | 准确率 | 完全无解 | 使用路径推理 | 缺二跳(验证集/图谱) | 缺三跳(验证集/图谱) | 
 | ---- | ----- | ----- | ----- | ------ | ---- |
 | 官方 | 0.3513 |573 | 323 | 156/52268 | 352/67437 | 
 | ILLs2 | | 551 | 221 | 98/47455 | 263/61439 | 
 | ILLs1 | 0.374 | 543 | 210 | 80/44396 | 209/57285 |
 | ILLs3 | 0.3833 | 535 | 183 | 61/37283 | 137/48256 |
 | wiki | 0.422 | 478 | 134 | 6/38070 | 103/49921 |
 | ill3 模型 + wiki| 0.692 | 175 | 12 | - | - |
 | ill3 模型 + ill3|  | 301 | 162 | - | - |


In [9]:
fails

187-element Vector{Any}:
 Any["where is the constituency of the one who is alongside NER, a hong kong politician, from?", SubString{String}["alongside", "honorificsuffix"]]
 Any["who starred in the sponsor of NER, a wwe's 2012 night of champions pay-per-views?", SubString{String}["sponsor", "format"]]
 Any["who is the regnet of the children of NER, a british politician (1836-1914)?", SubString{String}["children", "primeminister"]]
 Any["what is the title of the successor of NER?", SubString{String}["issue"]]
 Any["what is the predecessor of the combatant of NER?", SubString{String}["combatant", "headquarters"]]
 Any["who takes the place of the children of NER, a british politician (1836-1914)?", SubString{String}["children", "successor"]]
 Any["what is the job of the singer who participated in NER?", SubString{String}["extra", "origin", "legislature"]]
 Any["which label does NER belong to related events?", SubString{String}["event", "label"]]
 Any["what is the religion of the man who i

### 测试对齐

In [7]:
include("../src/regexpr.jl")
"""
    get_rel(link)

Return the `dict_rels` entry for the relation text extracted from `link`.

The text is extracted with `zh_rel_reg` first; when that extraction is empty, the
`en_rel_reg` extraction is used as the key instead.
"""
function get_rel(link)
    key = get_txt(zh_rel_reg, link)
    if isempty(key)
        key = get_txt(en_rel_reg, link)
    end
    return dict_rels[key]
end

get_rel (generic function with 1 method)

In [8]:
# Skip the header line and keep the last tab-separated field of every row.
# `read(path, String)` opens and closes the file itself; the earlier
# `read(open(path, "r"), String)` left the handle it opened unclosed.
sols = last.(split.(split(strip(read("submit_11.txt", String)), '\n')[2:end], '\t'))
# Split each kept field on '#' and take entries 2, 5, 8, … of the result.
sols = [sol[2:3:end] for sol in split.(sols, '#')]
# Convert every selected entry with `get_rel`.
sols = [get_rel.(sol) for sol in sols];

In [61]:
submit_id = 21
# Placeholder path written when no answer is found.
noans = "<http://dbpedia.org/resource/Francis_Russell,_Marquess_of_Tavistock>#<http://dbpedia.org/property/parents>#<http://dbpedia.org/resource/John_Russell,_4th_Duke_of_Bedford>#<http://zh.dbpedia.org/resource/第四代贝德福德公爵约翰·罗素>#<http://zh.dbpedia.org/property/successor>#<http://zh.dbpedia.org/resource/丹尼尔·芬奇，第八代温奇尔西伯爵>";

fails = String[]   # questions without an exact path
notgood = []       # (index, raw NER, translated NER, paths) where no path contains the raw NER
none = String[]    # questions for which the fuzzy search found nothing either
open("submit/submit_$(submit_id).txt", "w") do io
    println(io, "id\tans_path")
    for (idx, rels) in enumerate(sols)
        que = mt_valid_ques[idx]
        ner = ill_valid_ners[idx]
        rawner = valid_ques_ner[idx][2]
        paths = precise_submit(ner, rels, edges, dict_triples_rev)
        if !isempty(paths)
            # Prefer the paths that contain the raw NER; otherwise keep all of them.
            preferred = filter(path -> occursin(rawner, path), paths)
            if isempty(preferred)
                push!(notgood, (idx, rawner, ner, paths))
            else
                paths = preferred
            end
        else
            # No exact answer: fall back to the fuzzy path search.
            push!(fails, que)
            paths = vague_submit(ner, rels, ill_triples, dict_triples_rev)
            if isempty(paths)
                push!(none, que)
                paths = [noans]
            end
        end
        # Ids in the submission file are zero-based.
        println(io, idx - 1, '\t', first(paths))
    end
end
length(fails), length(notgood), length(none)

(718, 0, 535)

### 对齐程度分析

In [None]:
# ILLs 3
# 535 完全无解 183 路径推理
# 训练集
# 缺二跳 37283 缺三跳 48256
# 验证集
# 缺二跳 61 缺三跳 137

In [16]:
# wiki ILLs
# 478 完全无解 134 路径推理
# 训练集
# 缺二跳 38070 缺三跳 49921
# 验证集
# 缺二跳 6 缺三跳 103

### 关系抽取

In [106]:
# Call `top10` on every entry of `mt_valid_ques`.
# NOTE(review): `nears` is reassigned on every iteration and never read inside the loop.
for que in mt_valid_ques
    nears = top10(que)
end

1500-element Vector{String}:
 "where is the constituency of th" ⋯ 26 bytes ⋯ ", a hong kong politician, from?"
 "who is the successor of the parent of NER, an Irish politician?"
 "who is the leader of the administrative region to which NER belongs?"
 "who preceded the parent of NER, an american economist?"
 "which draft team does the autho" ⋯ 60 bytes ⋯ "ariq and peter gunz, belong to?"
 "who is before the parent of NER" ⋯ 54 bytes ⋯ " minister of india (1889-1964)?"
 "where does the location province of NER, a mine in mongolia, belong to?"
 "who does the origin of NER influence's name?"
 "what is the genre of the author of NER, a song performed by carlos santana?"
 "which title does the parent of NER, an american economist, belong to?"
 "what is the parent mountain of the client of NER?"
 "who did a job before the parent of NER?"
 "what is the type of the people, things and things with the same name as NER?"
 ⋮
 "do you know what time zone is i" ⋯ 66 bytes ⋯ " community of catalonia, 

In [247]:
# Replacement rules that reduce a question to its template: question words, filler words,
# ", a/an …" appositions, the question mark and the NER placeholder are removed.
rel_rules = (
    r"(where|what|who|which|do you know) " => " ",
    r" (does|did|does|the|that|is|of) " => " ",
    r"^(does|did|does|the|that|is) " => " ",
    r", (a|an) .*?[,?]" => "",
    r" {2,}" => " ",
    # "belong to" => "",
    "?" => "",
    "NER's" => "",
    "NER" => ""
)

"""
    question_template(que; rules=rel_rules, passes=3)

Apply `rules` to `que` with `replace` and `strip` the result, `passes` times in a row.

A single `replace` call only substitutes non-overlapping matches, and several rules
consume the spaces around the word they remove, so neighbouring words can need an
extra pass before they match.
"""
function question_template(que; rules=rel_rules, passes=3)
    for _ in 1:passes
        que = strip(replace(que, rules...))
    end
    return que
end

# The typed comprehensions convert the stripped `SubString`s into `String`s.
template_train_ques = String[question_template(que) for que in mt_train_ques]
template_valid_ques = String[question_template(que) for que in mt_valid_ques];

# open("template/valid_ques.txt", "w") do io
#     # for que in mt_valid_ques
    
# end

In [248]:
# Validation templates that also occur among the training templates (order and duplicates
# are kept). The closing parenthesis of `∈(Set(...))` was missing, so the line did not parse.
filter(∈(Set(template_train_ques)), template_valid_ques)

650-element Vector{String}:
 "constituency one alongside from"
 "successor parent"
 "location province belong to"
 "origin influence's name"
 "genre author"
 "title parent belong to"
 "parent mountain client"
 "a job before parent"
 "profession one after election"
 "before commander"
 "a job before one after election"
 "spoken by name included in"
 "married author"
 ⋮
 "title The winner"
 "level management at Southeast Site"
 "Model Regulation The province located"
 "time zone in East location"
 "nationality Election after"
 "eponymous Alma Mater"
 "administrative center Western County"
 "title Election after"
 "was Election after before"
 "time zone in South spain"
 "are eponymous affected people"
 "predecessors The leader"

In [224]:
# Collect "<template>\t|\t<relations…>" lines for every training question whose template
# still contains "where" after the rules have been applied three times.
txts = String[]
for (que, _, rels...) in mt_train_ques_rels
    newque = que
    for _ in 1:3
        newque = strip(replace(newque, rel_rules...))
    end
    occursin("where", newque) && push!(txts, join([newque, '|', rels...], '\t'))
end
# Make sure the output directory exists before writing into it.
mkpath("template")
open("template/where.txt", "w") do io
    for txt in unique(txts)
        println(io, txt)
    end
end

In [None]:
# NOTE(review): no top-level definition of `txt` is visible in this export and the cell has
# no execution count — confirm whether this cell is still needed.
txt

In [240]:
# Inspect four hand-picked rows of `mt_train_ques_rels`.
mt_train_ques_rels[[93, 223,227, 381]]

4-element Vector{Vector{SubString{String}}}:
 ["what is the leader of NER?", "Galactic_Empire_(Star_Wars)", "leader", "occupation"]
 ["what is the eponym of NER?", "Drygalski_(crater)", "eponym", "occupation"]
 ["what is the eponym of NER?", "Agatharchides_(crater)", "eponym", "occupation"]
 ["what is the singer of NER?", "I'm_Talking_with_My_Mouth", "extra", "occupation"]

In [244]:
# Print question and relations (fields 1 and 3:end) of every training row whose template
# consists of a single word.
mysols = []
for (i, tmpl) in enumerate(template_ques)
    tokens = split(tmpl)
    if length(tokens) == 1
        row = mt_train_ques_rels[i]
        println(join(row[[1; 3:end]], '\t'))
    end
    # length(tokens) == 1 && push!(mysols, [i, tokens])
    # length(tokens) == 2 && push!(mysols, [i, tokens])
end
mysols

what is the leader of NER?	leader	occupation
what is the eponym of NER?	eponym	occupation
what is the eponym of NER?	eponym	occupation
what is the singer of NER?	extra	occupation
what is the singer of NER, an album by alexz johnson?	extra	occupation
what is the singer of NER, a 2014 animated film by phil lord and chris miller?	extra	profession
what is the parent of NER, a hong kong banker and politician?	parents	occupation
what is the parent of NER, a hong kong businessman?	parents	occupation
what is the parent of NER, an american businessman and diplomat (1914-1993)?	parents	occupation
what is the author of NER?	chronology	profession
what is the author of NER?	chronology	profession
what is the author of NER, an album by vixx?	chronology	profession
what is the author of NER?	chronology	profession
what is the client of NER, a villa?	client	occupation
what is the author of NER, a live album?	chronology	occupation
what is the author of NER?	chronology	occupation
what is the author of NER,

Any[]

In [231]:
# Inspect four hand-picked rows of `mt_train_ques_rels`.
mt_train_ques_rels[[5808, 7670, 7929, 12689]]

4-element Vector{Vector{SubString{String}}}:
 ["where is the NER incident", "Weimar_Republic", "event", "location"]
 ["where is NER's team?", "Holsten-Deutschland-Cup", "team", "ground"]
 ["who is the producer of the creator of Danny Elfman's compilation album NER?", "Nightmare_Revisited", "chronology", "producer"]
 ["do you know where is the NER name", "List_of_administrative_divisions_of_Ili", "name", "location"]

In [125]:
# First "<word> of <word>" occurrence in the first template.
match(r"(\w*) of (\w*)", template_ques[1])

RegexMatch("constituency of one", 1="constituency", 2="one")

In [139]:
# Templates containing "of"; the first group is greedy, so the split happens at the last "of".
# NOTE(review): the pattern has no spaces around "of", so it can also match inside a word.
filter(!isnothing, match.(r"^(.*)of(.*)$", template_ques))

1137-element Vector{Union{Nothing, RegexMatch}}:
 RegexMatch("constituency of one alongside", 1="constituency ", 2=" one alongside")
 RegexMatch("successor of parent of", 1="successor of parent ", 2="")
 RegexMatch("leader of administrative region to belongs", 1="leader ", 2=" administrative region to belongs")
 RegexMatch("preceded parent of", 1="preceded parent ", 2="")
 RegexMatch("draft team author of", 1="draft team author ", 2="")
 RegexMatch("before parent of", 1="before parent ", 2="")
 RegexMatch("location province of", 1="location province ", 2="")
 RegexMatch("origin of influence's name", 1="origin ", 2=" influence's name")
 RegexMatch("genre of author of", 1="genre of author ", 2="")
 RegexMatch("title parent of", 1="title parent ", 2="")
 RegexMatch("parent mountain of client of", 1="parent mountain of client ", 2="")
 RegexMatch("a job before parent of", 1="a job before parent ", 2="")
 RegexMatch("type of people, things and things with same name as", 1="type ", 2=" peopl

In [138]:
# Templates containing "of" at least twice, split into three parts around two of them.
# NOTE(review): the pattern has no spaces around "of", so it can also match inside a word.
filter(!isnothing, match.(r"^(.*)of(.*)of(.*)$", template_ques))

485-element Vector{Union{Nothing, RegexMatch}}:
 RegexMatch("successor of parent of", 1="successor ", 2=" parent ", 3="")
 RegexMatch("genre of author of", 1="genre ", 2=" author ", 3="")
 RegexMatch("parent mountain of client of", 1="parent mountain ", 2=" client ", 3="")
 RegexMatch("profession of one after election of", 1="profession ", 2=" one after election ", 3="")
 RegexMatch("one known as prime minister of one after election of", 1="one known as prime minister ", 2=" one after election ", 3="")
 RegexMatch("headquarters of combatant of", 1="headquarters ", 2=" combatant ", 3="")
 RegexMatch("title of eponym of", 1="title ", 2=" eponym ", 3="")
 RegexMatch("leader of combatant of", 1="leader ", 2=" combatant ", 3="")
 RegexMatch("house of commander of", 1="house ", 2=" commander ", 3="")
 RegexMatch("successor of leader of", 1="successor ", 2=" leader ", 3="")
 RegexMatch("location of alma mater of", 1="location ", 2=" alma mater ", 3="")
 RegexMatch("succession of commander of"

In [136]:
# Templates of the exact shape "<word> of <word> of <word> of".
filter(!isnothing, match.(r"^(\w*) of (\w*) of (\w*) of$", template_ques))

11-element Vector{Union{Nothing, RegexMatch}}:
 RegexMatch("legislature of country of birth of", 1="legislature", 2="country", 3="birth")
 RegexMatch("source of information of combatant of", 1="source", 2="information", 3="combatant")
 RegexMatch("timezong of east of south of", 1="timezong", 2="east", 3="south")
 RegexMatch("timezong of east of south of", 1="timezong", 2="east", 3="south")
 RegexMatch("deathpalce of successor of descendant of", 1="deathpalce", 2="successor", 3="descendant")
 RegexMatch("timezong of west of location of", 1="timezong", 2="west", 3="location")
 RegexMatch("title of consort of ruler of", 1="title", 2="consort", 3="ruler")
 RegexMatch("successor of successor of descendants of", 1="successor", 2="successor", 3="descendants")
 RegexMatch("successor of successor of descendants of", 1="successor", 2="successor", 3="descendants")
 RegexMatch("form of title of venue of", 1="form", 2="title", 3="venue")
 RegexMatch("father of ruler of descendants of", 1="father", 

In [95]:
"匹配最近 10 个问题"
templates = unique(mt_train_ques)  # de-duplicated entries of `mt_train_ques`
top10 = nearby(templates)  # one-argument `nearby` call; the result is later called as `top10(que)`

(::Base.Fix1{typeof(nearby), Vector{String}}) (generic function with 1 method)