# 数据分析

训练集语言
- 1-5040 英文
- 5041-9507 中文
- 9508-14051 法语

匹配策略
1. 精确匹配

### 初始化

In [59]:
# Load the project sources used throughout this notebook.
include("../src/CCKS-mKGQA.jl")
include("../src/loaddata/extractdata.jl") # raw data
include("../src/loaddata/translatedata.jl") # translated data
include("../src/tools/ettalign.jl") # alignment tools
include("../src/datatype.jl") # data structures
include("../src/tools/distance.jl")
# Keep elements 3:end of every record, i.e. drop its first two fields.
mt_train_rels = [querel[3:end] for querel in mt_train_ques_rels];

In [2]:
# Choose the ILLs variant; exactly one assignment is left uncommented.
# illpath = "ILLs_1"
# illpath = "ILLs_2"
# illpath = "ILLs_3"
# illpath = "ILLs_4"
illpath = "ILLs_official"
# illpath = "ILLs_wiki"
# NOTE(review): illsdata.jl presumably reads the global `illpath` — confirm.
include("../src/loaddata/illsdata.jl");

### 精确匹配

In [10]:
# Relations already present in the training set: for every question, collect the
# distinct relation tuples recorded for it (first-seen order, no duplicates).
dict_precise_ques = DefaultDict{String, Vector{Tuple}}(Vector{Tuple})
for (que, _, rels...) in mt_train_ques_rels
    if Tuple(rels) ∉ dict_precise_ques[que]
        push!(dict_precise_ques[que], Tuple(rels))
    end
end

In [16]:
# Look up the relation tuples recorded for one example question.
que = "where does the man who influenced NER work?"
dict_precise_ques[que]

2-element Vector{Tuple}:
 ("influences", "work_institutions")
 ("influences", "workplaces")

### 去停用词

In [2]:
"""
    simplify_que(que)

Remove stop words from a question. `que` is lowercased, then the global `rules`
replacements followed by `strip` are applied again and again until one more pass
leaves the text unchanged. The stabilized text is returned.
"""
function simplify_que(que)
    current = strip(replace(lowercase(que), rules...))
    while true
        reduced = strip(replace(current, rules...))
        # Fixed point reached: another pass changes nothing.
        reduced == current && return current
        current = reduced
    end
end

"""
    quesinfo(id)

Print a header line (original question | translated question | simplified question |
submitted triples) and then, for training item `id`, the corresponding values read
from `train_ques_ner`, `mt_train_ques`, `current_ques` and `train_sols`.
"""
function quesinfo(id)
    println("原始问句 | 翻译问句 | 简化问句 | 提交三元组")
    original = train_ques_ner[id][1]
    println(original)
    translated = mt_train_ques[id]
    println(translated)
    simplified = current_ques[id]
    println(simplified)
    # One line per element of train_sols[id].
    return println.(train_sols[id])
end

quesinfo (generic function with 1 method)

In [3]:
# Alternation of filler words removed from questions. The group opened on the first
# line is closed on the last one, so the string is only a valid pattern once complete.
# NOTE(review): `does` is listed twice in the first line.
middlewords = "(does|did|does|do|are|is|was|he|she|her|his|they|the|that|a|an|of" # prepositions and auxiliary verbs
middlewords *= "|[\\d-]+|used to|belong(s|)( to|)|involved (in|)|zoning" # phrases
middlewords *= "|in|to|ner's|at|by|ners|ner|one|from)"

# Replacement pairs that `simplify_que` splats into `replace`.
rules = (
    r"(.*)/ .*" => s"\g<1>", # duplicated question: keep only the part before "/ "
    r"(who did a job|do you know|the car )" => "", # trivial phrases
    r", (a|an) .*?[,?]" => "", # modifiers
    "'s " => " ",
    r"'s$" => "",
    Regex("^$middlewords ") => "",
    Regex(" $middlewords ") => " ",
    Regex(" $middlewords\$") => "",
    "united states" => "",
    r"(which|where|who|when|what's|what|how|whose) " => " ", # interrogative words
    "?" => "",
    "," => "",
    "-" => " ",
    r"\(.*\)" => "",
    r" {2,}" => " ", # collapse repeated spaces
);

In [18]:
# Process the training set
# Question set: simplified form of every translated training question
current_ques = simplify_que.(mt_train_ques)
# Question relations: elements 3:end of every record
current_rels = [querel[3:end] for querel in mt_train_ques_rels]
traversed = Set{String}()
# jump2 receives the entries with exactly two relations, jump3 all the others.
jump2, jump3 = Tuple[], Tuple[]
for (i, (que, rels)) in enumerate(zip(current_ques, current_rels))
    # Tab-joined key; only the commented-out de-duplication line below would use it.
    txt = join([que,'|', rels...], '\t')
    # txt ∈ traversed ? continue : push!(traversed, txt)
    length(rels) == 2 ? push!(jump2, (i, que, rels)) : push!(jump3, (i, que, rels))
end
# Order both lists by the number of space-separated words in the simplified question.
sort!(jump2; by=i->length(split(i[2], ' ')))
sort!(jump3; by=i->length(split(i[2], ' ')));

### 方案4-辅助生成

In [56]:
# Two-hop questions and regex templates
test_ques = filter(i->length(split(i[2]))==2, jump2) # keep the questions that simplify to two words
tpls = Vector{Regex}(undef, length(jump2)) # regex templates extracted from two-word questions
train_men2rel = Vector{Dict}(undef, length(mt_train_ques)) # mention => relation dictionaries for the training set

# Extract the mentions
ind, mens, (rel1, rel2) = test_ques[1]
men1, men2 = mens = split(mens)

# Swap the mentions when the in-order pairing has the larger (or equal) total distance
if dist(men1, rel1) + dist(men2, rel2) ≥ dist(men1, rel2) + dist(men2, rel1)
    men1, men2 = men2, men1
end

# Build the dictionary
train_men2rel[ind] = men2rel = Dict(men1 => rel1, men2 => rel2)

# Generate the template
que = mt_train_ques[ind]
# NOTE(review): the relation names (rel1/rel2), not the mentions, are replaced in the
# question text, and the rest of the question is not regex-escaped — confirm intent.
tpls[1] = tpl = Regex(replace(que, rel2 => "(?P<men2>.*)", rel1 => "(?P<men1>.*)"))

# Use the template to extract dictionaries
# NOTE(review): `filter` yields the matching question strings, not indices, despite the name `inds`.
inds = filter(ismatch(tpl), mt_train_ques)

9-element Vector{String}:
 "what is the logo of the car that NER is related to?"
 "what is the basis of the car that NER is related to?"
 "what is the basis of the car that NER is related to?"
 "what is the series of the car that NER is related to?"
 "what is the series of the car that NER is related to?"
 "what is the series of the car that NER is related to?"
 "what is the title of the car that NER is related to?"
 "what is the title of the car that NER is related to?"
 "what is the type of the car that NER is related to?"

14051-element Vector{Vector{SubString{String}}}:
 ["known_for", "title_leader"]
 ["related", "logo"]
 ["result", "format"]
 ["client", "parent"]
 ["client", "parent"]
 ["client", "parent"]
 ["client", "parent"]
 ["countries_affected", "official_text"]
 ["countries_affected", "official_text"]
 ["countries_affected", "leader_name"]
 ["countries_affected", "leader_name"]
 ["notable_works", "author"]
 ["notable_works", "author"]
 ⋮
 ["birth_place", "northwest", "leader_title"]
 ["southwest", "north", "timezone_dst"]
 ["location", "west", "constituency_westminster"]
 ["predecessor", "title", "incumbent"]
 ["subject", "successor", "successor"]
 ["stadium", "n", "canton"]
 ["death_place", "southeast", "subdivision_type"]
 ["candidate", "predecessor", "title"]
 ["location", "northwest", "subdivision_name"]
 ["title", "appointer", "branch"]
 ["nationalteam", "location", "subdivision_type"]
 ["stadium", "seat", "region"]

In [None]:
# Apply the template `tpl` to every training question it matches and record, per
# question, the mapping from the two captured mentions to that question's relations.
# Positions are obtained with `findall`, so `ind` is a valid index into
# `mt_train_ques`, `current_rels` and `train_men2rel`.
for ind in findall(ismatch(tpl), mt_train_ques)
    dict = match(tpl, mt_train_ques[ind])
    men1, men2 = dict["men1"], dict["men2"]
    # The right-hand side was left blank before, which made the statement swallow the
    # next line. `current_rels[ind]` holds the relations recorded for question `ind`.
    rel1, rel2 = current_rels[ind]
    train_men2rel[ind] = men2rel = Dict(men1 => rel1, men2 => rel2)
end

In [47]:
# Show the training records whose translated question matches this possessive pattern.
mt_train_ques_rels[findall(ismatch(Regex("what is NER's (.*)'s (.*)")), mt_train_ques)]

7-element Vector{Vector{SubString{String}}}:
 ["what is NER's coach's ex", "Houston_Rockets", "coach", "before"]
 ["what is NER's father's ex", "Konoe_Tadabo", "father", "before"]
 ["what is NER's coach's ex", "turkey_national_football_team", "coach", "before"]
 ["what is NER's father's ex", "Princess_Yongchang_(Northern_Qi)", "father", "before"]
 ["what is NER's father's alma mater", "Toshihiko_Torama", "father", "alma_mater"]
 ["what is NER's coach's ex", "croatian_football_federation", "coach", "before"]
 ["what is NER's father's ex", "Jade_City", "father", "before"]

In [31]:
# Turn the second training question into a template: the two words of its simplified
# form are replaced by named capture groups.
que = mt_train_ques[2]
# The first word of the simplified form is bound to rel2, the second to rel1.
rel2, rel1 = split(simplify_que(que))
# NOTE(review): `que` is used as a regex without escaping, so its trailing "?" acts as a quantifier.
reg = Regex(replace(que, rel2 => "(?P<rel2>.*)", rel1 => "(?P<rel1>.*)"))
res = match(reg, que)

RegexMatch("what is the logo of the car that NER is related to", rel2="logo", rel1="related")

In [38]:
# Read the named capture group "rel1" from the match.
res["rel1"]

"related"

In [44]:
# For every training question matching `reg`, print the question, the two captured
# words and, after the "|\t|" separator, the relations recorded for that question.
for i in findall(ismatch(reg), mt_train_ques)
    que = mt_train_ques[i]
    res = match(reg, que)
    rel1, rel2 = res["rel1"], res["rel2"]
    println(que, '\t', rel1, '\t', rel2, "|\t|", join(current_rels[i], ' '))
end

what is the logo of the car that NER is related to?	related	logo|	|related logo
what is the basis of the car that NER is related to?	related	basis|	|related based_on
what is the basis of the car that NER is related to?	related	basis|	|related based_on
what is the series of the car that NER is related to?	related	series|	|related series
what is the series of the car that NER is related to?	related	series|	|related series
what is the series of the car that NER is related to?	related	series|	|related series
what is the title of the car that NER is related to?	related	title|	|related title
what is the title of the car that NER is related to?	related	title|	|related title
what is the type of the car that NER is related to?	related	type|	|related type


In [30]:
# Merge the scheme-one alignments (dict_rel_align) into dict_rel_align2.
for (key, rels) in dict_rel_align
    if haskey(dict_rel_align2, key)
        # Append only the relations that are not present yet.
        append!(dict_rel_align2[key], filter(∉(dict_rel_align2[key]), rels))
    else
        # Store a copy: sharing the vector would let later push!/append! calls on
        # dict_rel_align2 silently mutate dict_rel_align as well.
        dict_rel_align2[key] = copy(rels)
    end
end

### 方案一，相似度匹配 + 人工纠正

In [23]:
# Select the jump2 entries whose simplified question is a single word
smallcases = filter(i->length(split(i[2])) == 1, jump2)
trainques, trainrels = getindex.(smallcases, 2), getindex.(smallcases, 3)

# Build the dictionary by similarity
dict_rel_align = DefaultDict{String, Vector{String}}(Vector{String})
for (que, (r1, r2)) in zip(trainques, trainrels)
    # Keep whichever of the two relations has the smaller `dist` to the word (ties go to r2).
    rel = dist(que, r1) < dist(que, r2) ? r1 : r2
    rel ∈ dict_rel_align[que] && continue
    push!(dict_rel_align[que], rel)
end
# Tally how many words ended up with each number of relations.
counter(length.(values(dict_rel_align)))

Accumulator{Int64, Int64} with 3 entries:
  2 => 2
  3 => 1
  1 => 36

In [24]:
# Print the current result: every word whose relation list is non-empty and is not
# just the word itself.
for (rel, val) in dict_rel_align
    if !isempty(val) && !(length(val) == 1 && val[1] == rel)
        println(rel, '\t', val)
    end
end

addition	["industry"]
chairman	["president"]
come	["parent"]
incident	["event"]
singer	["extra"]
northwest	["nw"]
parent	["parents"]
people	["name"]
subject	["theme"]
creators	["chronology"]
manager	["sponsor"]
author	["profession", "occupation"]
sponsors	["sponsor"]
master	["industry"]
mayor	["canton"]
descendants	["issue"]
creator	["occupation", "artist", "after"]
descendant	["successor", "issue"]
next	["alongside"]
head	["location_city"]
successor	["issue"]
father	["after"]
play	["alongside"]
battle	["battles"]


In [25]:
# Delete the strongly related entries
delete!.(Ref(dict_rel_align), ["addition", "come", "subject", "people", "next", "successor"])
# Manual corrections of paraphrased words; each pair overwrites the entry for its key
rectify_rules = (
    "author" => ["chronology"],
    "master" => ["controlledby"],
    "mayor" => ["chieftown"],
    "descendants" => ["issue"],
    "descendant" => ["issue"],
    "creator" => ["chronology"],
    "head" => ["sponsor"],
    "next to" => ["alongside"], # weak (original note unclear — TODO confirm)
    "father" => ["father", "parent", "parents"], # consider "relation alignment"
    "key people" => ["key_people"],
    "important people" => ["key_people"],
    "people" => ["name"]
)
for (key, val) in rectify_rules
    dict_rel_align[key] = val
end
# Record the weakly related entries
dict_weak = Dict(
    "successor" => ["issue", "children"] # words close in form to "descendant"
)

Dict{String, Vector{String}} with 1 entry:
  "successor" => ["issue", "children"]

In [None]:
# When unsure, look up the original question
# quesinfo(6387)

# Possibly wrong entry: look at the other questions containing the word
jump2[findall(contains("successor"), getindex.(jump2, 2))]

# Possibly wrong entry: reverse lookup by relation
# filter(i->"name" in last(i), jump2)

# Look up the original training questions
# filter(i->occursin("people", lowercase(i[1])) && !("key_people" ∈ i), mt_train_ques_rels)
# query_rel("alongside")

In [51]:
# Dump the alignment dictionary: one line per word, "|\t|" between the word and its
# tab-separated relations.
open("analyse/rel_align_1.txt", "w") do io
    foreach(dict_rel_align) do (key, val)
        println(io, key, "|\t|", join(val, '\t'))
    end
end

### 方案二-倒序匹配 + 人工纠正

In [26]:
# Select the jump2 entries whose simplified question has exactly two words
smallcases = filter(i->length(split(i[2])) == 2, jump2)
trainwords, trainrels = split.(getindex.(smallcases, 2), ' '), getindex.(smallcases, 3)

# Build the dictionary by similarity
dict_rel_align2 = DefaultDict{String, Vector{String}}(Vector{String})
for ((w1, w2), (r1, r2)) in zip(trainwords, trainrels)
    # Nearest match
    # r1, r2 = dist(w1, r1) + dist(w2, r2) < dist(w1, r2) + dist(w2, r1) ? (r1, r2) : (r2, r1)
    # Old scheme
    # rel1 ∉ dict_rel_align2[w1] && push!(dict_rel_align2[w1], rel1)
    # rel2 ∉ dict_rel_align2[w2] && push!(dict_rel_align2[w2], rel2)
    # Pair the words with the relations in reverse order (w1 with r2, w2 with r1),
    # skipping pairs scheme one already holds; duplicates are kept so frequencies can be counted.
    (haskey(dict_rel_align, w1) && r2 ∈ dict_rel_align[w1]) || push!(dict_rel_align2[w1], r2)
    (haskey(dict_rel_align, w2) && r1 ∈ dict_rel_align[w2]) || push!(dict_rel_align2[w2], r1)
end
sort_count(length.(values(dict_rel_align2)));

In [27]:
# Per word, the result of `sort_count` over the relations collected for it.
sorted_rel_align2 = DefaultDict{String, AbstractDict}(AbstractDict)
for (key, val) in dict_rel_align2
    sorted_rel_align2[key] = sort_count(val) # `counter` alone would do; sorting eases inspection
end
# Per word, the relations considered valid, most frequent first.
valid_rel_align2 = DefaultDict{String, Vector{String}}(Vector{String})
for (key, val) in sorted_rel_align2
    # Keep relations with a small distance to the word or a high frequency
    valids = filter(i->dist(i, key) ≤ 3 || val[i] ≥ 3, keys(val))
    valids = sort!(collect(valids); by=i->-val[i])
    isempty(valids) && continue
    # Exclude what scheme one already produced. `get` is used instead of indexing
    # because indexing a DefaultDict inserts an empty entry for every missing key,
    # which would pollute dict_rel_align.
    valid_rel_align2[key] = filter(∉(get(dict_rel_align, key, String[])), valids)
end
valid_rel_align2;

In [None]:
# When unsure, look up the original question
# quesinfo(5153)

# Possibly wrong entry: look at the other questions containing the word
(jump2[findall(contains("offspring"), getindex.(jump2, 2))])
# jump3[findall(contains("timeline"), getindex.(jump3, 2))]
# jump2[findall(i->occursin("born", i[2]) && !occursin("birth_place", i[2]), jump2)]

# Possibly wrong entry: reverse lookup by relation
# filter(i->"education" in last(i), jump2)

# Look up the original training questions
# filter(i->occursin("city", lowercase(i[1])) && "location_city" ∈ i, mt_train_ques_rels)
# filter(i->occursin("largest city", lowercase(i[1])), mt_train_ques_rels)

# Results obtained so far
# dict_rel_align["product"]

In [28]:
# Print, with a running number, the entries from position 72 onward, skipping words
# whose only relation is the word itself.
valids = collect(valid_rel_align2)
i = 1
for (key, val) in valids[72:end]
    length(val) == 1 && val[1] == key && continue
    println(i, ' ', key, '\t', join(val, '\t'))
    i += 1
end

1 gender	genre
2 city	city	location_city	largest_city
3 offspring	issue
4 predecessor	predecessor	before	after	father	successor
5 starring	sponsor
6 largest	east
7 preceded	preceded_by
8 buried	issue
9 empire	dynasty
10 predecessors	predecessor
11 legislature	legislature	succession
12 year	years
13 winner	result	after_election
14 leaders	leader	residence
15 developers	developer
16 designed	designer
17 rules	regent
18 affected	countries_affected
19 led	leader
20 events	event
21 responsible	products	industry
22 homelands	countryofbirth
23 book	notable_works
24 happened	event
25 president	president	chairman
26 county	shire_county	region
27 originated	children
28 ex	before	coach	father	prev
29 famous	location	fam
30 next	alongside	succession	after	before	father	death_place
31 married	spouse
32 birthplace	birth_place	origin	eponym
33 province	state/province	province
34 team	division
35 actor	starring
36 positioning	position
37 addition	extra
38 era	era	eponym
39 come	eponym	client	sponsor	n

In [17]:
# Delete the strongly related entries
delete!.(Ref(valid_rel_align2), ["descendant", "sponsor", "lead", "successor"])
# Manual corrections of paraphrased words; each pair overwrites the entry for its key
rectify_rules = (
    "beside ner" => ["alongside"],
    "died" => ["death_place"],
    "product" => ["products"],
    "spouses" => ["spouse"],
    "general" => ["commander"],
    "represents" => ["deputy"],
    "represent" => ["deputy"],
    "address" => ["residence", "honorific_prefix", "ground"],
    "timeline" => ["chronology"],
    "area" => ["area", "subdivision_name", "region"],
    "after" => ["successor", "next_show"],
    "graduate" => ["education"], # forward order
    "came" => ["birth_place"],
    "job" => ["occupation", "profession"], # was misspelled "ccupation", which matches no relation
    "influence" => ["influenced", "influences"],
    "influenced" => ["influenced", "influences"],
    "east" => ["east"],
    "born" => ["birth_place"], # forward order
    "customer" => ["client"],
    "located" => ["location", "country", "state", "city"], # placed at the end
    "work" => ["work_institutions", "workplaces"],
    "succeed" => ["succession"], # forward order
    "death" => ["death_place"],
    "eponymous" => ["eponym"],
    "live" => ["residence"],
    "largest city" => ["largest_city", "largest_city_name"],
    "city" => String[] # typed empty list, matching the dictionary's Vector{String} values
)
for (key, val) in rectify_rules
    valid_rel_align2[key] = val
end
# Words identical to their relation once spaces are replaced by underscores
same_rels = ["title", "title leader", "location", "after election",
    "spouse", "coach", "eponym", "alongside"]
for rel in same_rels
    valid_rel_align2[rel] = [replace(rel, ' '=>'_')]
end

### 方案3-词频筛选

In [None]:
# Collect the `sort_count` result over the second field of every entry in `mt_triples`
# (presumably relation => count pairs — confirm against `sort_count`).
# NOTE(review): the name `pairs` coincides with `Base.pairs`; consider renaming.
pairs = collect(sort_count(getindex.(mt_triples, 2)))

In [None]:
"""
    consecutive(que::AbstractString, k::Int)
    consecutive(words::Vector, k::Int)

Return every run of `k` adjacent items of `words`, each run joined with `_`.
A string is first split on single spaces.
"""
function consecutive(que::AbstractString, k::Int)
    return consecutive(split(que, ' '), k)
end

function consecutive(words::Vector, k::Int)
    # Index of the last position at which a full window of k items still fits.
    laststart = length(words) - k + 1
    return map(start -> join(words[start:(start + k - 1)], '_'), 1:laststart)
end

In [None]:
# Pick one relation and count the training questions whose relations contain it.
rels = unique!(vcat(current_rels...))
rel = rels[5]
num = count(i->rel ∈ i, current_rels)
println(rel, '\t', num)
# Word tally over the simplified questions that use `rel`.
sort_count(vcat(split.(current_ques[findall(i->rel ∈ i, current_rels)], ' ')...))
# Same tally over two-word windows:
# sort_count(vcat(consecutive.(current_ques[findall(i->rel ∈ i, current_rels)], 2)...))

In [None]:
# Take the first component of the 4th entry of `pairs` and tally the words of the
# simplified questions that `query_rel` returns for it.
rel, _ = pairs[4]
println(rel)
ques = simplify_que.(query_rel(rel))
println(length(ques))
# `unique!` per question: each question contributes a given word at most once.
words = vcat(unique!.(split.(ques))...)
sort_count(words)