# 数据分析

预处理
1. 将问题按 2 跳 / 3 跳进行分类
2. 剔除无关信息，比如 do you know/ who did a job 等
3. 找出错误问题（策略待拟）

关系抽取
1. 识别关系在实体中的对齐，以及关系之间的对齐
2. 识别问题中的关系模式

语言模式
- 1-5040 英文
- 5041-9507 中文
- 9508-14051 法语

### 初始化

In [None]:
include("../src/CCKS-mKGQA.jl")
include("../src/loaddata/extractdata.jl") # 原始数据
include("../src/loaddata/translatedata.jl") # 翻译数据
include("../src/tools/ettalign.jl") # 对齐工具
include("../src/datatype.jl") # 数据结构
include("../src/tools/distance.jl")

In [None]:
# Select the ILLs data set to load. Exactly one `illpath` assignment should be
# active; the commented lines are the available alternatives.
# NOTE(review): illsdata.jl presumably reads the global `illpath`, so the
# assignment has to come before the include — confirm.
# illpath = "ILLs_1"
# illpath = "ILLs_2"
# illpath = "ILLs_3"
# illpath = "ILLs_4"
illpath = "ILLs_official"
# illpath = "ILLs_wiki"
include("../src/loaddata/illsdata.jl");

In [None]:
"""
    simplify_que(que)

Lowercase `que`, then repeatedly apply the global `rules` replacements (each
pass followed by `strip`) until a further pass leaves the text unchanged.
Return the stabilised text.
"""
function simplify_que(que)
    # One rewriting pass: every rule in `rules`, then trim surrounding whitespace.
    onepass(text) = strip(replace(text, rules...))
    current = onepass(lowercase(que))
    while true
        next = onepass(current)
        # Fixed point reached: another pass would not change anything.
        next == current && return current
        current = next
    end
end
# Return the first field of every row of `mt_train_ques_rels` whose fields from
# the third one onward contain `rel`.
function query_rel(rel)
    hits = Iterators.filter(row -> rel ∈ row[3:end], mt_train_ques_rels)
    return [row[1] for row in hits]
end
"""
    quesinfo(id)

Print what is known about training question `id`: a header line, the first
field of `train_ques_ner[id]`, the entry of `mt_train_ques`, the entry of
`current_ques`, and finally each element of `train_sols[id]` on its own line.
"""
function quesinfo(id)
    header = "原始问句，翻译问句，简化问句，提交三元组"
    println(header)
    original = train_ques_ner[id][1]
    println(original)
    translated = mt_train_ques[id]
    println(translated)
    simplified = current_ques[id]
    println(simplified)
    # Broadcast `println` over the stored solutions, one per line.
    return println.(train_sols[id])
end

### 预处理及数据分析

In [None]:
# Alternation of filler words removed from questions. It is spliced into three
# regexes below (start of text, middle of text, end of text).
# Auxiliaries, pronouns, articles and prepositions.
middlewords = "(does|did|do|are|is|was|he|she|her|his|they|the|that|a|an|of"
# Digit/dash runs and short phrases.
middlewords *= "|[\\d-]+|used to|belong(s|)( to|)|involved (in|)|zoning"
# NOTE(review): "ner" looks like an entity placeholder inserted upstream — confirm.
middlewords *= "|in|to|ner's|at|by|ners|ner|one|from)"

# Ordered replacement rules, applied together as `replace(text, rules...)`
# (multi-pattern `replace` needs Julia 1.7 or newer).
rules = (
    # Duplicated question: keep only the part before "/ ".
    r"(.*)/ .*" => s"\g<1>",
    # Boilerplate phrases that carry no relation information.
    r"(who did a job|do you know|the car )" => "",
    # Modifiers of the form ", a ... ," / ", an ... ?".
    r", (a|an) .*?[,?]" => "",
    # Possessives, auxiliaries, prepositions and other filler words.
    "'s " => " ",
    r"'s$" => "",
    Regex("^$middlewords ") => "",
    Regex(" $middlewords ") => " ",
    Regex(" $middlewords\$") => "",
    "united states" => "",
    # Question words. The leading \b restricts the match to whole words;
    # without it "show " was cut down to "s " and "somewhat " to "some ".
    r"\b(which|where|who|when|what's|what|how|whose) " => " ",
    # Punctuation, parenthesised text and runs of spaces.
    "?" => "",
    "," => "",
    "-" => " ",
    r"\(.*\)" => "",
    r" {2,}" => " ",
);

In [None]:
# Build the working copy of the training set.
# Simplified question text, one entry per element of `mt_train_ques`.
current_ques = simplify_que.(mt_train_ques)
# Relations of each question: every field of a row after the first two.
current_rels = [querel[3:end] for querel in mt_train_ques_rels]
traversed = Set{String}()
jump2, jump3 = Tuple[], Tuple[]
for (i, (que, rels)) in enumerate(zip(current_ques, current_rels))
    # De-duplication is currently switched off. To drop repeated
    # (question, relations) pairs, enable the two lines below.
    # txt = join([que, '|', rels...], '\t')
    # txt ∈ traversed ? continue : push!(traversed, txt)
    # Exactly two relations go to `jump2`; every other count goes to `jump3`.
    push!(length(rels) == 2 ? jump2 : jump3, (i, que, rels))
end
# Order both groups by the number of space-separated words in the question.
for group in (jump2, jump3)
    sort!(group; by=item -> length(split(item[2], ' ')))
end

# Dump each group as "<id>\t<question>|\t|<relations...>", one question per line.
# `open(path, "w")` raises when the directory is missing, so create it first.
mkpath("analyse")
for (path, group) in (("analyse/jump2.txt", jump2), ("analyse/jump3.txt", jump3))
    open(path, "w") do io
        for (id, que, rels) in group
            println(io, id, '\t', que, "|\t|", join(rels, '\t'))
        end
    end
end

In [None]:
# Number of space-separated words per `jump2` question, passed to `sort_count`.
sort_count(map(item -> length(split(item[2], ' ')), jump2))

In [None]:
# Number of space-separated words per `jump3` question, passed to `sort_count`.
sort_count(map(item -> length(split(item[2], ' ')), jump3))

### 方案一，相似度匹配 + 人工纠正

In [None]:
# Keep the `jump2` entries whose simplified question is a single token.
smallcases = filter(jump2) do case
    length(split(case[2])) == 1
end
trainques = [case[2] for case in smallcases]
trainrels = [case[3] for case in smallcases]

# For every one-word question keep whichever of its two relations `dist` scores
# lower (a tie picks the second relation), never storing a relation twice.
dict_rel_align = DefaultDict{String, Vector{String}}(Vector{String})
for (word, (first_rel, second_rel)) in zip(trainques, trainrels)
    nearest = dist(word, first_rel) < dist(word, second_rel) ? first_rel : second_rel
    aligned = dict_rel_align[word]
    nearest ∈ aligned || push!(aligned, nearest)
end
# How many relations ended up attached to each question word, fed to `counter`.
counter([length(aligned) for aligned in values(dict_rel_align)])

In [None]:
# Show the current alignment, leaving out empty entries and entries whose only
# relation is spelled exactly like the question word.
for (word, aligned) in dict_rel_align
    trivial = isempty(aligned) || (length(aligned) == 1 && aligned[1] == word)
    trivial || println(word, '\t', aligned)
end

In [None]:
# Remove the entries the author flagged as "strongly related".
for key in ["addition", "come", "subject", "people", "next", "successor"]
    delete!(dict_rel_align, key)
end
# Hand-written corrections: question word => relations it should map to.
rectify_rules = (
    "author" => ["chronology"],
    "master" => ["controlledby"],
    "mayor" => ["chieftown"],
    "descendants" => ["issue"],
    "descendant" => ["issue"],
    "creator" => ["chronology"],
    "head" => ["sponsor"],
    "next to" => ["alongside"], # weak match (original note is garbled — TODO confirm)
    "father" => ["father", "parent", "parents"], # candidate for relation-to-relation alignment
    "key people" => ["key_people"],
    "important people" => ["key_people"],
    "people" => ["name"]
)
# Each correction overwrites whatever the similarity pass stored for that word.
for rule in rectify_rules
    dict_rel_align[first(rule)] = last(rule)
end
# Weakly related pairs, kept aside rather than merged into the dictionary.
dict_weak = Dict(
    "successor" => ["issue", "children"] # similar-looking word for descendants
)

In [None]:
# When unsure, look at the original question:
# quesinfo(6387)

# Suspicious entry: list the other `jump2` questions that mention the word.
filter(item -> occursin("successor", item[2]), jump2)

# Suspicious entry: reverse lookup by relation.
# filter(i->"name" in last(i), jump2)

# Look up the original training questions.
# filter(i->occursin("people", lowercase(i[1])) && !("key_people" ∈ i), mt_train_ques_rels)
# query_rel("alongside")

In [None]:
# Save the scheme-one alignment as "<word>|\t|<relations...>", one word per line.
# Create the output directory first: `open(path, "w")` raises when it is missing.
mkpath("analyse")
open("analyse/rel_align_1.txt", "w") do io
    for (key, val) in dict_rel_align
        println(io, key, "|\t|", join(val, '\t'))
    end
end

### 方案二，倒序匹配 + 人工纠正

In [None]:
# Keep the `jump2` entries whose simplified question has exactly two tokens.
smallcases = filter(jump2) do case
    length(split(case[2])) == 2
end
trainwords = [split(case[2], ' ') for case in smallcases]
trainrels = [case[3] for case in smallcases]

# Cross-pair words and relations: the first word collects the second relation
# and the second word collects the first relation. A pair is skipped when the
# scheme-one dictionary already lists that relation for that word; repeats are
# kept on purpose so they can be counted afterwards.
# (Earlier variants — nearest-distance pairing and same-order pairing without
# repeats — were tried and dropped.)
dict_rel_align2 = DefaultDict{String, Vector{String}}(Vector{String})
for (words, rels) in zip(trainwords, trainrels)
    (w1, w2), (r1, r2) = words, rels
    for (word, rel) in ((w1, r2), (w2, r1))
        known = haskey(dict_rel_align, word) && rel ∈ dict_rel_align[word]
        known || push!(dict_rel_align2[word], rel)
    end
end
sort_count([length(collected) for collected in values(dict_rel_align2)]);

In [None]:
# Relation frequency table for every question word collected in `dict_rel_align2`.
sorted_rel_align2 = DefaultDict{String, AbstractDict}(AbstractDict)
for (key, val) in dict_rel_align2
    # `counter` alone would be enough; the sorted form is easier to inspect.
    sorted_rel_align2[key] = sort_count(val)
end
valid_rel_align2 = DefaultDict{String, Vector{String}}(Vector{String})
for (key, val) in sorted_rel_align2
    # Keep relations that are close to the word (dist ≤ 3) or frequent (count ≥ 3).
    candidates = [rel for rel in keys(val) if dist(rel, key) ≤ 3 || val[rel] ≥ 3]
    isempty(candidates) && continue
    # Most frequent relation first.
    sort!(candidates; by=rel -> -val[rel])
    # Exclude relations scheme one already aligned to this word. `get` is used
    # instead of indexing because indexing a DefaultDict inserts an empty entry
    # for every missing key, which silently grew `dict_rel_align`.
    known = get(dict_rel_align, key, String[])
    valid_rel_align2[key] = filter(∉(known), candidates)
end
valid_rel_align2;

In [None]:
# When unsure, look at the original question:
# quesinfo(5153)

# Suspicious entry: list the other `jump2` questions that mention the word.
filter(item -> occursin("offspring", item[2]), jump2)
# jump3[findall(contains("timeline"), getindex.(jump3, 2))]
# jump2[findall(i->occursin("born", i[2]) && !occursin("birth_place", i[2]), jump2)]

# Suspicious entry: reverse lookup by relation.
# filter(i->"education" in last(i), jump2)

# Look up the original training questions.
# filter(i->occursin("city", lowercase(i[1])) && "location_city" ∈ i, mt_train_ques_rels)
# filter(i->occursin("largest city", lowercase(i[1])), mt_train_ques_rels)

# Result already obtained by scheme one.
# dict_rel_align["product"]

In [37]:
# Pool the relations of every training row whose first field contains
# " election of " (compared in lowercase), then count them.
rels = vcat(
    current_rels[[occursin(" election of ", lowercase(row[1])) for row in mt_train_ques_rels]]...
)
sort_count(rels)

OrderedDict{SubString{String}, Int64} with 45 entries:
  "after_election" => 317
  "birth_place"    => 19
  "death_place"    => 17
  "monarch"        => 11
  "successor"      => 11
  "religion"       => 10
  "nationality"    => 10
  "state"          => 10
  "branch"         => 10
  "president"      => 10
  "title"          => 10
  "spouse"         => 10
  "profession"     => 10
  "primeminister"  => 10
  "country"        => 10
  "constituency"   => 10
  "party"          => 10
  "alma_mater"     => 10
  "occupation"     => 9
  "residence"      => 9
  "predecessor"    => 9
  "before"         => 8
  "battles"        => 8
  "order"          => 8
  "children"       => 8
  ⋮                => ⋮

In [None]:
# Print a numbered listing of the scheme-two candidates from the 72nd entry on,
# leaving out entries whose only relation is spelled exactly like the word.
valids = collect(valid_rel_align2)
i = 1
for entry in Iterators.drop(valids, 71)
    key, val = entry
    if !(length(val) == 1 && val[1] == key)
        println(i, ' ', key, '\t', join(val, '\t'))
        i += 1
    end
end

In [None]:
# Remove the entries the author flagged as "strongly related".
for key in ["descendant", "sponsor", "lead", "successor"]
    delete!(valid_rel_align2, key)
end
# Hand-written corrections: question word(s) => relations they should map to.
rectify_rules = (
    "beside ner" => ["alongside"],
    "died" => ["death_place"],
    "product" => ["products"],
    "spouses" => ["spouse"],
    "general" => ["commander"],
    "represents" => ["deputy"],
    "represent" => ["deputy"],
    "address" => ["residence", "honorific_prefix", "ground"],
    "timeline" => ["chronology"],
    "area" => ["area", "subdivision_name", "region"],
    "after" => ["successor", "next_show"],
    "graduate" => ["education"], # forward order
    "came" => ["birth_place"],
    # Fixed typo: the relation is "occupation" (was "ccupation", which matches nothing).
    "job" => ["occupation", "profession"],
    "influence" => ["influenced", "influences"],
    "influenced" => ["influenced", "influences"],
    "east" => ["east"],
    "born" => ["birth_place"], # forward order
    "customer" => ["client"],
    "located" => ["location", "country", "state", "city"], # put at the end
    "work" => ["work_institutions", "workplaces"],
    "succeed" => ["succession"], # forward order
    "death" => ["death_place"],
    "eponymous" => ["eponym"],
    "live" => ["residence"],
    "largest city" => ["largest_city", "largest_city_name"],
    # Deliberately maps to nothing; typed so it matches the dictionary's value type.
    "city" => String[]
)
# Each correction overwrites whatever the frequency filter stored for that word.
for (key, val) in rectify_rules
    valid_rel_align2[key] = val
end
# Words that are the relation name itself, up to replacing spaces with underscores.
same_rels = ["title", "title leader", "location", "after election",
    "spouse", "coach", "eponym", "alongside"]
for rel in same_rels
    valid_rel_align2[rel] = [replace(rel, ' '=>'_')]
end

### 方案三，词频筛选

In [None]:
# Collect the `sort_count` result for the second component of every `mt_triples` entry.
# NOTE(review): the global name `pairs` shadows `Base.pairs` in this session;
# consider renaming it together with its later use (`pairs[4]`).
pairs = collect(sort_count([triple[2] for triple in mt_triples]))

In [None]:
"""
    consecutive(que::AbstractString, k::Integer)
    consecutive(words::AbstractVector, k::Integer)

Return every run of `k` consecutive words, each run joined with `'_'`, in order
of appearance. A string is first split on single spaces. The result is empty
when there are fewer than `k` words.

# Throws
- `ArgumentError` if `k < 1` (a run of zero or fewer words is meaningless and
  previously produced a list of empty strings).
"""
consecutive(que::AbstractString, k::Integer) = consecutive(split(que, ' '), k)
function consecutive(words::AbstractVector, k::Integer)
    k ≥ 1 || throw(ArgumentError("k must be at least 1, got $k"))
    # Start positions of all complete windows; empty when `k` exceeds the length.
    starts = firstindex(words):(lastindex(words) - k + 1)
    # `view` avoids copying each window before joining it.
    return [join(view(words, i:(i + k - 1)), '_') for i in starts]
end

In [None]:
# Pick one relation (the 5th distinct one) and count the words of the
# simplified questions that carry it.
rels = unique!(vcat(current_rels...))
rel = rels[5]
num = count(relset -> rel ∈ relset, current_rels)
println(rel, '\t', num)
sort_count(
    vcat([split(que, ' ') for (que, relset) in zip(current_ques, current_rels) if rel ∈ relset]...)
)
# Same count over two-word runs instead of single words:
# sort_count(vcat(consecutive.(current_ques[findall(i->rel ∈ i, current_rels)], 2)...))

In [None]:
# Take the relation of the 4th entry of `pairs`, simplify every question that
# `query_rel` returns for it, and count each word at most once per question.
rel = first(pairs[4])
println(rel)
ques = map(simplify_que, query_rel(rel))
println(length(ques))
words = vcat((unique!(split(que)) for que in ques)...)
sort_count(words)