# 数据分析

预处理
1. 将问题按 2 跳 / 3 跳进行分类
2. 剔除无关信息，比如 do you know/ who did a job 等
3. 找出错误问题（策略待拟）

关系抽取
1. 识别关系在实体中的对齐，以及关系之间的对齐
2. 识别问题中的关系模式

语言模式
- 问句编号 1-5040：英文
- 问句编号 5041-9507：中文
- 问句编号 9508-14051：法语

In [4]:
include("../src/CCKS-mKGQA.jl")
include("../src/loaddata/extractdata.jl") # raw data
include("../src/loaddata/translatedata.jl") # translated data
include("../src/tools/ettalign.jl") # alignment tools
include("../src/datatype.jl") # data structures

In [9]:
# Select which ILLs data set to use: keep exactly one `illpath` assignment active.
# NOTE(review): `illpath` is presumably read by illsdata.jl when it is included
# below, so it has to be assigned first — confirm against that file.
# illpath = "ILLs_1"
# illpath = "ILLs_2"
# illpath = "ILLs_3"
# illpath = "ILLs_4"
illpath = "ILLs_official"
# illpath = "ILLs_wiki"
include("../src/loaddata/illsdata.jl");

In [6]:
"""
    ismatch(reg::Regex, txt) -> Bool
    ismatch(reg::Regex)

Return `true` when `reg` matches somewhere in `txt`, `false` otherwise.
The one-argument method returns a callable with `reg` fixed as the first
argument, so it can be used as a predicate, e.g. `filter(ismatch(r"^a"), xs)`.
"""
function ismatch(reg::Regex, txt)
    return match(reg, txt) !== nothing
end
function ismatch(reg::Regex)
    return Base.Fix1(ismatch, reg)
end
"""
    query_rel(rel)

Collect the first element (the question) of every record in the global
`mt_train_ques_rels` whose elements from the third position onwards contain `rel`.
"""
function query_rel(rel)
    # Elements 1 and 2 of a record are the question and its NER slot; the rest are relations.
    hasrel(record) = rel ∈ record[3:end]
    return [first(record) for record in mt_train_ques_rels if hasrel(record)]
end

query_rel (generic function with 1 method)

In [61]:
"""
    simplify_que(que)

Lowercase `que`, then repeatedly apply the global replacement `rules` and strip
surrounding whitespace until another pass no longer changes the text. Return the
resulting (stripped) string.
"""
function simplify_que(que)
    current = strip(replace(lowercase(que), rules...))
    while true
        # One more pass; a single application may expose new matches for earlier rules.
        next = strip(replace(current, rules...))
        next == current && return current
        current = next
    end
end
"""
    quesinfo(id)

Print, for sample `id`, the original question, the translated question, the
simplified question and the submitted triples, preceded by a header line.

Reads the globals `train_ques_ner`, `mt_train_ques`, `current_ques` and
`train_sols`. Returns the result of broadcasting `println` over `train_sols[id]`.
"""
function quesinfo(id)
    println("原始问句，翻译问句，简化问句，提交三元组")
    original = train_ques_ner[id][1]
    println(original)
    translated = mt_train_ques[id]
    println(translated)
    simplified = current_ques[id]
    println(simplified)
    # One line per stored triple.
    return println.(train_sols[id])
end

quesinfo (generic function with 1 method)

In [80]:
# Show the stored question variants and triples for sample 5572.
quesinfo(5572)

原始问句，翻译问句，简化问句，提交三元组
在2014–15 AFC Ajax season里的人士属于哪个青年俱乐部
which youth club do people in NER belong to?
youth club people
("en", "2014–15_AFC_Ajax_season", "name", "Ricardo_Kishna")
("zh", "里卡多·基舒拿", "youthclubs", "ADO海牙")


2-element Vector{Nothing}:
 nothing
 nothing

In [83]:
# Regex source (one parenthesised alternation) of filler tokens that are removed
# from questions; it is interpolated into the `Regex(...)` entries of `rules`.
# Each alternative is listed once, and multi-word phrases are kept together in the
# second line.
middlewords = "(does|did|do|are|is|was|he|she|her|his|they|the|that|a|an|of" # prepositions / auxiliaries
middlewords *= "|[\\d-]+|used to|belong(s|)( to|)|involved (in|)|zoning" # digit/dash runs and phrases
middlewords *= "|in|to|ner's|at|by|ners|ner|one|from)"

# Replacement pairs that `simplify_que` splats into a single `replace` call.
# The order of the entries and the exact patterns are significant.
rules = (
    # Duplicated question: keep only the text before "/ "
    r"(.*)/ .*" => s"\g<1>",
    # Boilerplate phrases
    r"(who did a job|do you know|the car )" => "",
    # Modifiers introduced by ", a ..." / ", an ..."
    r", (a|an) .*?[,?]" => "",
    # Prepositions, auxiliary verbs, etc. (at the start, in the middle, at the end)
    "'s " => " ",
    r"'s$" => "",
    Regex("^$middlewords ") => "",
    Regex(" $middlewords ") => " ",
    Regex(" $middlewords\$") => "",
    "united states" => "",
    # Question words
    r"(which|where|who|when|what's|what|how|whose) " => " ",
    "?" => "",
    "," => "",
    "-" => " ",
    r"\(.*\)" => "",
    r" {2,}" => " ",
);

In [88]:
# Tally the `jump3` questions by their number of space-separated tokens.
sort_count(map(entry -> length(split(entry[2], ' ')), jump3))

OrderedDict{Int64, Int64} with 18 entries:
  4  => 539
  5  => 535
  3  => 535
  6  => 300
  7  => 168
  8  => 97
  9  => 70
  10 => 48
  2  => 40
  11 => 26
  12 => 17
  13 => 12
  16 => 3
  15 => 3
  14 => 3
  19 => 2
  20 => 1
  18 => 1

In [87]:
# Tally the `jump2` questions by their number of space-separated tokens.
sort_count(map(entry -> length(split(entry[2], ' ')), jump2))

OrderedDict{Int64, Int64} with 18 entries:
  3  => 1502
  2  => 1248
  4  => 891
  5  => 484
  6  => 287
  7  => 228
  8  => 132
  9  => 83
  1  => 70
  10 => 51
  11 => 24
  12 => 16
  13 => 8
  14 => 4
  20 => 3
  15 => 2
  19 => 1
  18 => 1

In [84]:
# Simplify every translated question and keep only the relation part of each record.
current_ques = map(simplify_que, mt_train_ques)
current_rels = map(querel -> querel[3:end], mt_train_ques_rels)

# Drop repeated (question, relations) combinations and split the rest into records
# with exactly two relations (`jump2`) and all others (`jump3`).
traversed = Set{String}()
jump2 = Tuple{Int, String, String}[]
jump3 = Tuple{Int, String, String}[]
for (i, (que, rels)) in enumerate(zip(current_ques, current_rels))
    relstxt = join(rels, '\t')
    signature = join((que, relstxt), '\t')
    signature ∈ traversed && continue
    push!(traversed, signature)
    destination = length(rels) == 2 ? jump2 : jump3
    push!(destination, (i, que, relstxt))
end

# Order both sets by the number of space-separated tokens in the simplified question.
for group in (jump2, jump3)
    sort!(group; by=entry -> length(split(entry[2], ' ')))
end

# Dump each set as "id<TAB>question|<TAB>|relations", one record per line.
for (path, group) in (("analyse/jump2.txt", jump2), ("analyse/jump3.txt", jump3))
    open(path, "w") do io
        for (id, que, rels) in group
            println(io, id, '\t', que, "|\t|", rels)
        end
    end
end

In [6]:
# Write the simplified form of every question in `ill_valid_ques`, one per line.
open("valid_ques.txt", "w") do io
    foreach(que -> println(io, simplify_que(que)), ill_valid_ques)
end

### 统计词频

In [184]:
# Tally the second field of every entry in `mt_triples` and materialise the tally
# as an indexable vector of pairs.
# NOTE(review): this global is named like `Base.pairs` — consider renaming it
# together with its uses.
pairs = collect(sort_count(map(triple -> triple[2], mt_triples)));

In [187]:
# Take the key of the fourth entry of `pairs` as the relation to inspect.
rel = first(pairs[4])
println(rel)
# Simplified questions whose record contains that relation.
ques = map(simplify_que, query_rel(rel))
println(length(ques))
# Count each word at most once per question, then tally across all questions.
words = vcat((unique!(split(que)) for que in ques)...)
sort_count(words)

location
951


OrderedDict{SubString{String}, Int64} with 317 entries:
  "location"       => 774
  "to"             => 162
  "in"             => 140
  "team"           => 138
  "event"          => 132
  "national"       => 130
  "east"           => 104
  "name"           => 101
  "north"          => 86
  "leader"         => 82
  "west"           => 80
  "subdivision"    => 72
  "type"           => 70
  "title"          => 58
  "located"        => 57
  "position"       => 56
  "capital"        => 48
  "starts"         => 47
  "administrative" => 45
  "established"    => 43
  "zone"           => 40
  "eastern"        => 40
  "northwest"      => 38
  "time"           => 38
  "division"       => 38
  ⋮                => ⋮