### 处理内容
1. 提取三元组的关键词
2. 训练集分解为：问题 + 解答
3. 删除重复数据

In [2]:
;cd ../data

/home/rex/work_space/7 others/ccks/CCKS-mKGQA/data


### 关键词提取
存储在 `extract` 目录下

### 第一部分
ILLs(zh-en).txt, triple_en.txt, triple_zh.txt

In [3]:
# Regexes that capture the trailing name of a DBpedia URI written as `<...>`.
# `*_obj_reg` match resource URIs and `*_rel_reg` match property URIs.
# NOTE(review): the dots in the host names are unescaped, so each one matches any
# character, and the patterns are not anchored to the start/end of the text.
# English DBpedia (dbpedia.org):
en_obj_reg = r"<http://dbpedia.org/resource/(.*)>"
en_rel_reg = r"<http://dbpedia.org/property/(.*)>"
# Chinese DBpedia (zh.dbpedia.org):
zh_obj_reg = r"<http://zh.dbpedia.org/resource/(.*)>"
zh_rel_reg = r"<http://zh.dbpedia.org/property/(.*)>"

r"<http://zh.dbpedia.org/property/(.*)>"

In [4]:
"""
    get_txt(reg, txt)

Return the first capture group of `reg` matched against `txt`.

Throws an `ArgumentError` that names the pattern and the text when `txt` does not
match, instead of failing on a field access of `nothing`.
"""
function get_txt(reg, txt)
    m = match(reg, txt)
    m === nothing && throw(ArgumentError("no match for $(repr(reg)) in $(repr(txt))"))
    return m.captures[1]
end

"""
    line2triple(line; en=true)

Split `line` on single spaces into three URIs and return the names captured from them
as a 3-tuple `(entity, relation, entity)`.

With `en=true` the English patterns (`en_obj_reg`, `en_rel_reg`) are used, otherwise
the Chinese ones (`zh_obj_reg`, `zh_rel_reg`).

Throws an `ArgumentError` when `line` does not consist of exactly three fields or when
a field does not match its pattern.
"""
function line2triple(line; en=true)
    txts = split(line, ' ')
    # Fail early with a readable message rather than with a broadcast shape error.
    length(txts) == 3 || throw(ArgumentError(
        "expected 3 space-separated fields, got $(length(txts)): $(repr(line))"))
    rel, obj = en ? (en_rel_reg, en_obj_reg) : (zh_rel_reg, zh_obj_reg)
    return Tuple(get_txt.([obj, rel, obj], txts))
end

line2triple (generic function with 1 method)

In [145]:
# One sample line per language, used as a quick sanity check of `line2triple`.
line_en = "<http://dbpedia.org/resource/Gokuraku-ji_(Kamakura)> <http://dbpedia.org/property/denomination> <http://dbpedia.org/resource/Shingon_Buddhism>"
line_zh = "<http://zh.dbpedia.org/resource/中华人民共和国外交部> <http://zh.dbpedia.org/property/c> <http://zh.dbpedia.org/resource/中華民國外交部>"
# Print the extracted triple for each sample, English first.
for (sample, is_en) in ((line_en, true), (line_zh, false))
    println(line2triple(sample; en=is_en))
end

("Gokuraku-ji_(Kamakura)", "denomination", "Shingon_Buddhism")
("中华人民共和国外交部", "c", "中華民國外交部")


In [146]:
# Process the English triples: every line of `triple_en.txt` becomes one
# tab-separated (entity, relation, entity) line in `extract/triple_en.txt`.
mkpath("extract")  # make sure the output directory exists
# The do-block closes the output file even if a line fails to parse.
open("extract/triple_en.txt", "w") do output_io
    # Stream the whole input instead of relying on a hard-coded line count.
    for line in eachline("triple_en.txt")
        isempty(line) && continue  # tolerate blank lines, e.g. at the end of the file
        println(output_io, join(line2triple(line), '\t'))
    end
end

In [214]:
## Handle garbled text
# Replacement rules that are splatted into `replace` while the Chinese triples are
# processed: the first two drop a digit prefix, the last two replace accented
# letters with plain ASCII ones.
# NOTE(review): the first pattern uses `\d` (exactly one digit) while the second uses
# `\d+` (one or more digits) — confirm that this asymmetry is intended.
rel_rules = (
    r"\d上一節目" => "上一節目",
    r"\d+下一節目" => "下一節目",
    "région" => "region",
    "département" => "department");

In [215]:
# Process the Chinese triples: every line of `triple_zh.txt` becomes one
# tab-separated (entity, relation, entity) line in `extract/triple_zh.txt`,
# after the `rel_rules` replacements have been applied.
mkpath("extract")  # make sure the output directory exists
# The do-block closes the output file even if a line fails to parse.
open("extract/triple_zh.txt", "w") do output_io
    # Stream the whole input instead of relying on a hard-coded line count.
    for line in eachline("triple_zh.txt")
        isempty(line) && continue  # tolerate blank lines, e.g. at the end of the file
        txt = join(line2triple(line; en=false), '\t')
        # NOTE(review): the rules run over the whole joined line, so they also rewrite
        # entity names that happen to contain one of the patterns.
        println(output_io, replace(txt, rel_rules...))
    end
end

In [7]:
# Process the entity alignment file: every line of `ILLs(zh-en).txt` becomes an
# `english_name<TAB>chinese_name` line in `extract/ILLs(zh-en).txt`.
filename = "ILLs(zh-en).txt"
mkpath("extract")  # make sure the output directory exists
# The do-block closes the output file even if a line fails to parse.
open("extract/$filename", "w") do output_io
    # Stream the whole input instead of relying on a hard-coded line count.
    for line in eachline(filename)
        isempty(line) && continue  # tolerate blank lines, e.g. at the end of the file
        fields = split(line, ' ')
        # Field 1 is matched as an English resource and field 3 as a Chinese one;
        # field 2 is not used.
        println(output_io, get_txt(en_obj_reg, fields[1]), '\t',
            get_txt(zh_obj_reg, fields[3]))
    end
end

In [8]:
# Remove duplicated alignment pairs
# `read(path, String)` opens and closes the file itself, so no handle is leaked.
txts = rstrip(read("extract/ILLs(zh-en).txt", String))
doubles = NTuple{2, String}[Tuple(split(txt, '\t')) for txt in split(txts, '\n')]
unique_doubles = unique(doubles)
println(length(doubles),'\t', length(unique_doubles))
# Observed output: 13636 pairs in total, 13249 unique ones.
# Duplicates do exist, e.g. Princess_Alexia_of_Greece_and_Denmark

# Supplementary alignments added by hand
new_rel = (
    "Huizhou" => "惠州市",
    "Leonid_Brezhnev" => "列昂尼德·伊里奇·勃列日涅夫",
    "Li_Qingzhao" => "李清照",
    "Sun_Lianzhong" => "孫連仲"
)

# De-duplicate again after appending the manual pairs, so that re-running this cell
# (which reads back the file written below) does not add them a second time.
merged_doubles = unique(vcat(unique_doubles, [(en, zh) for (en, zh) in new_rel]))

open("extract/ILLs(zh-en).txt", "w") do io
    for (en, zh) in merged_doubles
        println(io, en, '\t', zh)
    end
end

13636	13249


### 第二部分
train_data.txt

In [None]:
# Remove duplicated lines
"""
    unique_txts(infile, outfile)

Copy the lines of `infile` to `outfile`, keeping only the first occurrence of each
distinct line and preserving the original order. Returns `nothing`.
"""
function unique_txts(infile, outfile)
    # Read every line of the input rather than a fixed number of lines, so the
    # function works for any file and never reads past the end of a short one.
    lines = unique!(readlines(infile))
    # The do-block closes the output file even if a write fails.
    open(outfile, "w") do output
        for line in lines
            println(output, line)
        end
    end
    return nothing
end
# Write the de-duplicated lines of `train_data.txt` to `train_data_unique.txt`.
unique_txts("train_data.txt", "train_data_unique.txt")

In [150]:
"""获取三元组信息"""
function get_info(triple)
    # Builds one tab-separated line: a language tag ("zh" or "en") followed by the
    # names captured from the three URIs in `triple`.
    # The language is decided by whether the first URI matches the Chinese
    # resource pattern.
    is_zh = !isnothing(match(zh_obj_reg, first(triple)))
    tag = is_zh ? "zh\t" : "en\t"
    obj_reg, rel_reg = is_zh ? (zh_obj_reg, zh_rel_reg) : (en_obj_reg, en_rel_reg)
    names = get_txt.([obj_reg, rel_reg, obj_reg], triple)
    return tag * join(names, '\t')
end

"""拆分问题和答案"""
function QandA(txt)
    # Splits one training line into a question and its answer triples.
    #
    # `txt` has the form `question<TAB>answer`, where the answer is a `#`-separated
    # list of URIs, three per triple. The result is a header line `{n} question`
    # (n = number of triples) followed by one `get_info` line per triple.
    #
    # Throws an `ArgumentError` when the tab separator is missing or when the number
    # of URIs is not a multiple of three.
    fields = split(txt, '\t')
    length(fields) >= 2 || throw(ArgumentError(
        "expected a question and an answer separated by a tab, got $(repr(txt))"))
    que = fields[1]
    uris = split(fields[2], '#')
    # Integer division avoids the float round trip and lets us report a bad count.
    len, rest = divrem(length(uris), 3)
    rest == 0 || throw(ArgumentError(
        "expected a multiple of 3 URIs in the answer, got $(length(uris)): $(repr(txt))"))
    infos = [get_info(uris[3 * i - 2: 3 * i]) for i in 1:len]
    return "{$(len)} $(que)\n" * join(infos, '\n')
end

QandA

In [151]:
# Sample training line: a question, a tab, then six `#`-separated URIs (two triples).
txt = "which official language is used by the country that is affected by 1957 Mongolia earthquake?	<http://dbpedia.org/resource/1957_Mongolia_earthquake>#<http://dbpedia.org/property/countriesAffected>#<http://dbpedia.org/resource/Mongolia>#<http://zh.dbpedia.org/resource/蒙古国>#<http://zh.dbpedia.org/property/官方文字>#<http://zh.dbpedia.org/resource/蒙古字母>"
print(QandA(txt))

{2} which official language is used by the country that is affected by 1957 Mongolia earthquake?
en	1957_Mongolia_earthquake	countriesAffected	Mongolia
zh	蒙古国	官方文字	蒙古字母

In [152]:
# Process the training set: every line of `train_data_unique.txt` becomes one
# `QandA` record in `extract/train_data.txt`.
mkpath("extract")  # make sure the output directory exists
# The do-block closes the output file even if a line fails to parse.
open("extract/train_data.txt", "w") do output_io
    # Stream the whole input instead of relying on a hard-coded line count.
    for line in eachline("train_data_unique.txt")
        isempty(line) && continue  # tolerate blank lines, e.g. at the end of the file
        # The extra '\n' leaves a blank line after each record as a separator.
        println(output_io, QandA(line), '\n')
    end
end

In [154]:
## Check the solutions and keywords of the training set
# Maps each question to its list of 4-field solution lines (language tag plus the
# three names of a triple). Some questions occur more than once with different
# solutions; in that case the record read last wins.
solutions = Dict{String, Vector{NTuple{4, String}}}()
open("extract/train_data.txt", "r") do io
    # Read until the end of the file instead of a hard-coded number of records.
    while !eof(io)
        header = readline(io)
        isempty(header) && continue  # blank separator line between records
        # Header format: `{n} question`. Parsing `n` with a regex also handles
        # counts with more than one digit.
        m = match(r"^\{(\d+)\} (.*)$", header)
        m === nothing && throw(ArgumentError("malformed header line: $(repr(header))"))
        ind, que = parse(Int, m[1]), String(m[2])
        solutions[que] = [Tuple(split(readline(io), '\t')) for _ in 1:ind]
    end
end
length(solutions)

13770

### 统计词频

In [216]:
# Chinese triples, entities and relations
# `read(path, String)` opens and closes the file itself, so no handle is leaked.
txts = rstrip(read("extract/triple_zh.txt", String))
zh_triples = [NTuple{3,String}(split(txt, '\t')) for txt in split(txts, '\n')]
# Entities are the distinct values found in the first and last field of a triple.
zh_objs = unique!(vcat(first.(zh_triples), last.(zh_triples)))
zh_rels = unique(triple[2] for triple in zh_triples)
# Prints (number of entities, number of relations, entity slots per distinct entity).
println((length(zh_objs), length(zh_rels), 2*length(zh_triples)/length(zh_objs)))

# English triples, entities and relations
# `read(path, String)` opens and closes the file itself, so no handle is leaked.
txts = rstrip(read("extract/triple_en.txt", String))
en_triples = [NTuple{3,String}(split(txt, '\t')) for txt in split(txts, '\n')]
# Entities are the distinct values found in the first and last field of a triple.
en_objs = unique!(vcat(first.(en_triples), last.(en_triples)))
en_rels = unique(triple[2] for triple in en_triples)
# (number of entities, number of relations, entity slots per distinct entity)
length(en_objs), length(en_rels),2*length(en_triples)/length(en_objs)

(77541, 1141, 2.7067228949845887)


(122525, 1639, 2.6532381146704753)

In [218]:
# Record the Chinese relations, i.e. those containing at least one non-ASCII
# character, one per line in `extract/purezh_rels.txt`.
non_en_rels = filter(!isascii, zh_rels)
open("extract/purezh_rels.txt", "w") do io
    foreach(rel -> println(io, rel), non_en_rels)
end

In [202]:
using DataStructures

In [226]:
# Chinese entity frequencies, most frequent first
zh_count_obj = Dict(counter([first.(zh_triples); last.(zh_triples)]))
sort(zh_count_obj; by=key -> zh_count_obj[key], rev=true)

OrderedDict{String, Int64} with 77541 entries:
  "欧洲中部时… => 1992
  "欧洲中部夏… => 1972
  "省_(西班牙… => 1277
  "西班牙行政… => 1237
  "國家"       => 914
  "美國行政區… => 822
  "美利堅合眾… => 494
  "加泰罗尼亚" => 409
  "巴西行政区… => 406
  "匈牙利行政… => 364
  "美國"       => 316
  "美国"       => 292
  "英語"       => 280
  "卡斯蒂利亚… => 278
  "北美山区时… => 273
  "北美山区日… => 262
  "歌手"       => 212
  "德国行政区… => 210
  "县_(德国)"  => 194
  "天主教"     => 188
  "犹他州"     => 177
  "国家"       => 161
  "阿拉斯区"   => 159
  "北美中部时… => 157
  "犹他州行政… => 153
  ⋮            => ⋮

In [228]:
# Chinese relation frequencies, most frequent first
zh_count_rel = Dict(counter(getindex.(zh_triples, 2)))
sort(zh_count_rel; by=key -> zh_count_rel[key], rev=true)

OrderedDict{String, Int64} with 1141 entries:
  "subdivisionName" => 8287
  "subdivisionType" => 8049
  "timezone"        => 3943
  "predecessor"     => 3509
  "successor"       => 3300
  "title"           => 3044
  "genre"           => 1996
  "arrondissement"  => 1698
  "canton"          => 1645
  "timezone1Dst"    => 1593
  "name"            => 1566
  "location"        => 1547
  "timezoneDst"     => 1434
  "starring"        => 1392
  "almaMater"       => 1391
  "party"           => 1331
  "type"            => 1317
  "after"           => 1281
  "before"          => 1256
  "religion"        => 1223
  "stadium"         => 1210
  "office"          => 1159
  "discoverySite"   => 1153
  "clubs"           => 1077
  "city"            => 1028
  ⋮                 => ⋮

In [227]:
# English entity frequencies, most frequent first
en_count_obj = Dict(counter([first.(en_triples); last.(en_triples)]))
sort(en_count_obj; by=key -> en_count_obj[key], rev=true)

OrderedDict{String, Int64} with 122525 entries:
  "Countries_of_the_world"                   => 4211
  "Voivodeships_of_Poland"                   => 2840
  "List_of_sovereign_states"                 => 2003
  "List_of_countries"                        => 1519
  "Autonomous_communities_of_Spain"          => 1138
  "Political_divisions_of_the_United_States" => 715
  "Federal_Information_Processing_Standard"  => 641
  "Regions_of_the_Czech_Republic"            => 510
  "ZIP_code"                                 => 492
  "Districts_of_Serbia"                      => 470
  "Postal_code"                              => 414
  "Districts_of_the_Czech_Republic"          => 412
  "Provinces_and_territories_of_Canada"      => 410
  "Provinces_of_Spain"                       => 402
  "Auckland"                                 => 366
  "Municipalities_of_Serbia"                 => 343
  "New_Zealand"                              => 343
  "Captain_(association_football)"           => 325
  "Province

In [229]:
# English relation frequencies, most frequent first
en_count_rel = Dict(counter(getindex.(en_triples, 2)))
sort(en_count_rel; by=key -> en_count_rel[key], rev=true)

OrderedDict{String, Int64} with 1639 entries:
  "subdivisionType" => 24072
  "title"           => 11363
  "stadium"         => 9072
  "location"        => 8705
  "before"          => 5006
  "after"           => 4925
  "birthPlace"      => 3433
  "name"            => 2993
  "club"            => 2777
  "predecessor"     => 2493
  "team"            => 2078
  "north"           => 2005
  "postalCodeType"  => 1976
  "south"           => 1829
  "east"            => 1819
  "west"            => 1719
  "issue"           => 1687
  "seat"            => 1631
  "deathPlace"      => 1490
  "studio"          => 1480
  "northeast"       => 1357
  "southeast"       => 1279
  "blankName"       => 1278
  "southwest"       => 1252
  "northwest"       => 1237
  ⋮                 => ⋮