# 实体对齐

注：由于存在重复的三元组，不确定提交评测时是否会区分以下这些情形
- 比如假设 tri1 既为 en 也为 zh，那么
- en -> zh -> en 和 zh -> zh -> en 都将作为答案

In [1]:
include("../src/CCKS-mKGQA.jl")
include("../src/extractdata.jl")
include("../src/translatedata.jl")
include("../src/ettalign.jl")
# Create the output root and one sub-directory per ILLs source; the resulting
# paths are returned so the notebook displays them.
map(
    sub -> mkpath("EA_data/" * sub),
    ["", "ILLs_1", "ILLs_2", "ILLs_official", "ILLs_wiki"],
)

5-element Vector{String}:
 "EA_data"
 "EA_data/ILLs_1"
 "EA_data/ILLs_2"
 "EA_data/ILLs_official"
 "EA_data/ILLs_wiki"

### 模型对齐-旧

In [2]:
# ILLs => alignment dictionary (old model predictions)
path = "ILLs_1"
# `read(filename, String)` opens and closes the file itself; the previous
# `read(open(...), String)` never closed the handle it had opened.
newILLs = split(strip(read("EA_data/$path/ETT_Pairs.txt", String)), '\n')
# One pair per line, fields separated by a tab; turn each line into a tuple.
newILLs = Tuple.(split.(newILLs, '\t'))
# Drop every pair whose first field is "Gregoire_Kayibanda".
filter!(i->i[1] != "Gregoire_Kayibanda", newILLs);

### 模型对齐-新

In [3]:
# ILLs => alignment dictionary (new model predictions)
path = "ILLs_2"
# `read(filename, String)` opens and closes the file itself; the previous
# `read(open(...), String)` never closed the handle it had opened.
newILLs = split(strip(read("EA_data/$path/ILLs_predict-7.6.txt", String)), '\n')
# One pair per line, fields separated by a tab; turn each line into a tuple.
newILLs = Tuple.(split.(newILLs, '\t'));

### 原始 ILLs

In [4]:
# Use the pairs bound to `ILLs` (defined outside this cell) unchanged.
# `path` selects the EA_data sub-directory the save cell writes into.
path = "ILLs_official"
newILLs = ILLs;

### wiki ILLs

In [5]:
# Use the pairs bound to `wiki_ILLs` (defined outside this cell) unchanged.
# `path` selects the EA_data sub-directory the save cell writes into.
path = "ILLs_wiki"
newILLs = wiki_ILLs;

In [6]:
# Map both sides of every ILL pair through `dict_words`.
mt_newILLs = [(dict_words[en_ent], dict_words[zh_ent]) for (en_ent, zh_ent) in newILLs]
# Deduplicate in place, then drop pairs whose two mapped sides are identical.
unique!(mt_newILLs)
filter!(pair -> pair[1] != pair[2], mt_newILLs)
# Pair counts before and after the clean-up (the author noted 13055 => 9802 here,
# presumably from a run with a different ILLs source).
println([length(newILLs), length(mt_newILLs)])
dict_ILLs = ill2dict(mt_newILLs)

# Translate the triples
mt_triples = [triple_byMT(triple) for triple in triples]

# Align the triples: subject and object are replaced by their `dict_ILLs` entry
# when one exists and kept unchanged otherwise; the relation is never touched.
function triple_byills(sub, rel, obj)
    aligned_sub = get(dict_ILLs, sub, sub)
    aligned_obj = get(dict_ILLs, obj, obj)
    return (aligned_sub, rel, aligned_obj)
end
# Convenience method: accept the triple as a single collection and splat it.
triple_byills(triple) = triple_byills(triple...)
ill_triples = [triple_byills(triple) for triple in mt_triples]

# Validation set: the last field of each item is the entity, the first the question.
ill_valid_ners = [get(dict_ILLs, last(item), last(item)) for item in mt_valid_ques_ner]
ill_valid_ques = [first(item) for item in mt_valid_ques_ner]

# Training set: the entity is the second field of each `mt_train_ques_rels` item.
ill_train_ners = [get(dict_ILLs, item[2], item[2]) for item in mt_train_ques_rels]
ill_train_ques = mt_train_ques;
# For every solution keep fields 2:4, translate them, then align the result.
ill_train_sols = [
    [triple_byills(triple_byMT(sol[2:4])) for sol in sols] for sols in train_sols
]
# Relation (second field) of every aligned solution triple.
ill_train_rels = [[sol[2] for sol in sols] for sols in ill_train_sols];

[42072, 29655]


### 测试

In [7]:
using DataStructures
# Adjacency list: subject => every (relation, object) edge leaving it.
edges = DefaultDict{String, Vector{Tuple}}(Vector{Tuple})
for (head, link, target) in ill_triples
    push!(edges[head], (link, target))
end

subs = unique!(first.(ill_triples))
# Hop 1: number of outgoing edges of every subject.
jumps = Dict(node => [length(edges[node])] for node in subs)
# Hops 2 and 3: sum the previous hop's count over all neighbours that are
# themselves subjects; neighbours that never appear as a subject contribute 0.
# The outer loop finishes one hop for every subject before the next hop starts.
for hop in 1:2, node in subs
    push!(
        jumps[node],
        sum(haskey(jumps, nxt) ? jumps[nxt][hop] : 0 for (_, nxt) in edges[node]),
    )
end

In [45]:
# Official ILLs
# Number of subjects whose hop-1, hop-2 and hop-3 count is zero.
println([count(hops -> iszero(hops[k]), values(jumps)) for k in 1:3])
# Validation entities whose hop-2 / hop-3 count is zero.
(
    count(ner -> jumps[ner][2] == 0, ill_valid_ners),
    count(ner -> jumps[ner][3] == 0, ill_valid_ners),
)

[0, 52268, 67437]


(156, 352)

In [32]:
# Old ILLs
# Number of subjects whose hop-1, hop-2 and hop-3 count is zero.
println([count(hops -> iszero(hops[k]), values(jumps)) for k in 1:3])
# Validation entities whose hop-2 / hop-3 count is zero.
(
    count(ner -> jumps[ner][2] == 0, ill_valid_ners),
    count(ner -> jumps[ner][3] == 0, ill_valid_ners),
)

[0, 44396, 57285]


(80, 209)

In [10]:
# New ILLs
# Number of subjects whose hop-1, hop-2 and hop-3 count is zero.
println([count(hops -> iszero(hops[k]), values(jumps)) for k in 1:3])
# Validation entities whose hop-2 / hop-3 count is zero.
(
    count(ner -> jumps[ner][2] == 0, ill_valid_ners),
    count(ner -> jumps[ner][3] == 0, ill_valid_ners),
)

[0, 47455, 61439]


(98, 263)

In [8]:
# Wiki ILLs
# Number of subjects whose hop-1, hop-2 and hop-3 count is zero.
println([count(hops -> iszero(hops[k]), values(jumps)) for k in 1:3])
# Validation entities whose hop-2 / hop-3 count is zero.
(
    count(ner -> jumps[ner][2] == 0, ill_valid_ners),
    count(ner -> jumps[ner][3] == 0, ill_valid_ners),
)

[0, 38070, 49921]


(6, 103)

In [7]:
# Tally the elements of `list` with `counter` and return the tally as a sorted
# dictionary, keys ordered by descending count.
# NOTE(review): `counter` and `sort` on a `Dict` presumably come from
# DataStructures — confirm the exact return type there.
function sort_count(list)
    tally = counter(list)
    frequency(key) = tally[key]
    return sort(Dict(tally); by=frequency, rev=true)
end

sort_count (generic function with 1 method)

### 保存

In [46]:
# Write the aligned triples, one per line, fields separated by tabs.
open("EA_data/$path/triples.txt", "w") do out
    for fields in ill_triples
        join(out, fields, '\t')
        println(out)
    end
end

# Write one line per training question: question, entity, then its relations,
# all tab-separated (the tab after the entity is written even without relations).
open("EA_data/$path/train_ques.txt", "w") do out
    for (question, entity, relations) in zip(ill_train_ques, ill_train_ners, ill_train_rels)
        print(out, question, '\t', entity, '\t')
        join(out, relations, '\t')
        println(out)
    end
end

# Write one record per training question: a "question<TAB>entity" header line,
# one tab-separated line per solution triple, then a blank separator line.
open("EA_data/$path/train_data.txt", "w") do out
    for (question, entity, solutions) in zip(ill_train_ques, ill_train_ners, ill_train_sols)
        print(out, question, '\t', entity, '\n')
        for solution in solutions
            join(out, solution, '\t')
            println(out)
        end
        println(out)
    end
end

# Write one "question<TAB>entity" line per validation question.
open("EA_data/$path/valid_ques.txt", "w") do out
    for (question, entity) in zip(ill_valid_ques, ill_valid_ners)
        print(out, question, '\t', entity, '\n')
    end
end