## 实体对齐

### 读取数据

In [1]:
;cd ../data

/home/rex/work_space/7 others/ccks/CCKS-mKGQA/data


In [2]:
# 中文三元组，实体，关系
txts = rstrip(read(open("extract/triple_zh.txt", "r"), String))
zh_triples = [NTuple{3,String}(split(txt, '\t')) for txt in split(txts, '\n')]
zh_objs = unique!(vcat(first.(zh_triples), last.(zh_triples)))
zh_rels = unique(triple[2] for triple in zh_triples)

# 英文三元组，实体，关系
txts = rstrip(read(open("extract/triple_en.txt", "r"), String))
en_triples = [NTuple{3,String}(split(txt, '\t')) for txt in split(txts, '\n')]
en_objs = unique!(vcat(first.(en_triples), last.(en_triples)))
en_rels = unique(triple[2] for triple in en_triples)

# ILLs 对齐文件
txts = rstrip(read(open("extract/ILLs(zh-en).txt", "r"), String))
ILLs = Dict{String,String}(split(txt, '\t') for txt in split(txts,'\n'))

# 训练数据
train_data = Dict{String, Vector{NTuple{4, String}}}()
open("extract/train_data.txt", "r") do io
    for _ in 1:14077
        que = readline(io)
        ind, que = parse(Int, que[2]), que[5:end]
        train_data[que] = [Tuple(split(readline(io), '\t')) for _ in 1:ind]
        readline(io)
    end
end

### 知识图谱对齐

#### 跨图谱是否需要对齐

In [4]:
"""判断三元组是否对齐"""
function isaligned(t1, t2)
    first(t1) == first(t2) && return true # 没有跨语言
    if first(t1) == "zh" # 不妨设 t1 为英文，判断能否借助 ILL 翻译为中文
        t1, t2 = t2, t1
    end
    target = (t2[2], t2[4]) # 头实体，尾实体
    any(in(target), [t1[2], t1[4]]) && return true # 有一处相同
    haskey(ILLs, t1[2]) && ILLs[t1[2]] in target && return true
    haskey(ILLs, t1[4]) && ILLs[t1[4]] in target && return true
    false
end
isaligned(tuples) = all(isaligned(tuples[i-1], tuples[i]) for i in 2:length(tuples)) 

isaligned (generic function with 2 methods)

In [5]:
# 判断是否存在没有对齐的三元组（跨）
ques, sols = String[], []
for (que, sol) in train_data
    isaligned(sol) || (push!(ques, que); push!(sols, sol))
end
ques # 空数组

String[]

#### 问题到图谱是否需要对齐

In [16]:
"判断 que 能否提取 triple 的头实体"
sub_in_que(que, triple) = occursin(replace(triple[2], '_'=>' '), que)

sub_in_que

In [15]:
ill_ques = String[] # 不能处理的问题
valid_ques = String[] # 正常处理的问题
a1, a2 = 0, 0
for (que, sol) in train_data
    sub_in_que(que, first(sol)) ? push!(valid_ques, que) : push!(ill_ques, que)
end

In [17]:
open("NER_data/ill_ques.txt", "w") do io
    for que in ill_ques
        println(io, que,'\t', first(train_data[que])[2])
    end
end
open("NER_data/valid_ques.txt", "w") do io
    for que in valid_ques
        println(io, que,'\t', first(train_data[que])[2])
    end
end

In [119]:
# 提取训练集的问题并保存
ques = collect(keys(train_data))
open("extract/train_questions.txt", "w") do io
    for que in ques
        println(io, que,'\t', first(train_data[que])[2])
    end
end

In [None]:
correct_rules = (
    r"(\w)'s name([\w( ]+\))" => s"\g<1>\g<2>'s name",
)

In [103]:
que = "what is the region of the origin of G's nameassendi (crater)?"
que = "who is the successor of S-related productsony Xperia E1?"
replace(que, r"(\w)-related product([\w \d]+)?" => s"\g<1>\g<2>-related product"

"what is the region of the origin of G's nameassendi (crater)?"

In [102]:
newspecial = String[]
for que in ques
    _, ob1, _, ob2 = first(train_data[que])
    ob1, ob2 = replace(ob1, '_' => ' '), replace(ob2, '_' => ' ')
    que = replace(que, reg=>s"\g<1>\g<2>'s name")
    any(occursin(que), [ob1, ob2]) || push!(newspecial, que)
end
println(length(newspecial))
open("special.txt", "w") do io
    for que in newspecial
        println(io, que, '\t', first(train_data[que]))
    end
end

216
