## 实体对齐
注：关于 ill_ques 怎么定义的

### 读取数据

In [17]:
cd("../data")
include("../src/readfiles.jl")

导入成功，内容概要：
    | 变量名 | 说明 |
    | ---- | ---- |
    | zh_triples/en_triples | 三元组 |
    | zh_subs/en_subs | 头实体 |
    | zh_objs/en_objs | 尾实体 |
    | zh_rels/en_rels | 关系 |
    | zh_entity/en_entity | 实体（头和尾） |
    | ILLs | 英文 => 中文对齐 |
    | ILLs_zh_en | 中文 => 英文对齐|
    | train_data | 训练集 |


In [18]:
# 训练数据
train_data = Dict{String, Vector{NTuple{4, String}}}()
open("extract/train_data.txt", "r") do io
    for _ in 1:14077
        que = readline(io)
        ind, que = parse(Int, que[2]), que[5:end]
        train_data[que] = [Tuple(split(readline(io), '\t')) for _ in 1:ind]
        readline(io)
    end
end

In [19]:
# 检查是否单向
function isoneway(sol)
    for i in 1:length(sol) - 1
        sub, obj = sol[i][4], sol[i+1][2]
        sub == obj && continue
        sub ∈ keys(ILLs) && ILLs[sub] == obj && continue
        sub ∈ keys(ILLs_zh_en) && ILLs_zh_en[sub] == obj && continue
        return false
    end
    true
end
all(isoneway, values(train_data))

true

### 知识图谱对齐
对训练集进行分析

#### 跨图谱是否需要对齐

In [4]:
"""判断三元组是否对齐"""
function isaligned(t1, t2)
    first(t1) == first(t2) && return true # 没有跨语言
    if first(t1) == "zh" # 不妨设 t1 为英文，判断能否借助 ILL 翻译为中文
        t1, t2 = t2, t1
    end
    target = (t2[2], t2[4]) # 头实体，尾实体
    any(in(target), [t1[2], t1[4]]) && return true # 有一处相同
    haskey(ILLs, t1[2]) && ILLs[t1[2]] in target && return true
    haskey(ILLs, t1[4]) && ILLs[t1[4]] in target && return true
    false
end
isaligned(tuples) = all(isaligned(tuples[i-1], tuples[i]) for i in 2:length(tuples)) 

isaligned (generic function with 2 methods)

In [5]:
# 判断是否存在没有对齐的三元组（跨）
ques, sols = String[], []
for (que, sol) in train_data
    isaligned(sol) || (push!(ques, que); push!(sols, sol))
end
ques # 空数组

String[]

#### 问题到图谱是否需要对齐

In [20]:
"判断 que 能否提取 triple 的头实体"
sub_in_que(que, triple) = occursin(replace(triple[2], '_'=>' '), que)

sub_in_que

In [21]:
ill_ques = String[] # 不能直接提取头实体的问题
valid_ques = String[] # 能直接提取头实体的问题
a1, a2 = 0, 0
for (que, sol) in train_data
    sub_in_que(que, first(sol)) ? push!(valid_ques, que) : push!(ill_ques, que)
end
ill_ques

284-element Vector{String}:
 "what is the timezong of C's southern regionabanes, Girona?"
 "which bay makes the origin of R's nameayleigh (lunar crater) famous?"
 "what is the program that is before the works that make M knownari Yamazaki?"
 "who does the origin of Y's nameoung (crater) influence?"
 "who is the producer of P's famous workshilip Eisner?"
 "Savez-vous ce qu’est la dynastie ?"
 "which language does I's famous workssmail Shahid belong to?"
 "which program is before A's famous worksyame Goriki?"
 "who is the director of the works that make L knownenore Aubert?"
 "what is the program that is before A's famous worksyame Goriki?"
 "who is the predecessor of S-related productsony Xperia E1?"
 "Savez-vous quels sont les pays participants"
 "who is the predecessor of S's r" ⋯ 81 bytes ⋯ "uctsamsung Galaxy Tab 7.0 Plus?"
 ⋮
 "which type of government does B related eventsyzantine Crete belong to?"
 "who did a job before L's related productsG G3 Stylus?"
 "who is the developer of N

In [None]:
que = "what is the region of the origin of G's nameassendi (crater)?"
correct_que = "what is the region of the origin of Gassendi (crater)'s name?"

In [8]:
newspecial = String[]
for que in ill_ques
    _, ob1, _, ob2 = first(train_data[que])
    ob1, ob2 = replace(ob1, '_' => ' '), replace(ob2, '_' => ' ')
    que = replace(que, r"(\w)'s name([\w( ]+\))" => s"\g<1>\g<2>'s name")
    any(occursin(que), [ob1, ob2]) || push!(newspecial, que)
end
println(length(newspecial))

216


In [17]:
# 匹配比较注意，避免把问题弄乱

In [22]:
# # 保存数据
# open("NER_data/ill_ques.txt", "w") do io
#     for que in ill_ques
#         println(io, que,'\t', first(train_data[que])[2])
#     end
# end
# open("NER_data/valid_ques.txt", "w") do io
#     for que in valid_ques
#         println(io, que,'\t', first(train_data[que])[2])
#     end
# end

# 保存问题
open("extract/valid_ques.txt", "w") do io
    for que in valid_ques
        println(io, que)
    end
end

In [119]:
# 提取训练集的问题并保存
ques = collect(keys(train_data))
open("extract/train_questions.txt", "w") do io
    for que in ques
        println(io, que,'\t', first(train_data[que])[2])
    end
end

In [None]:
correct_rules = (
    r"(\w)'s name([\w( ]+\))" => s"\g<1>\g<2>'s name",
)

In [103]:
que = "what is the region of the origin of G's nameassendi (crater)?"
que = "who is the successor of S-related productsony Xperia E1?"
replace(que, r"(\w)-related product([\w \d]+)?" => s"\g<1>\g<2>-related product"

"what is the region of the origin of G's nameassendi (crater)?"

In [14]:
newspecial = String[]
for que in ques
    _, ob1, _, ob2 = first(train_data[que])
    ob1, ob2 = replace(ob1, '_' => ' '), replace(ob2, '_' => ' ')
    que = replace(que, reg=>s"\g<1>\g<2>'s name")
    any(occursin(que), [ob1, ob2]) || push!(newspecial, que)
end
println(length(newspecial))
open("special.txt", "w") do io
    for que in newspecial
        println(io, que, '\t', first(train_data[que]))
    end
end

0
