# 知识图谱与问题翻译
新增文件：
- excel_data/ 为过程文件
- translate/triple_en_byzh.txt 对齐且翻译后的中文图谱
- translate/triples.txt 中英文图谱翻译合并的文件
- translate/train_data.txt 翻译后的训练集和知识图谱
- translate/train_ques.txt 训练集：问题，实体，关系1，关系2，……
- translate/valid_ques.txt 验证集：问题，实体

最后的图谱还原使用 `include("../src/translate.jl")` 提供的字典

In [30]:
include("../src/xlsx.jl")
include("../src/loaddata.jl")
include("../src/regexpr.jl")
# Create the output directories used below (`mkpath` is a no-op for directories
# that already exist).
mkpath.(["excel_data", "translate"])

2-element Vector{String}:
 "excel_data"
 "translate"

### 实体对齐
> 注意：实体对齐之后，图谱就可以合并了，所以只需翻译关系

In [31]:
# Entity alignment: lookup table keyed by the last element of every ILLs entry and
# mapping to its first element (zh => en, going by the variable name).
ILLs_zh_en = Dict(last(link) => first(link) for link in ILLs)

"""
    zh2en_triple(triple)

Translate the subject, relation and object of `triple` through `ILLs_zh_en`.
A component that has no entry in the dictionary is returned unchanged.
"""
function zh2en_triple(triple)
    head, relation, tail = triple
    return map(item -> get(ILLs_zh_en, item, item), (head, relation, tail))
end
# Apply the ILLs alignment to every Chinese triple.
zh_triples_byILLs = map(zh2en_triple, zh_triples)
# Disabled, kept for reference: merge with the English triples and count duplicates.
# triples = unique!(vcat(en_triples, zh_triples_byILLs))
# print(-length(triples)+length(zh_triples)+length(en_triples)) # 736

104941-element Vector{Tuple{String, String, String}}:
 ("Sichuan", "capital", "成都市")
 ("Sichuan", "西北", "青海")
 ("Sichuan", "北", "甘肃")
 ("Sichuan", "東北", "陕西")
 ("Sichuan", "西", "西藏")
 ("Sichuan", "東", "重庆")
 ("Sichuan", "南", "云南")
 ("Faye_Wong", "語言", "國語")
 ("Faye_Wong", "語言", "粤語")
 ("Faye_Wong", "語言", "英語")
 ("Faye_Wong", "配偶", "竇唯")
 ("Faye_Wong", "配偶", "李亞鵬")
 ("袁世凯", "monarch", "宣統帝")
 ⋮
 ("古特曼·贝拉", "managerclubs", "沃绍什体育俱乐部")
 ("港區_(東京都)", "自治體", "品川區")
 ("卡伦顿战役", "result", "詹姆斯党")
 ("演播时刻", "network", "英國廣播公司第二台")
 ("亚当·博格丹", "nationalteam", "Hungary_national_under-21_football_team")
 ("江陵之战_(208年)", "combatant", "孙权")
 ("江陵之战_(208年)", "commander", "曹仁")
 ("闲山岛海战", "commander", "九鬼嘉隆")
 ("Kyodo_Television", "majorShareholder", "關西電視台")
 ("稻生之戰", "place", "尾張國")
 ("阿利安薩石油", "league", "哥倫比亞足球甲級聯賽")
 ("長島一向一揆", "partof", "石山合戰")

### 知识图谱翻译

In [32]:
# English relations need no translation (already handled during preprocessing):
# print every relation name that is not pure ASCII.
println(filter(!isascii, getindex.(en_triples, 2)))

# Chinese relations do need translating (they contain no French):
# collect the distinct non-ASCII relation names.
purezh = filter(!isascii, unique(getindex.(zh_triples, 2)))
# write_xlsx("excel_data/triple_rels.xlsx", purezh) # export for translation

# Manual step: translate the exported file with Google Translate => MT_triple_rels.xlsx
"""
    rel_combine(txt::AbstractString)

Normalise a machine-translated relation name to lowerCamelCase.

The text is split on any run of whitespace; the first word is kept as it is and the
first letter of every following word is upper-cased before the words are joined
without separators. Leading and trailing whitespace is ignored, so it can no longer
cause the first word to be capitalised. Empty or blank input yields `""`.

# Examples
```julia
rel_combine("official language")   # "officialLanguage"
rel_combine(" official language ") # "officialLanguage"
rel_combine("capital")             # "capital"
```
"""
function rel_combine(txt::AbstractString)
    # `split` without a delimiter splits on whitespace and drops empty fields.
    words = split(txt)
    isempty(words) && return ""
    return first(words) * join(uppercasefirst.(words[2:end]))
end

# Machine-translated relation names, normalised to camelCase by `rel_combine`.
# They are paired positionally with `purezh` below.
pureen = rel_combine.(read_xlsx("excel_data/MT_triple_rels.xlsx"))

# `zip` silently stops at the shorter input, so a row-count mismatch between the
# exported relations and the translated spreadsheet would otherwise go unnoticed.
length(purezh) == length(pureen) ||
    @warn "purezh and MT_triple_rels.xlsx differ in length; relation mapping may be misaligned" n_zh=length(purezh) n_en=length(pureen)

# Relation dictionary zh => en; a relation without a translation is kept unchanged.
zh2en_dict = Dict(zh => en for (zh, en) in zip(purezh, pureen))
zh2en_rel(zh) = get(zh2en_dict, zh, zh)

# Translate the relation of every aligned Chinese triple, then merge with the
# English triples and drop exact duplicates.
en_triples_byzh = [(sub, zh2en_rel(rel), obj) for (sub, rel, obj) in zh_triples_byILLs]
triples = unique!(vcat(en_triples, en_triples_byzh))

# Duplicate check: number of triples removed by `unique!`.
print(length(zh_triples) + length(en_triples) - length(triples))

String[]
744

In [33]:
# Write both graphs to disk, one tab-separated triple per line.
for (path, rows) in (
    ("translate/triple_en_byzh.txt", en_triples_byzh),
    ("translate/triples.txt", triples),
)
    open(path, "w") do io
        foreach(row -> println(io, join(row, '\t')), rows)
    end
end

### 训练集翻译

In [34]:
"""
    lower(st)

Lowercase the first letter of a question that begins with one of the known
interrogative openers ("What ", "Which ", "Where ", "Do ", "In which ", "Who ",
"How "). Any other string is returned unchanged.

Only the start of the string is examined, so capitalised words inside the sentence
(for example the "Who " in "Doctor Who ") are left alone.
"""
function lower(st)
    for opener in ("What ", "Which ", "Where ", "Do ", "In which ", "Who ", "How ")
        startswith(st, opener) && return lowercasefirst(st)
    end
    return String(st)
end

lower (generic function with 1 method)

In [35]:
# Replace the entity mention of every training question with the placeholder "NER".
# Questions in which the entity cannot be found are dropped; `train_kept` records the
# positions that survive so the solutions can be filtered the same way further down.
train_ques = String[]
train_kept = Int[]
for (i, (que, ner)) in enumerate(train_ques_ner)
    ner, que = _shift(ner), replace(que, '？'=>'?')
    if occursin(ner, que)
        push!(train_ques, replace(que, ner=>"NER"))
        push!(train_kept, i)
    end
end

# Split the questions by language.
french_ques = filter(isfrench, train_ques)
chinese_ques = filter(!isfrench, train_ques)

# Export both groups for machine translation.
write_xlsx("excel_data/french_ques.xlsx", french_ques)
write_xlsx("excel_data/chinese_ques.xlsx", chinese_ques)

# Build original => translated lookups from the translated spreadsheets
# (paired positionally with the exported questions).
MT_french_ques = lower.(read_xlsx("excel_data/MT_french_ques.xlsx"))
MT_chinese_ques = lower.(read_xlsx("excel_data/MT_chinese_ques.xlsx"))
MT_french = Dict((=>).(french_ques, MT_french_ques))
MT_chinese = Dict((=>).(chinese_ques, MT_chinese_ques))
MT_que(que) = haskey(MT_french, que) ? MT_french[que] : MT_chinese[que]

# Translate the questions.
MT_train_ques = MT_que.(train_ques)

# Translate the solution triples: a solution tagged "zh" is looked up in the
# translated Chinese graph, any other solution only has its tag stripped.
MT_triple_zh = Dict((=>).(zh_triples, en_triples_byzh))
MT_sol(sol::Tuple) =  first(sol) == "zh" ? MT_triple_zh[sol[2:end]] : sol[2:end]
# Keep only the solutions of the questions that were kept above, so questions and
# solutions stay paired when they are zipped together later.
# NOTE(review): assumes `train_sols` is index-aligned with `train_ques_ner` — confirm.
MT_train_sols = [MT_sol.(train_sols[i]) for i in train_kept]

14077-element Vector{Vector{Tuple{String, String, String}}}:
 [("瑞曼", "knownFor", "State_Peace_and_Development_Council"), ("State_Peace_and_Development_Council", "titleLeader", "List_of_Presidents_of_Myanmar")]
 [("茶裏王", "related", "Ito_En"), ("Ito_En", "logo", "File:ITO_EN-logo.jpg")]
 [("夜櫻作戰", "result", "Surrender_of_Japan"), ("Surrender_of_Japan", "format", "Ogg")]
 [("Ball_Bearing_(advertisement)", "client", "Lexus"), ("Lexus", "parent", "豐田汽車")]
 [("Cake_(advertisement)", "client", "Škoda_Auto"), ("Škoda_Auto", "parent", "奧迪福斯集團")]
 [("Changes_(advertisement)", "client", "Volkswagen"), ("Volkswagen", "parent", "大众集团")]
 [("Lots_Road_South_Tower", "client", "Hutchison_Whampoa"), ("Hutchison_Whampoa", "parent", "長江和記實業")]
 [("1957_Mongolia_earthquake", "countriesAffected", "Mongolia"), ("Mongolia", "officialText", "Mongolian_script")]
 [("1957_Mongolia_earthquake", "countriesAffected", "Mongolia"), ("Mongolia", "officialText", "蒙古语西里尔字母")]
 [("262_Southwest_Anatolia_earthquake", "c

In [37]:
# Training data: the question on one line, then one tab-separated triple per line,
# then a blank line as record separator.
open("translate/train_data.txt", "w") do io
    for (que, sols) in zip(MT_train_ques, MT_train_sols)
        println(io, que)
        foreach(sol -> println(io, join(sol, '\t')), sols)
        println(io)
    end
end

# Training questions: question, head entity of the first triple, then the relation
# of every triple, all tab-separated on a single line.
open("translate/train_ques.txt", "w") do io
    for (que, sols) in zip(MT_train_ques, MT_train_sols)
        head = sols[1][1]
        relations = [sol[2] for sol in sols]
        println(io, join(vcat([que, head], relations), '\t'))
    end
end

### 验证集翻译

In [38]:
# Read the validation data (tab-separated fields, one record per line) and print the
# indices of the records whose second field (the entity) is empty.
# `read(path, String)` opens and closes the file itself, so no handle is left open.
txts = split(strip(read("extract/valid_data_ner.txt", String)), '\n')
valid_ques_ner = [split(txt, '\t') for txt in txts]
println(findall(i->isempty(i[2]), valid_ques_ner))

Int64[]


In [40]:
# Replace the entity mention with "NER" and the full-width question mark with '?',
# then split the questions by language.
valid_ques_rep = map(valid_ques_ner) do (que, ner)
    replace(que, _shift(ner) => "NER", '？'=>'?')
end
valid_french_ques = filter(isfrench, valid_ques_rep)
valid_chinese_ques = filter(!isfrench, valid_ques_rep)

# Export for machine translation (disabled).
# write_xlsx("excel_data/valid_french.xlsx", valid_french_ques)
# write_xlsx("excel_data/valid_chinese.xlsx", valid_chinese_ques)

# Manual step: Google Translate => MT_xxx
# Load the translated questions and build original => translated lookups.
valid_french_byMT = map(lower, read_xlsx("excel_data/MT_valid_french.xlsx"))
valid_chinese_byMT = map(lower, read_xlsx("excel_data/MT_valid_chinese.xlsx"))
MT_valid_french = Dict(valid_french_ques .=> valid_french_byMT)
MT_valid_chinese = Dict(valid_chinese_ques .=> valid_chinese_byMT)
function MT_valid_que(que)
    haskey(MT_valid_french, que) && return MT_valid_french[que]
    return MT_valid_chinese[que]
end

# Translate the questions.
MT_valid_ques = map(MT_valid_que, valid_ques_rep)

# Write "question<TAB>entity" lines; an entity found in ILLs_zh_en is replaced by
# its mapped name, any other entity is written as it is.
open("translate/valid_ques.txt", "w") do io
    for (que, fields) in zip(MT_valid_ques, valid_ques_ner)
        ner = last(fields)
        println(io, que, '\t', get(ILLs_zh_en, ner, ner))
    end
end
end