# 知识图谱与问题翻译

### 概要/说明：
- `include("translatedata.jl")` 导入翻译后的数据
- 翻译文件放在 `translate/`
- 函数工具 `../src/xlsx.jl` 和 `../src/translate.jl`

细节：
- 准备工作
    - 尝试了 Tensor2Tensor 的预训练模型，但官方文档只找到 `德文=>英文` 现成的工具（弃用）
    - 谷歌翻译支持 API 调用，但官方 API 需付费，而 GitHub 第三方编写的 API 不稳定（弃用）
    - 最后使用谷歌提供的文档翻译功能（免费，仅支持 pdf, excel 等，不支持 txt 文本）
    - 最终方案：通过 Excel 文件进行数据交互，函数工具见 `../src/xlsx.jl`
- 翻译训练集和验证集
    - 我们将问题分 `英法` 和 `汉语` 两类进行单语言翻译
    - 翻译前将问题中的实体名替换为占位符 "NER"

> 由于官方提供的 `ILLs(zh-en).txt` 不足以提供所有实体的对齐，三元组翻译放在后边进行

### 导入数据和工具

In [1]:
include("../src/CCKS-mKGQA.jl")
include("../src/xlsx.jl")
include("../src/extractdata.jl")
include("../src/translate.jl")

MT_questions

### 翻译三元组

#### 翻译关系

In [2]:
# Collect the distinct relation names (second field of every triple).
rels = unique(map(triple -> triple[2], triples))
# Apply the `rel_rules` replacements; the result feeds the Excel export below.
rep_rels = [replace(rel, rel_rules...) for rel in rels]
# One-off export for translation, kept for reference:
# write_xlsx("excel_data/triples/rels.xlsx", rep_rels)
# Load the translated names and normalise them: trim, lowercase, remove spaces,
# and map the two Japanese leftovers to their English names.
MT_rels = map(read_xlsx("excel_data/triples/rels_MT.xlsx")) do cell
    normalised = lowercase(strip(cell))
    replace(normalised, " " => "", "ゲームジャンル" => "gamegenre", "ジャンル" => "genre")
end
# Raw relation name => translated relation name.
dict_rels = Dict(rels .=> MT_rels);

In [3]:
# Dump the raw => translated relation mapping, one tab-separated pair per line.
open("translate/rels_raw2new.txt", "w") do io
    foreach(rels, MT_rels) do raw, translated
        println(io, raw, '\t', translated)
    end
end

#### 翻译实体

In [3]:
# Entities of the English graph that `isenglish` rejects.
en_nonen = filter(word -> !isenglish(word), en_words)
# One-off export for translation, kept for reference:
# write_xlsx("excel_data/triples/en_words/en_nonenglish.xlsx", _2space.(en_nonen))
# Load their translations: trim each cell and turn spaces into underscores.
MT_en_nonen = map(read_xlsx("excel_data/triples/en_words/en_nonenglish_mt.xlsx")) do cell
    replace(strip(cell), ' ' => '_')
end
en_words_dict = Dict(en_nonen .=> MT_en_nonen)

# Entities of the Chinese graph that do not occur among the English-graph entities.
zh_nonen = filter(!in(Set(en_words)), zh_words)
# One-off export for translation, kept for reference:
# write_xlsx("excel_data/triples/zh_words/zh_nonenglish.xlsx", _2space.(zh_nonen))
# Load their translations with the same clean-up as above.
MT_zh_nonen = map(read_xlsx("excel_data/triples/zh_words/zh_nonenglish_en.xlsx")) do cell
    replace(strip(cell), ' ' => '_')
end
zh_words_dict = Dict(zh_nonen .=> MT_zh_nonen)

# Combine both tables, then map every entity in `words` to its translation,
# falling back to the entity itself when it has no entry.
dict_words = let translated = merge(en_words_dict, zh_words_dict)
    Dict(word => get(translated, word, word) for word in words)
end;

In [5]:
# Write the entity mapping, one "raw<TAB>translated" line per entity in `words`.
# The do-block form closes the file even if a lookup or write throws part-way,
# which the previous manual open/close pair did not guarantee.
open("translate/words_raw2new.txt", "w") do word_io
    for word in words
        println(word_io, word, '\t', dict_words[word])
    end
end

#### 对齐文件

In [4]:
# Translate both sides of every alignment pair, then keep only the pairs whose
# two sides still differ (3k+ pairs become identical after translation).
MT_ILLs = filter(
    ((en_mt, zh_mt),) -> en_mt != zh_mt,
    [(dict_words[en], dict_words[zh]) for (en, zh) in ILLs],
);
(length(ILLs), length(MT_ILLs))

(13055, 9802)

In [7]:
# open("translate/ILLs(zh-en).txt", "w") do io
#     for (en, zh) in MT_ILLs
#         println(io, en, '\t', zh)
#     end
# end 

#### 三元组 

In [5]:
"""
    mt_triple(sub, rel, obj)
    mt_triple(triple)

Translate a triple: `sub` and `obj` are looked up in `dict_words`, `rel` in
`dict_rels`. The one-argument form splats `triple` into the three-argument form.
Returns the tuple `(subject, relation, object)` of translated names.
"""
function mt_triple(sub, rel, obj)
    return (dict_words[sub], dict_rels[rel], dict_words[obj])
end
mt_triple(triple) = mt_triple(triple...)

mt_triple (generic function with 2 methods)

In [9]:
# open("translate/triples.txt", "w") do io
#     for triple in unique!(mt_triple.(triples))
#         println(io, join(triple, '\t'))
#     end
# end

### 训练集翻译

In [6]:
# Translated relation name for every item of `sol` (relation read from field 3).
sol2rel(sol) = map(item -> dict_rels[item[3]], sol)

# Replace the entity mention in each question with the placeholder "NER" and
# normalise the full-width question mark.
train_ques = [
    replace(que, _2space(ner) => "NER", '？' => '?') for (que, ner) in train_ques_ner
]
# Translate every answer triple; the first field of each `sol` is dropped first.
train_sols_mt = [[mt_triple(sol[2:end]) for sol in sols] for sols in train_sols]

# One-off export of the questions for translation, kept for reference:
# MT_questions(train_ques, "excel_data/ques/train_ques")

# Load the translated questions.
MT_train_ques = MT_questions(train_ques, "excel_data/ques/train_ques"; write=false);

In [11]:
# Write one line per training question:
# question<TAB>subject entity<TAB>relation[<TAB>relation...]
open("translate/train_ques.txt", "w") do io
    for (question, triples_mt) in zip(MT_train_ques, train_sols_mt)
        subject = triples_mt[1][1]  # subject of the first translated triple
        relations = (triple[2] for triple in triples_mt)
        println(io, join((question, subject, join(relations, '\t')), '\t'))
    end
end

# open("translate/train_data.txt", "w") do io
#     for (que, sols) in zip(MT_train_ques, train_sols_mt)
#         ner = sols[1][1]
#         println(io, que, '\t', ner)
#         for sol in sols
#             println(io, join(sol, '\t'))
#         end
#         println(io)
#     end
# end

### 验证集翻译

In [7]:
# Read the validation data. `read(path, String)` opens and closes the file itself;
# the previous `read(open(path, "r"), String)` never closed the stream it opened.
txts = split(strip(read("extract/valid_data_ner.txt", String)), '\n')
# Split every line on tabs (presumably question<TAB>entity - TODO confirm file layout).
valid_ques_ner = [split(txt, '\t') for txt in txts]

# Replace the entity mention in each question with the placeholder "NER" and
# normalise the full-width question mark.
valid_ques = [replace(que, _2space(ner) => "NER", '？'=>'?') for (que, ner) in valid_ques_ner]
# Translated name of the entity stored in the last field of every line.
valid_ners = getindex.(Ref(dict_words), last.(valid_ques_ner));

In [13]:
# One-off export of the questions for translation, kept for reference:
# MT_questions(valid_ques, "excel_data/ques/valid_ques")

# Load the translated questions (`write=false` presumably skips the export step;
# `MT_questions` is defined elsewhere - TODO confirm).
valid_ques_MT = MT_questions(valid_ques, "excel_data/ques/valid_ques"; write=false);

In [14]:
# Write "translated question<TAB>translated entity" per validation question.
# Original note: the untranslated entity names are matched from zh_triples,
# en_triples and ILLs.
open("translate/valid_ques.txt", "w") do io
    for (question, entity) in zip(valid_ques_MT, valid_ners)
        println(io, join((question, entity), '\t'))
    end
end

### 维基对齐实体
**仅用于观察对齐模型准确性，不参与训练**

In [13]:
# Load the wiki alignment pairs (presumably 2-tuples of entity names, given
# `size=2`; `readtuples` is defined elsewhere - TODO confirm).
wiki_ILLs = readtuples("extract/wiki_ills.txt";size=2);

In [14]:
# Translate both sides of every wiki alignment pair, drop duplicate pairs, then
# keep only the pairs whose two sides still differ after translation.
MT_wiki_ILLs = filter(
    ((en_mt, zh_mt),) -> en_mt != zh_mt,
    unique([(dict_words[en], dict_words[zh]) for (en, zh) in wiki_ILLs]),
)
# Report how many pairs remain versus how many were loaded.
println(join((length(MT_wiki_ILLs), length(wiki_ILLs)), '\t'))

29655	42072


In [15]:
# Pairs of MT_ILLs that are missing from MT_wiki_ILLs.
filter(∉(Set(MT_wiki_ILLs)), MT_ILLs) # empty result => MT_ILLs is contained in MT_wiki_ILLs

Tuple{String, String}[]

In [18]:
# # 10913 个实体在翻译后被对齐
# open("translate/ILLs_wiki.txt", "w") do io
#     for (en, zh) in MT_wiki_ILLs
#         println(io, en, '\t', zh)
#     end
# end