# 知识图谱与问题翻译
概要/说明：
- 翻译文件放在 `translate/`，翻译过程的交互文件放在 `excel_data/`
- 在 Julia 中输入 `include("translatedata.jl")` 导入翻译后的数据

主要内容：
- 翻译准备工作
    - 尝试了 Tensor2Tensor 的预训练模型，但官方文档只找到 `德文=>英文` 现成的工具，数据和调参不会弄（弃用）
    - 谷歌翻译支持 API 调用，但官方 API 需付费，而 GitHub 第三方编写的 API 不稳定（弃用）
    - 最后使用谷歌提供的文档翻译功能（免费，但仅支持 pdf, excel 等，不支持 txt 文本）
    - 其中 PDF 翻译容易产生排版错位，所以决定用 Excel 进行数据交互
    - Excel 相关函数代码参见 `../src/xlsx.jl`
- 翻译知识图谱
    - 用官方提供的 ILLs 文件将中文三元组转化到英文三元组中
    - 将英文三元组以及对齐后的中文三元组写入 Excel 文件，并进行翻译
    - 读入翻译后的 Excel 文件，微调格式(统一小写等)
    - 将中英文结果合并去重，写入 `translate/`
- 翻译训练集和验证集
    - 由于谷歌翻译一次只能处理一种语言，我们将问题分为 `英法` 和 `汉语` 两类进行翻译
    - 为避免命名实体在翻译后丢失，我们先将实体名替换为 NER 再做翻译工作
    - 替换也利于区分语言，我们通过字符编码模式区分两类问题，分别进行翻译
    - 训练集翻译和提取结果放在 `translate/`
- 相关说明
    - 处理图谱和问题的函数代码放在 `../src/translate.jl` 文件下
    - 后续计算发现，即使先对齐再翻译，还是有少量对齐结果被破坏，比如
    ```jl
    emperor_go-mizunoo => emperor_go - mizunoo # 空格
    克里特埃米爾國 => emirate_of_crete => emirates_of_crete # 单复数
    ```

> 由于 ILLs 不足以提供所有的对齐关系，实体翻译还会进一步处理

In [15]:
# Load the project helpers. According to the notes above, the Excel helper functions
# live in `xlsx.jl` and the knowledge-graph / question processing functions live in
# `translate.jl`.
include("../src/xlsx.jl")
include("../src/extractdata.jl")
include("../src/translate.jl")
# Create the two output directories used by this notebook (no-op if they already exist).
mkpath.(["excel_data", "translate"])

2-element Vector{String}:
 "excel_data"
 "translate"

### 实体对齐

In [16]:
# Entity alignment: map Chinese names onto English ones through the ILLs table.
"""
    zh2en_byILLs(st::String)
    zh2en_byILLs(triple::Tuple)

Translate an entity or relation name with the `ILLs_zh_en` lookup table. A name
without an entry is returned unchanged. A tuple is translated element by element.
"""
function zh2en_byILLs(st::String)
    return get(ILLs_zh_en, st, st)
end
zh2en_byILLs(triple::Tuple) = map(zh2en_byILLs, triple)

# Apply the alignment to every Chinese triple.
zh_triples_byILLs = zh2en_byILLs.(zh_triples)

104941-element Vector{Tuple{String, String, String}}:
 ("Sichuan", "capital", "成都市")
 ("Sichuan", "西北", "青海")
 ("Sichuan", "北", "甘肃")
 ("Sichuan", "東北", "陕西")
 ("Sichuan", "西", "西藏")
 ("Sichuan", "東", "重庆")
 ("Sichuan", "南", "云南")
 ("Faye_Wong", "語言", "國語")
 ("Faye_Wong", "語言", "粤語")
 ("Faye_Wong", "語言", "英語")
 ("Faye_Wong", "配偶", "竇唯")
 ("Faye_Wong", "配偶", "李亞鵬")
 ("袁世凯", "monarch", "宣統帝")
 ⋮
 ("古特曼·贝拉", "managerclubs", "沃绍什体育俱乐部")
 ("港區_(東京都)", "自治體", "品川區")
 ("卡伦顿战役", "result", "詹姆斯党")
 ("演播时刻", "network", "英國廣播公司第二台")
 ("亚当·博格丹", "nationalteam", "Hungary_national_under-21_football_team")
 ("江陵之战_(208年)", "combatant", "孙权")
 ("江陵之战_(208年)", "commander", "曹仁")
 ("闲山岛海战", "commander", "九鬼嘉隆")
 ("Kyodo_Television", "majorShareholder", "關西電視台")
 ("稻生之戰", "place", "尾張國")
 ("阿利安薩石油", "league", "哥倫比亞足球甲級聯賽")
 ("長島一向一揆", "partof", "石山合戰")

### 知识图谱翻译

In [18]:
# One-off export step, kept for reference: write the ILLs-aligned triples to Excel so
# that the sheet can be translated with Google's document translation.
# write_excel_triple(zh_triples_byILLs, "excel_data/triples/triple_zh.xlsx")

# After Google translation: load the translated sheet back in.
zh_triples_MT = read_excel_triple("excel_data/triples/triple_zh.xlsx")

# Normalise both graphs to lowercase.
zh_triples_lower = map(triple -> lowercase.(triple), zh_triples_MT)
en_triples_lower = map(triple -> lowercase.(triple), en_triples)

# Merge the two graphs and de-duplicate the result in place.
triples = vcat(en_triples_lower, zh_triples_lower)
unique!(triples)

# Number of duplicate triples that were dropped (1904).
length(en_triples) + length(zh_triples) - length(triples)

1904

In [19]:
# Dump each triple collection to disk, one tab-separated triple per line.
for (path, rows) in (
    ("translate/triple_zh_lower.txt", zh_triples_lower),
    ("translate/triple_en_lower.txt", en_triples_lower),
    ("translate/triples.txt", triples),
)
    open(path, "w") do io
        foreach(row -> println(io, join(row, '\t')), rows)
    end
end

### 训练集翻译

In [6]:
# Keep only the non-garbled questions, i.e. those in which the entity mention can be
# found, and replace that mention with the placeholder "NER".
train_ques = String[]
new_sols = Vector{NTuple{4, String}}[]
for ((question, mention), sol) in zip(train_ques_ner, train_sols)
    mention = _2space(mention)
    question = replace(question, '？' => '?')  # full-width -> ASCII question mark
    occursin(mention, question) || continue
    push!(train_ques, replace(question, mention => "NER"))
    push!(new_sols, sol)
end

# One-off export step, kept for reference: classify the questions and save them to file.
# MT_questions(train_ques, "excel_data/ques/train_ques")

# After Google translation: read the translated questions back in.
MT_train_ques = MT_questions(train_ques, "excel_data/ques/train_ques"; write=false)

14051-element Vector{SubString{String}}:
 "what is the title leader of the bay that NER is famous for?"
 "what is the logo of the car that NER is related to?"
 "which format does the cause of " ⋯ 40 bytes ⋯ "during world war ii, belong to?"
 "what is the parent mountain of the client of NER?"
 "which mountain range does the client of NER belong to?"
 "which mountain range does the client of NER belong to?"
 "which mountain range does the c" ⋯ 17 bytes ⋯ "kyscraper in london, belong to?"
 "which official language is used by the country that is affected by NER?"
 "what is the official language o" ⋯ 20 bytes ⋯ "s affected by NER, an erdbeben?"
 "what is the leader name of the country that is affected by NER?"
 "who leads the country that is a" ⋯ 70 bytes ⋯ "south coasts of anatolia (262)?"
 "who is the author of the works that make NER known?"
 "who is the author of the works that make NER known?"
 ⋮
 "do you know who is the leader of the Northwest of NER's Birthplace"
 "what time zone us

In [7]:
# Translate the knowledge-graph triples referenced by the training answers.
# Lookup tables from an original triple to its translated, lowercased counterpart.
MT_triple_zh = Dict(Pair.(zh_triples, zh_triples_lower))
MT_triple_en = Dict(Pair.(en_triples, en_triples_lower))

"""
    MT_sol(sol::Tuple)

Return the translated triple for `sol`. The first element of `sol` selects the table
(`"zh"` uses `MT_triple_zh`, anything else uses `MT_triple_en`); the remaining elements
form the lookup key.
"""
function MT_sol(sol::Tuple)
    table = first(sol) == "zh" ? MT_triple_zh : MT_triple_en
    return table[sol[2:end]]
end

MT_train_sols = [MT_sol.(chain) for chain in new_sols]

14051-element Vector{Vector{Tuple{String, String, String}}}:
 [("reman", "knownfor", "state_peace_and_development_council"), ("state_peace_and_development_council", "titleleader", "list_of_presidents_of_myanmar")]
 [("king_of_tea", "related", "ito_en"), ("ito_en", "logo", "file:ito_en-logo.jpg")]
 [("night_sakura", "result", "surrender_of_japan"), ("surrender_of_japan", "format", "ogg")]
 [("ball_bearing_(advertisement)", "client", "lexus"), ("lexus", "parent", "toyota_cars")]
 [("cake_(advertisement)", "client", "škoda_auto"), ("škoda_auto", "parent", "audifoss_group")]
 [("changes_(advertisement)", "client", "volkswagen"), ("volkswagen", "parent", "volkswagen_group")]
 [("lots_road_south_tower", "client", "hutchison_whampoa"), ("hutchison_whampoa", "parent", "cheung_kong_hutchison_industries")]
 [("1957_mongolia_earthquake", "countriesaffected", "mongolia"), ("mongolia", "officialtext", "mongolian_script")]
 [("1957_mongolia_earthquake", "countriesaffected", "mongolia"), ("mongolia",

In [20]:
# Full training file: the question on one line, then one tab-separated triple per
# line, then a blank line that separates the records.
open("translate/train_data.txt", "w") do io
    for (raw_que, chain) in zip(MT_train_ques, MT_train_sols)
        println(io, refine_que(raw_que))
        foreach(triple -> println(io, join(triple, '\t')), chain)
        println(io)
    end
end

# Compact training file: the question, the head entity of the first triple, and the
# relation of every triple, all tab-separated on a single line.
open("translate/train_ques.txt", "w") do io
    for (raw_que, chain) in zip(MT_train_ques, MT_train_sols)
        question = refine_que(raw_que)
        head = chain[1][1]
        relations = [triple[2] for triple in chain]
        println(io, join(vcat([question, head], relations), '\t'))
    end
end

### 验证集翻译

In [9]:
# Load the validation set: one record per line, tab-separated, destructured below as
# (question, entity mention).
# `read(path, String)` opens and closes the file itself, so no handle is left open.
txts = split(strip(read("ner_data/valid_data_ner.txt", String)), '\n')
valid_ques_ner = [split(txt, '\t') for txt in txts]

# Replace the entity mention with the placeholder "NER" and turn the full-width
# question mark into the ASCII one.
valid_ques = [replace(que, _2space(ner) => "NER", '？'=>'?') for (que, ner) in valid_ques_ner]

# One-off export step, kept for reference: classify the questions and save them to file.
# MT_questions(valid_ques, "excel_data/ques/valid_ques")

# After Google translation: read the translated questions back in.
valid_ques_MT = MT_questions(valid_ques, "excel_data/ques/valid_ques"; write=false)

1500-element Vector{SubString{String}}:
 "where is the constituency of th" ⋯ 26 bytes ⋯ ", a hong kong politician, from?"
 "who is the successor of the parent of NER, an Irish politician?"
 "who is the leader of the administrative region to which NER belongs?"
 "who preceded the parent of NER, an american economist?"
 "which draft team does the autho" ⋯ 60 bytes ⋯ "ariq and peter gunz, belong to?"
 "who is before the parent of NER" ⋯ 54 bytes ⋯ " minister of india (1889-1964)?"
 "where does the location province of NER, a mine in mongolia, belong to?"
 "who does the origin of NER influence's name?"
 "what is the genre of the author of NER, a song performed by carlos santana?"
 "which title does the parent of NER, an american economist, belong to?"
 "what is the parent mountain of the client of NER?"
 "who did a job before the parent of NER?"
 "what is the type of the people, things and things with the same name as NER?"
 ⋮
 "do you know what time zone is i" ⋯ 66 bytes ⋯ " community of 

In [10]:
# Triples
"""
    readtriples(path)

Read the triple file at `path` and return a `Vector{NTuple{3, String}}`.

Leading and trailing whitespace of the whole file is stripped, the text is split into
lines, and every non-empty line is split on tabs and converted to a 3-tuple of
strings. Blank lines are skipped, so an empty file yields an empty vector.
"""
function readtriples(path)
    # `read(path, String)` opens and closes the file itself, so no handle is left open.
    text = strip(read(path, String))
    lines = filter(!isempty, split(text, '\n'))
    return NTuple{3, String}[NTuple{3, String}(split(line, '\t')) for line in lines]
end
# Load the raw English and Chinese triple files from `raw_triple/`.
raw_en_triples = readtriples("raw_triple/triple_en.txt")
raw_zh_triples = readtriples("raw_triple/triple_zh.txt")

104941-element Vector{Tuple{String, String, String}}:
 ("四川省", "capital", "成都市")
 ("四川省", "西北", "青海")
 ("四川省", "北", "甘肃")
 ("四川省", "東北", "陕西")
 ("四川省", "西", "西藏")
 ("四川省", "東", "重庆")
 ("四川省", "南", "云南")
 ("王菲", "語言", "國語")
 ("王菲", "語言", "粤語")
 ("王菲", "語言", "英語")
 ("王菲", "配偶", "竇唯")
 ("王菲", "配偶", "李亞鵬")
 ("袁世凯", "monarch", "宣統帝")
 ⋮
 ("古特曼·贝拉", "managerclubs", "沃绍什体育俱乐部")
 ("港區_(東京都)", "自治體", "品川區")
 ("卡伦顿战役", "result", "詹姆斯党")
 ("演播时刻", "network", "英國廣播公司第二台")
 ("亚当·博格丹", "nationalteam", "匈牙利21歲以下國家足球隊")
 ("江陵之战_(208年)", "combatant", "孙权")
 ("江陵之战_(208年)", "commander", "曹仁")
 ("闲山岛海战", "commander", "九鬼嘉隆")
 ("共同電視", "majorShareholder", "關西電視台")
 ("稻生之戰", "place", "尾張國")
 ("阿利安薩石油", "league", "哥倫比亞足球甲級聯賽")
 ("長島一向一揆", "partof", "石山合戰")

In [12]:
# Map each raw head entity to its translated, lowercased form.
# The translated side is rebuilt from the per-language lists instead of being taken
# from `triples`: `triples` is de-duplicated in place with `unique!`, so it no longer
# lines up index-for-index with the raw triples.
# NOTE(review): assumes the raw files list their triples in the same order as
# `en_triples` / `zh_triples` — confirm.
ner_dict = Dict((=>).(
    first.(vcat(raw_en_triples, raw_zh_triples)),
    first.(vcat(en_triples_lower, zh_triples_lower)),
))

Dict{String, String} with 132085 entries:
  "Xenophon_Overton_Pindall"            => "xenophon_overton_pindall"
  "Jingzhou"                            => "jingzhou"
  "Vyškov_District"                     => "vyškov_district"
  "Bajmok"                              => "bajmok"
  "學托擴_(堪薩斯州)"                   => "chautauqua,_kansas"
  "Tiszabercel"                         => "tiszabercel"
  "贝尔纳多特王朝"                      => "bernadotte_dynasty"
  "天主教奥三棉示总教区"                => "catholic_archdiocese_of_osmanthus"
  "威廉·格萊斯頓"                       => "william_gladstone"
  "薩卡特卡斯礦工"                      => "zacatecas_miners"
  "La_Bazoque,_Calvados"                => "la_bazoque,_calvados"
  "沙陶"                                => "sáta"
  "Le_Mans"                             => "le_mans"
  "恐怖蠟像館"                          => "house_of_wax_(2005_film)"
  "Luxembourg_Stock_Exchange"           => "luxembourg_stock_exchange"
  "Beatriz_Corredor"                    => "beatriz_corredor"
  "M

In [14]:
# Write the validation questions together with their translated entity name.
# The original entity names are matched through `ner_dict`; a name without an entry is
# kept as it is.
open("translate/valid_ques.txt", "w") do io
    # Built once up front: membership in a Set avoids scanning every value of
    # `ner_dict` for each question.
    known_entities = Set(values(ner_dict))
    for (que, ner) in zip(valid_ques_MT, last.(valid_ques_ner))
        ner = get(ner_dict, ner, ner)
        # NOTE(review): `refine_que` is applied to the training questions but is
        # disabled here — confirm this is intended.
        # que = refine_que(que)
        println(io, que, '\t', ner)
        # Sanity check: every written entity must be a translated entity name.
        @assert ner ∈ known_entities
    end
end