# 知识图谱与问题翻译

### 概要/说明：
- `include("translatedata.jl")` 导入翻译后的数据
- 翻译文件放在 `translate/`
- 函数工具 `../src/xlsx.jl` 和 `../src/translate.jl`

细节：
- 准备工作
    - 尝试了 Tensor2Tensor 的预训练模型，但官方文档只找到 `德文=>英文` 现成的工具（弃用）
    - 谷歌翻译支持 API 调用，但官方 API 需付费，而 GitHub 第三方编写的 API 不稳定（弃用）
    - 最后使用谷歌提供的文档翻译功能（免费，仅支持 pdf, excel 等，不支持 txt 文本）
    - 最终方案：使用 Excel 文件进行数据交互，工具函数见 `../src/xlsx.jl`
- 翻译训练集和验证集
    - 我们将问题分 `英法` 和 `汉语` 两类进行单语言翻译
    - 翻译前将问题中的实体名替换为占位符 "NER"

> 由于官方提供的 `ILLs(zh-en).txt` 不足以提供所有实体的对齐，三元组翻译放在后边进行

### 导入数据和工具

In [37]:
include("../src/xlsx.jl")
include("../src/extractdata.jl")
include("../src/translate.jl")
# Directories that later cells read from or write into. `open(path, "w")` does not
# create missing parent directories, and "translate/ques" (target of the question
# files written further down) was not in the original list.
# The `excel_data/...` subdirectories hold the spreadsheets exchanged with the
# translator; creating them is a no-op when they already exist.
foreach(mkpath, [
    "translate/ques",
    "excel_data/triples/en_words",
    "excel_data/triples/zh_words",
    "excel_data/ques",
])
# Kept as the last expression so the cell still displays the same five paths.
mkpath.(["excel_data", "translate", "translate/triples", "translate/words", "translate/ILLs"])

5-element Vector{String}:
 "excel_data"
 "translate"
 "translate/triples"
 "translate/words"
 "translate/ILLs"

### 翻译三元组

#### 翻译关系

In [4]:
# A handful of hand-written rewrite rules for relation names. They are splatted
# into a single `replace` call, so all patterns are tried together on the input.
# NOTE(review): the first programme rule removes a single digit (`\d`) while the
# second removes a run of digits (`\d+`). This looks unintentional, but changing it
# would alter `rels`, which a later cell pairs by position with the translated
# relation list — confirm before touching.
rel_rules = (
    "mittelstädte" => "medium-sized_towns",
    "écoulement" => "flow",
    "résidenceOfficielle" => "official_residence",
    "siège" => "headquarters",
    r"\d上一節目" => "上一節目",
    r"\d+下一節目" => "下一節目",
    "é" => "e",
);

# Collect the distinct (rewritten) relation names of each knowledge graph; the
# relation is the second component of every triple.
zh_rels = unique!(map(t -> replace(t[2], rel_rules...), zh_triples))
en_rels = unique!(map(t -> replace(t[2], rel_rules...), en_triples))
rels = union(zh_rels, en_rels)
# Report the sizes: Chinese relations, English relations, combined.
println(map(length, [zh_rels, en_rels, rels]))

[1142, 1635, 2216]


In [5]:
# Export of the relation names to Excel (kept for reference, currently disabled):
# write_xlsx("excel_data/triples/rels.xlsx", rels)

# Read the translated relation names back and clean each cell: trim, lowercase,
# drop spaces, and map two leftover Japanese terms to English.
MT_rels = map(read_xlsx("excel_data/triples/rels_MT.xlsx")) do cell
    replace(lowercase(strip(cell)), " "=>"", "ゲームジャンル" => "gamegenre", "ジャンル" => "genre")
end
# Lookup table: relation name -> translated relation name (paired by position).
dict_rels = Dict(rels .=> MT_rels)

Dict{String, String} with 2216 entries:
  "schoolTradition"           => "schooltradition"
  "action"                    => "action"
  "mainOrgan"                 => "mainorgan"
  "西"                        => "west"
  "garrisonLabel"             => "garrisonlabel"
  "namedFor"                  => "namedfor"
  "demoCd"                    => "democd"
  "upperUnit"                 => "upperunit"
  "municunit"                 => "municunit"
  "locationCity"              => "locationcity"
  "軍種"                      => "military"
  "blankData"                 => "blankdata"
  "subprefectures"            => "subprefectures"
  "分類"                      => "classification"
  "firstRider"                => "firstrider"
  "postNoms"                  => "postnoms"
  "worldPlace"                => "worldplace"
  "placeofbirth"              => "placeofbirth"
  "开国君主"                  => "foundingmonarch"
  "continentalcup1Qualifiers" => "continentalcup1qualifiers"
  "honorificPrefix"           => 

In [6]:
# Save the relation table: one "relation<TAB>translation" pair per line.
open("translate/words/rels_zh2en.txt", "w") do out
    foreach(zip(rels, MT_rels)) do (original, translated)
        println(out, original, '\t', translated)
    end
end

#### 翻译实体

In [40]:
# --- English knowledge graph -------------------------------------------------
# Distinct subjects (first component) and objects (last component) of the triples.
en_subs = unique!(first.(en_triples))
en_objs = unique!(last.(en_triples))
en_words = union(en_subs, en_objs)
enset = Set(en_words)

# Entities of the English graph that `isenglish` rejects are sent through the
# spreadsheet round-trip: written out via `_2space`, read back, and spaces in the
# translations are turned into underscores again.
en_nonen = filter(!isenglish, en_words)
write_xlsx("excel_data/triples/en_words/en_nonenglish.xlsx", map(_2space, en_nonen))
MT_en_nonen = replace.(
    strip.(read_xlsx("excel_data/triples/en_words/en_nonenglish_mt.xlsx")),
    ' ' => '_',
)
en_words_dict = Dict(en_nonen .=> MT_en_nonen)
# Translated view of every English-graph entity (untranslated ones map to themselves).
MT_enwords = [get(en_words_dict, w, w) for w in en_words]

# --- Chinese knowledge graph -------------------------------------------------
zh_subs = unique!(first.(zh_triples))
zh_objs = unique!(last.(zh_triples))
zh_words = union(zh_subs, zh_objs)
zhset = Set(zh_words)

# Only the Chinese-graph entities that do not already occur in the English graph
# are translated.
zh_nonen = filter(w -> w ∉ enset, zh_words)
write_xlsx("excel_data/triples/zh_words/zh_nonenglish.xlsx", map(_2space, zh_nonen))
MT_zh_nonen = replace.(
    strip.(read_xlsx("excel_data/triples/zh_words/zh_nonenglish_en.xlsx")),
    ' ' => '_',
)
zh_words_dict = Dict(zh_nonen .=> MT_zh_nonen)
MT_zhwords = [get(zh_words_dict, w, w) for w in zh_words]

# --- Combined vocabulary and translation table -------------------------------
words = union(zh_words, en_words)
words_dict = Dict(union(zh_words_dict, en_words_dict))

Dict{String, String} with 116456 entries:
  "亞歷山大二世_(俄國)"    => "Alexander_II_(Russia)"
  "1914–15_NHA_season"     => "1914-15_NHA_season"
  "永遠的0"                => "forever_0"
  "巽他人"                 => "Sundanese"
  "利根川"                 => "Tonegawa"
  "狐狸與獵狗2"            => "The_fox_and_the_hound_2"
  "Xi_County,_Henan"       => "Xi_County,_Henan"
  "Vyškov_District"        => "Vyskov_District"
  "瑞尼维尔县"             => "Rainville_County"
  "學托擴_(堪薩斯州)"      => "Xuetuoquan_(Kansas)"
  "贝尔纳多特王朝"         => "Bernadotte_dynasty"
  "天主教奥三棉示总教区"   => "catholic_archdiocese_of_osmanthus"
  "威廉·格萊斯頓"          => "William_Gladstone"
  "薩卡特卡斯礦工"         => "zacatecas_miners"
  "屈伊赛"                 => "Quisay"
  "La_Bazoque,_Calvados"   => "La_Bazoque,_Calvados"
  "Glades_County,_Florida" => "Glades_County,_Fla"
  "沙陶"                   => "Satao"
  "史高比耶"               => "Scobie"
  "皮埃尔·比洛特"          => "Pierre_Bilot"
  "东洋大学"               => "Toyo_University"
  "万德拉阿莱拉"           => "van

In [41]:
# Write the entity tables. Every file has one "entity<TAB>translation" line per
# entity; entities without an entry in `words_dict` are written unchanged.
for (path, entities) in (
    "translate/words/zh_subs.txt" => zh_subs,
    "translate/words/zh_objs.txt" => zh_objs,
    "translate/words/zh_words.txt" => zh_words,
    "translate/words/en_subs.txt" => en_subs,
    "translate/words/en_objs.txt" => en_objs,
    "translate/words/en_words.txt" => en_words,
)
    open(path, "w") do out
        for entity in entities
            println(out, entity, '\t', get(words_dict, entity, entity))
        end
    end
end

#### 翻译三元组

In [77]:
# Translate both knowledge graphs, write each one to its own file, and collect
# every translated triple in `triples` for the combined export below.
triples = NTuple{3, String}[]
for (path, kg) in (
    "translate/triples/en_triples.txt" => en_triples,  # English triples
    "translate/triples/zh_triples.txt" => zh_triples,  # Chinese triples
)
    open(path, "w") do io
        for (sub, rel, obj) in kg
            # `dict_rels` is keyed by relation names *after* `rel_rules` has been
            # applied, so the raw relation must be rewritten the same way before
            # the lookup. Looking up the raw name silently left every rewritten
            # relation untranslated.
            relkey = replace(rel, rel_rules...)
            triple = (
                get(words_dict, sub, sub),
                get(dict_rels, relkey, rel),  # unknown relations stay unchanged
                get(words_dict, obj, obj),
            )
            push!(triples, triple)
            println(io, join(triple, '\t'))
        end
    end
end

# Combined file: all translated triples of both graphs, duplicates removed.
open("translate/triples/triples.txt", "w") do io
    for triple in unique!(triples)
        println(io, join(triple, '\t'))
    end
end

#### 对齐文件

In [86]:
# Translate both sides of every alignment pair, then keep only the pairs whose
# two sides still differ after translation.
MT_ILLs = filter(
    pair -> pair[1] != pair[2],
    map(((en, zh),) -> (get(words_dict, en, en), get(words_dict, zh, zh)), ILLs),
)

9802-element Vector{Tuple{String, String}}:
 ("Guixers", "jixels")
 ("Westvoorne", "west_forne")
 ("Ron_Johnson_(baseball)", "Ron_Johnson_(baseball_player)")
 ("May_Chan", "Chen_Jiajia")
 ("Catherine_of_Navarre", "Kathleen_(Navarra)")
 ("Pyin_Oo_Lwin_District", "Pyin_Oo_Lwin_County")
 ("Cyrano_Agency", "Big_Nose_Lover:_Love_Operation_Group")
 ("Qianlong_Emperor", "Emperor_Qianlong")
 ("Prince_Philip,_Duke_of_Orleans_(1869–1926)", "Philip_(Duke_of_Orleans)")
 ("Thomas_C_Kinkaid", "Thomas_Kinkade")
 ("1976-77_European_Cup", "1976–77_European_Champions_Cup")
 ("The_Good,_the_Bad_and_the_Ugly", "Twilight_Three")
 ("Park_Hills,_Kentucky", "Park_Hill_(Kentucky)")
 ⋮
 ("Lamar_County,_Mississippi", "Lamar_County_(Mississippi)")
 ("Billings_County,_North_Dakota", "Billings_County_(North_Dakota)")
 ("List_of_Premiers_of_Bermuda", "Prime_Minister_of_Bermuda")
 ("Kemper_County,_Mississippi", "Kemper_County_(Mississippi)")
 ("Morgan_County,_Tennessee", "Morgan_County_(Tennessee)")
 ("Eva_Cheng", "Z

In [87]:
# Aligned entities: 3k+ pairs became identical after translation and were dropped
# by the filter that built `MT_ILLs`; the remaining pairs are saved here.
open("translate/ILLs/ILLs(zh-en).txt", "w") do out
    foreach(pair -> println(out, pair[1], '\t', pair[2]), MT_ILLs)
end
# Pair counts before and after translation.
map(length, (ILLs, MT_ILLs))

(13055, 9802)

### 训练集翻译

In [88]:
# `dict_rels` is keyed by relation names after `rel_rules` has been applied, so a
# raw relation name is rewritten the same way before the lookup. Indexing with the
# raw name threw a `KeyError` for every relation that the rules rewrite.
# Unknown relations still raise `KeyError`.
_mt_rel(rel) = dict_rels[replace(rel, rel_rules...)]

"""
    sol2rel(sol)

Return the translated relation of every step in `sol`. The relation is read from
position 3 of each step.
"""
sol2rel(sol) = _mt_rel.(getindex.(sol, 3))

"""
    mt_triple(triple)
    mt_triple(sub, rel, obj)

Translate a `(subject, relation, object)` triple. Entities missing from
`words_dict` are returned unchanged; the relation must be known to `dict_rels`
(after rule-based rewriting), otherwise a `KeyError` is thrown.
"""
mt_triple(triple) = mt_triple(triple...)
mt_triple(sub, rel, obj) = (get(words_dict, sub, sub), _mt_rel(rel), get(words_dict, obj, obj))

sol2rel (generic function with 1 method)

In [90]:
# Build the training set: keep only the questions that literally contain their
# entity (anything else is treated as garbled text) and replace the entity by the
# placeholder "NER".
# `train_ques` holds the masked questions, `train_sols_mt` the translated triples
# of the matching solutions (the first field of every solution step is dropped).
train_ques = String[]
train_sols_mt = Vector{Tuple}[]
for ((question, entity), solution) in zip(train_ques_ner, train_sols)
    entity_text = _2space(entity)
    question = replace(question, '？'=>'?')  # full-width -> ASCII question mark
    occursin(entity_text, question) || continue  # skip garbled questions
    push!(train_ques, replace(question, entity_text=> "NER"))
    push!(train_sols_mt, [mt_triple(step[2:end]) for step in solution])
end

In [91]:
# Save the questions for translation (disabled; kept for reference)
# MT_questions(train_ques, "excel_data/ques/train_ques")

# Read the translated questions back.
# NOTE(review): `write=false` presumably skips the export step and only reads the
# existing translation — confirm against `MT_questions` in ../src/translate.jl.
MT_train_ques = MT_questions(train_ques, "excel_data/ques/train_ques";write=false)

14051-element Vector{String}:
 "what is the title leader of the bay that NER is famous for?"
 "what is the logo of the car that NER is related to?"
 "which format does the cause of " ⋯ 40 bytes ⋯ "during world war ii, belong to?"
 "what is the parent mountain of the client of NER?"
 "which mountain range does the client of NER belong to?"
 "which mountain range does the client of NER belong to?"
 "which mountain range does the c" ⋯ 17 bytes ⋯ "kyscraper in london, belong to?"
 "which official language is used by the country that is affected by NER?"
 "what is the official language o" ⋯ 20 bytes ⋯ "s affected by NER, an erdbeben?"
 "what is the leader name of the country that is affected by NER?"
 "who leads the country that is a" ⋯ 70 bytes ⋯ "south coasts of anatolia (262)?"
 "who is the author of the works that make NER known?"
 "who is the author of the works that make NER known?"
 ⋮
 "do you know who is the leader of the Northwest of NER's Birthplace"
 "what time zone uses North of

In [92]:
# Compact training file: one line per question holding the translated question,
# the entity of the first triple, and the relation of every triple, tab-separated.
open("translate/ques/train_ques.txt", "w") do out
    for (question, hops) in zip(MT_train_ques, train_sols_mt)
        entity = hops[1][1]
        relations = [hop[2] for hop in hops]
        println(out, question, '\t', entity, '\t', join(relations, '\t'))
    end
end

# Detailed training file: a "question<TAB>entity" header line, one line per
# translated triple, and a blank line closing each record.
open("translate/ques/train_data.txt", "w") do out
    for (question, hops) in zip(MT_train_ques, train_sols_mt)
        println(out, question, '\t', hops[1][1])
        foreach(hop -> println(out, join(hop, '\t')), hops)
        println(out)
    end
end

### 验证集翻译

In [93]:
# Load the validation data: each line is split on tabs into a question and its
# entity. `read(path, String)` opens and closes the file itself; the previous
# `read(open(path, "r"), String)` never closed the handle it opened.
txts = split(strip(read("extract/valid_data_ner.txt", String)), '\n')
valid_ques_ner = [split(txt, '\t') for txt in txts]

# Mask the entity in every question with "NER" and normalise the full-width
# question mark.
valid_ques = [replace(que, _2space(ner) => "NER", '？'=>'?') for (que, ner) in valid_ques_ner]
# Translated entity names (entities missing from `words_dict` stay unchanged).
valid_ners = [get(words_dict, ner, ner) for ner in last.(valid_ques_ner)]

1500-element Vector{AbstractString}:
 "Felix_Chung"
 "Francis_Russell,_Marquess_of_Tavistock"
 "Governor_of_Genoa"
 "Franklin_Delano_Roosevelt_III"
 "The_Way_It's_Goin'_Down"
 "Jawaharlal_Nehru"
 "Khuut_coal_mine"
 "Aryabhata_(crater)"
 "They_All_Went_to_Mexico"
 "Franklin_Delano_Roosevelt_III"
 "Ira_Goldstein"
 "Francis_Russell,_Marquess_of_Tavistock"
 "USS_Mauna_Kea_(AE-22)"
 ⋮
 "Vall-llobrega"
 "USCGC_Dallas_(WHEC-716)"
 "Ragnar_Kravan"
 "Junior_Paulo_(rugby_league,_born_1993)"
 "List_of_National_Cultural_Treasures_in_the_Philippines"
 "Protagoras_(crater)"
 "2014-15_Gonzaga_Bulldogs_women's_basketball_team"
 "2015-16_Utah_State_Aggies_men's_basketball_team"
 "Documentary"
 "Trujillo_State_Anthem"
 "Tanaecia"
 "2011-12_UCLA_Bruins_men's_basketball_team"

In [94]:
# Write the questions out for translation (disabled; kept for reference)
# MT_questions(valid_ques, "excel_data/ques/valid_ques")

# Read the translated file back.
# NOTE(review): `write=false` presumably skips the export step — confirm against
# `MT_questions` in ../src/translate.jl.
valid_ques_MT = MT_questions(valid_ques, "excel_data/ques/valid_ques"; write=false)

1500-element Vector{String}:
 "where is the constituency of th" ⋯ 26 bytes ⋯ ", a hong kong politician, from?"
 "who is the successor of the parent of NER, an Irish politician?"
 "who is the leader of the administrative region to which NER belongs?"
 "who preceded the parent of NER, an american economist?"
 "which draft team does the autho" ⋯ 60 bytes ⋯ "ariq and peter gunz, belong to?"
 "who is before the parent of NER" ⋯ 54 bytes ⋯ " minister of india (1889-1964)?"
 "where does the location province of NER, a mine in mongolia, belong to?"
 "who does the origin of NER influence's name?"
 "what is the genre of the author of NER, a song performed by carlos santana?"
 "which title does the parent of NER, an american economist, belong to?"
 "what is the parent mountain of the client of NER?"
 "who did a job before the parent of NER?"
 "what is the type of the people, things and things with the same name as NER?"
 ⋮
 "do you know what time zone is i" ⋯ 66 bytes ⋯ " community of catalonia, 

In [95]:
# Save the validation set: one "translated question<TAB>translated entity" line each.
# Author's note: the original named entities are matched against zh_triples,
# en_triples and ILLs.
open("translate/ques/valid_ques.txt", "w") do out
    foreach(zip(valid_ques_MT, valid_ners)) do (question, entity)
        println(out, question, '\t', entity)
    end
end

### 维基对齐实体
**仅用于观察对齐模型准确性，不参与训练**

In [104]:
# Load the wiki alignment pairs (one tab-separated pair per line).
# `read(path, String)` opens and closes the file itself; the previous
# `read(open(path, "r"), String)` never closed the handle it opened.
txts = split(strip(read("extract/wiki_ills.txt", String)), '\n')
wiki_ILLs = Tuple.(split.(txts, '\t'))
# Translate both sides of every pair and drop duplicates ...
MT_wiki_ILLs = unique!([(get(words_dict, en, en), get(words_dict, zh, zh)) for (en, zh) in wiki_ILLs])
# ... then keep only the pairs whose two sides still differ after translation.
MT_wiki_ILLs = filter(i->i[1]!=i[2], MT_wiki_ILLs)
# Remaining pairs vs. original number of pairs.
println(length(MT_wiki_ILLs),'\t', length(wiki_ILLs))

29656	42073


In [100]:
# List the pairs of MT_ILLs that are missing from MT_wiki_ILLs; an empty result
# means every translated ILL pair also occurs among the wiki pairs.
filter(∉(Set(MT_wiki_ILLs)), MT_ILLs) # a containment relation holds

Tuple{String, String}[]

In [105]:
# Author's note: 10913 entities end up aligned after translation.
# Save the remaining wiki alignment pairs, one tab-separated pair per line.
open("translate/ILLs/ILLs_wiki.txt", "w") do out
    foreach(pair -> println(out, pair[1], '\t', pair[2]), MT_wiki_ILLs)
end