# 知识图谱与问题翻译

### 概要/说明：
- `include("translatedata.jl")` 导入翻译后的数据
- 翻译文件放在 `translate/`
- 函数工具 `../src/xlsx.jl` 和 `../src/translate.jl`

细节：
- 准备工作
    - 尝试了 Tensor2Tensor 的预训练模型，但在官方文档中只找到 `德文=>英文` 的现成工具（弃用）
    - 谷歌翻译支持 API 调用，但官方 API 需付费，而 GitHub 第三方编写的 API 不稳定（弃用）
    - 最后使用谷歌提供的文档翻译功能（免费，仅支持 pdf, excel 等，不支持 txt 文本）
    - 最终方案：使用 Excel 进行数据交互，读写工具见 `../src/xlsx.jl`
- 翻译训练集和验证集
    - 我们将问题分 `英法` 和 `汉语` 两类进行单语言翻译
    - 翻译前将实体名替换为 "NER" 

> 由于官方提供的 `ILLs(zh-en).txt` 不足以提供所有实体的对齐，三元组翻译放在后边进行

### 导入数据和工具

In [1]:
include("../src/xlsx.jl")
include("../src/extractdata.jl")
include("../src/translate.jl")
# Create the output directories used by the cells below; `mkpath` also creates missing
# parent directories and does not fail when a directory already exists.
mkpath.(["excel_data", "translate", "translate/triples", "translate/words", "translate/ILLs"])

5-element Vector{String}:
 "excel_data"
 "translate"
 "translate/triples"
 "translate/words"
 "translate/ILLs"

### 翻译三元组

#### 翻译关系

In [2]:
# A small set of hand-written rewrite rules for relation names.
# The pairs are splatted together into a single `replace` call when the relation lists
# are built (`replace.(…, rel_rules...)`).
rel_rules = (
    # Non-English relation names mapped directly to an English form.
    "mittelstädte" => "medium-sized_towns",
    "écoulement" => "flow",
    "résidenceOfficielle" => "official_residence",
    "siège" => "headquarters",
    # Drop the numeric prefix in front of "上一節目" / "下一節目".
    # NOTE(review): the first pattern matches exactly one digit (`\d`) while the second
    # matches one or more (`\d+`) — confirm whether this difference is intentional.
    r"\d上一節目" => "上一節目",
    r"\d+下一節目" => "下一節目",
    # Any remaining "é" is replaced by a plain "e".
    "é" => "e");

In [20]:
# Collect the distinct relation names (2nd field of every triple) of both graphs,
# after applying the rewrite rules, then merge them into one list.
zh_rels = unique(replace.([triple[2] for triple in zh_triples], rel_rules...))
en_rels = unique(replace.([triple[2] for triple in en_triples], rel_rules...))
rels = union(zh_rels, en_rels)
# Report the number of relations: Chinese graph, English graph, merged.
println([length(zh_rels), length(en_rels), length(rels)])

[1142, 1635, 2216]


In [4]:
# Export step (kept for reference): write the relation list to Excel for translation.
# write_xlsx("excel_data/triples/rels.xlsx", rels)
# Read the translated relations back: trim, lowercase, remove spaces, and patch the two
# entries that came back in Japanese.
MT_rels = replace.(
    lowercase.(strip.(read_xlsx("excel_data/triples/rels_MT.xlsx"))),
    " "=>"", "ゲームジャンル" => "gamegenre", "ジャンル" => "genre",
)
# Map each original relation to its translation; both lists are paired position by position.
dict_rels = Dict(rels .=> MT_rels)

Dict{String, String} with 2216 entries:
  "schoolTradition"           => "schooltradition"
  "action"                    => "action"
  "mainOrgan"                 => "mainorgan"
  "西"                        => "west"
  "garrisonLabel"             => "garrisonlabel"
  "namedFor"                  => "namedfor"
  "demoCd"                    => "democd"
  "upperUnit"                 => "upperunit"
  "municunit"                 => "municunit"
  "locationCity"              => "locationcity"
  "軍種"                      => "military"
  "blankData"                 => "blankdata"
  "subprefectures"            => "subprefectures"
  "分類"                      => "classification"
  "firstRider"                => "firstrider"
  "postNoms"                  => "postnoms"
  "worldPlace"                => "worldplace"
  "placeofbirth"              => "placeofbirth"
  "开国君主"                  => "foundingmonarch"
  "continentalcup1Qualifiers" => "continentalcup1qualifiers"
  "honorificPrefix"           => 

In [21]:
# Save the relation translation table, one "original<TAB>translation" pair per line.
open("translate/words/rels_zh2en.txt", "w") do out
    foreach(zip(rels, MT_rels)) do (original, translated)
        println(out, original, '\t', translated)
    end
end

#### 翻译实体

In [9]:
# Entities of the English graph: distinct subjects, distinct objects, and their union.
en_subs = unique!(first.(en_triples))
en_objs = unique!(last.(en_triples))
en_words = union(en_subs, en_objs)
enset = Set(en_words)

# Entities of the English graph that `isenglish` rejects are translated separately.
en_nonen = filter(word -> !isenglish(word), en_words)
# Export step (kept for reference):
# write_xlsx("excel_data/triples/en_words/en_nonenglish.xlsx", _2space.(en_nonen))
# Read the translations, trim them, and use underscores instead of spaces.
MT_en_nonen = replace.(
    strip.(read_xlsx("excel_data/triples/en_words/en_nonenglish_fr_ge_vn.xlsx")),
    ' ' => '_',
)
en_words_dict = Dict(en_nonen .=> MT_en_nonen)

# Entities of the Chinese graph: distinct subjects, distinct objects, and their union.
zh_subs = unique!(first.(zh_triples))
zh_objs = unique!(last.(zh_triples))
zh_words = union(zh_subs, zh_objs)
zhset = Set(zh_words)

# Only Chinese-graph entities that do not already occur in the English graph are translated.
zh_nonen = filter(word -> !(word in enset), zh_words)
write_xlsx("excel_data/triples/zh_words/zh_nonenglish.xlsx", _2space.(zh_nonen))
MT_zh_nonen = replace.(
    strip.(read_xlsx("excel_data/triples/zh_words/zh_nonenglish_en.xlsx")),
    ' ' => '_',
)
zh_words_dict = Dict(zh_nonen .=> MT_zh_nonen)

# Combine both vocabularies and both translation dictionaries.
words = union(zh_words, en_words)
words_dict = Dict(union(zh_words_dict, en_words_dict))

Dict{String, String} with 48566 entries:
  "1914–15_NHA_season"      => "1914-15_NHA_season"
  "De_Eendracht,_Gietervee… => "From_Eendracht,_Gieterveen"
  "Maple_Leaf_Sports_&_Ent… => "Maple_Leaf_Sports_&_Entertainment"
  "Kovácshida"              => "Kovacshida"
  "Xi_County,_Henan"        => "Xi_County,_Henan"
  "2007_European_Baseball_… => "2007_European_Baseball_Championship_-_Qualifica…
  "Vyškov_District"         => "Vyskov_District"
  "Peter_Brown_(rugby_leag… => "Peter_Brown_(rugby_league)"
  "Lubowidz,_Masovian_Voiv… => "Lubowidz,_Masovian_Voivodeship"
  "Beire_(Paredes)"         => "Beire_(Walls)"
  "La_Bazoque,_Calvados"    => "La_Bazoque,_Calvados"
  "Glades_County,_Florida"  => "Glades_County,_Fla."
  "Province_of_Jaén_(Spain… => "Province_of_Jaen_(Spain)"
  "Szydłowiec_County"       => "Szydlowiec_County"
  "Libert_H._Boeynaems"     => "Libert_H._Boeynaems"
  "The_Million_Dollar_Hote… => "The_Million_Dollar_Hotel_(soundtrack)"
  "Finley,_Washington"      => "Finley,_Wash.

In [16]:
# Dump every entity list as "original<TAB>translation" lines; entities without an entry
# in words_dict are written unchanged in both columns.
for (path, names) in (
    ("translate/words/zh_subs.txt", zh_subs),
    ("translate/words/zh_objs.txt", zh_objs),
    ("translate/words/zh_words.txt", zh_words),
    ("translate/words/en_subs.txt", en_subs),
    ("translate/words/en_objs.txt", en_objs),
    ("translate/words/en_words.txt", en_words),
)
    open(path, "w") do io
        for name in names
            println(io, name, '\t', get(words_dict, name, name))
        end
    end
end

In [17]:
# Collects every translated triple of both graphs; deduplicated for the merged export.
triples = NTuple{3, String}[]

# Export the English graph first, then the Chinese graph, one tab-separated triple per line.
for (path, source) in (
    ("translate/triples/en_triples.txt", en_triples),
    ("translate/triples/zh_triples.txt", zh_triples),
)
    open(path, "w") do io
        for (sub, rel, obj) in source
            # Entities without a translation are kept unchanged.
            sub = get(words_dict, sub, sub)
            obj = get(words_dict, obj, obj)
            # dict_rels is keyed by relation names *after* rel_rules were applied, so the
            # raw name has to be normalized before the lookup; otherwise every relation
            # touched by a rule would silently stay untranslated. The raw name is still
            # tried as a fallback, and kept as-is when neither key exists.
            key = replace(rel, rel_rules...)
            rel = get(dict_rels, key, get(dict_rels, rel, rel))
            triple = (sub, rel, obj)
            push!(triples, triple)
            println(io, join(triple, '\t'))
        end
    end
end

# Merged Chinese + English triples without duplicates.
write_txt("translate/triples/triples.txt", unique!(triples))

# Aligned entity pairs. Pairs whose two sides are identical after translation are skipped
# (original note: a bit more than 3261 pairs become aligned after translation).
open("translate/ILLs/ILLs(zh-en).txt", "w") do io
    for (en, zh) in ILLs
        en = get(words_dict, en, en)
        zh = get(words_dict, zh, zh)
        en == zh && continue
        println(io, en, '\t', zh)
    end
end


### 训练集翻译

In [29]:
# Translate one raw relation name. `dict` is keyed by relation names after `rules` were
# applied, so the name is normalized first; the raw name is used as a fallback so lookups
# that already worked keep working. Throws a KeyError when neither form is a key.
function _translate_rel(dict, rules, rel)
    key = replace(rel, rules...)
    return haskey(dict, key) ? dict[key] : dict[rel]
end

# Extract the relations of a solution: take the 3rd field of every element of `sol` and
# return its translation from `dict_rels`.
sol2rel(sol) = _translate_rel.(Ref(dict_rels), Ref(rel_rules), getindex.(sol, 3))

sol2rel (generic function with 1 method)

In [30]:
# Build the training data: keep only samples whose entity mention actually occurs in the
# question (this drops garbled samples) and mask the mention with the placeholder "NER".
# Outputs: masked questions, translated entity names, translated relation lists.
train_ques = String[]
train_ners = String[]
train_rels = Vector{String}[]
for ((que_raw, ner), sol) in zip(train_ques_ner, train_sols)
    ner_text = _2space(ner)
    que_ascii = replace(que_raw, '？'=>'?')
    # Skip samples where the mention cannot be found in the question.
    occursin(ner_text, que_ascii) || continue
    push!(train_ques, replace(que_ascii, ner_text=> "NER"))
    push!(train_ners, get(words_dict, ner, ner))
    push!(train_rels, sol2rel(sol))
end

In [31]:
# Save the questions (export step, kept for reference).
# MT_questions(train_ques, "excel_data/ques/train_ques")

# Read the translated questions back.
# NOTE(review): presumably `write=false` makes MT_questions skip the export and only load
# the translated files — confirm against its definition.
MT_train_ques = MT_questions(train_ques, "excel_data/ques/train_ques";write=false)

14051-element Vector{String}:
 "what is the title leader of the bay that NER is famous for?"
 "what is the logo of the car that NER is related to?"
 "which format does the cause of " ⋯ 40 bytes ⋯ "during world war ii, belong to?"
 "what is the parent mountain of the client of NER?"
 "which mountain range does the client of NER belong to?"
 "which mountain range does the client of NER belong to?"
 "which mountain range does the c" ⋯ 17 bytes ⋯ "kyscraper in london, belong to?"
 "which official language is used by the country that is affected by NER?"
 "what is the official language o" ⋯ 20 bytes ⋯ "s affected by NER, an erdbeben?"
 "what is the leader name of the country that is affected by NER?"
 "who leads the country that is a" ⋯ 70 bytes ⋯ "south coasts of anatolia (262)?"
 "who is the author of the works that make NER known?"
 "who is the author of the works that make NER known?"
 ⋮
 "do you know who is the leader of the Northwest of NER's Birthplace"
 "what time zone uses North of

In [32]:
# Write the translated training set, one sample per line:
# question<TAB>entity<TAB>relation_1<TAB>relation_2...
# "translate/ques" is not among the directories created at the top of the notebook, so it
# is created here; `mkpath` does nothing when the directory already exists.
mkpath("translate/ques")
open("translate/ques/train_ques.txt", "w") do io
    for (que, ner, que_rels) in zip(MT_train_ques, train_ners, train_rels)
        println(io, que, '\t', ner, '\t', join(que_rels, '\t'))
    end
end

### 验证集翻译

In [22]:
# Read the validation data; each line is split on tabs into (question, entity).
# `read(path, String)` opens and closes the file itself, whereas the former
# `read(open(path, "r"), String)` never closed the handle it opened.
txts = split(strip(read("extract/valid_data_ner.txt", String)), '\n')
valid_ques_ner = [split(txt, '\t') for txt in txts]

# Mask the entity mention with the placeholder "NER" and replace the full-width question mark.
valid_ques = [replace(que, _2space(ner) => "NER", '？'=>'?') for (que, ner) in valid_ques_ner]

# Translated entity names; entities without an entry in words_dict are kept unchanged.
valid_ners = [get(words_dict, ner, ner) for ner in last.(valid_ques_ner)]

1500-element Vector{AbstractString}:
 "Felix_Chung"
 "Francis_Russell,_Marquess_of_Tavistock"
 "Governor_of_Genoa"
 "Franklin_Delano_Roosevelt_III"
 "The_Way_It's_Goin'_Down"
 "Jawaharlal_Nehru"
 "Khuut_coal_mine"
 "Aryabhata_(crater)"
 "They_All_Went_to_Mexico"
 "Franklin_Delano_Roosevelt_III"
 "Ira_Goldstein"
 "Francis_Russell,_Marquess_of_Tavistock"
 "USS_Mauna_Kea_(AE-22)"
 ⋮
 "Vall-llobrega"
 "USCGC_Dallas_(WHEC-716)"
 "Ragnar_Kravan"
 "Junior_Paulo_(rugby_league,_born_1993)"
 "List_of_National_Cultural_Treasures_in_the_Philippines"
 "Protagoras_(crater)"
 "2014-15_Gonzaga_Bulldogs_women's_basketball_team"
 "2015-16_Utah_State_Aggies_men's_basketball_team"
 "Documentary"
 "Trujillo_State_Anthem"
 "Tanaecia"
 "2011-12_UCLA_Bruins_men's_basketball_team"

In [24]:
# Write the questions to file (export step, kept for reference).
# MT_questions(valid_ques, "excel_data/ques/valid_ques")

# Read the translated files back.
# NOTE(review): presumably `write=false` makes MT_questions skip the export and only load
# the translated files — confirm against its definition.
valid_ques_MT = MT_questions(valid_ques, "excel_data/ques/valid_ques"; write=false)

1500-element Vector{String}:
 "where is the constituency of th" ⋯ 26 bytes ⋯ ", a hong kong politician, from?"
 "who is the successor of the parent of NER, an Irish politician?"
 "who is the leader of the administrative region to which NER belongs?"
 "who preceded the parent of NER, an american economist?"
 "which draft team does the autho" ⋯ 60 bytes ⋯ "ariq and peter gunz, belong to?"
 "who is before the parent of NER" ⋯ 54 bytes ⋯ " minister of india (1889-1964)?"
 "where does the location province of NER, a mine in mongolia, belong to?"
 "who does the origin of NER influence's name?"
 "what is the genre of the author of NER, a song performed by carlos santana?"
 "which title does the parent of NER, an american economist, belong to?"
 "what is the parent mountain of the client of NER?"
 "who did a job before the parent of NER?"
 "what is the type of the people, things and things with the same name as NER?"
 ⋮
 "do you know what time zone is i" ⋯ 66 bytes ⋯ " community of catalonia, 

In [25]:
# Write the translated validation set, one "question<TAB>entity" pair per line.
# Original note: the entity names are the translated ones; the old names are matched
# from zh_triples, en_triples and ILLs.
# "translate/ques" is not among the directories created at the top of the notebook, so it
# is created here; `mkpath` does nothing when the directory already exists.
mkpath("translate/ques")
open("translate/ques/valid_ques.txt", "w") do io
    for (que, ner) in zip(valid_ques_MT, valid_ners)
        println(io, que, '\t', ner)
    end
end

### 维基对齐实体
**仅用于观察对齐模型准确性，不参与训练**

In [34]:
# Load the Wikipedia entity alignments; each line is split on tabs into one tuple.
# `read(path, String)` opens and closes the file itself, whereas the former
# `read(open(path, "r"), String)` never closed the handle it opened.
txts = split(strip(read("extract/wiki_ills.txt", String)), '\n')
ILLs_valid = Tuple.(split.(txts, '\t'))

42073-element Vector{Tuple{SubString{String}, SubString{String}}}:
 ("Lakas–CMD", "基督教穆斯林民主力量党")
 ("Zhu_Shanlu", "朱善璐")
 ("Chaoshan", "潮汕地區")
 ("Saran_district", "薩蘭縣")
 ("Carballo", "卡尔瓦略")
 ("My_Belarusy", "白俄罗斯共和国国歌")
 ("Klaus_Allofs", "克勞斯·阿洛夫斯")
 ("Alghero", "阿尔盖罗")
 ("West_Lafayette,_Indiana", "西拉法葉")
 ("Ernest_Mason_Satow", "薩道義")
 ("Afyonkarahisar", "阿菲永卡拉希萨尔")
 ("Stanley_Baldwin", "斯坦利·鲍德温")
 ("Thessaly", "色萨利")
 ⋮
 ("Tianjin_Quanjian_F.C.", "天津权健足球俱乐部")
 ("John_Kerry", "约翰·克里")
 ("Móra_d'Ebre", "莫拉德夫雷")
 ("Kano", "卡諾")
 ("Chengdu_Qianbao_F.C.", "成都钱宝足球俱乐部")
 ("Cyril_Newall,_1st_Baron_Newall", "西里尔·内维尔，第一代内维尔男爵")
 ("Jambi_City", "占碑市")
 ("Barneveld", "巴讷费尔德")
 ("List_of_Justices_of_the_Supreme_Court_of_the_United_States", "美国联邦最高法院大法官列表")
 ("New_Territories_West_(constituency)", "新界西選區")
 ("Kowloon_West_(constituency)", "九龍西選區")
 ("Leonid_Brezhnev", "列昂尼德·伊里奇·勃列日涅夫")

In [35]:
using DataStructures
"""
    sort_count(list)

Count the elements of `list` with `counter` and return the counts sorted in descending
order of frequency (the recorded notebook output is an `OrderedDict` of element => count).
"""
function sort_count(list)
    # Tally of the elements of `list`.
    c_list = counter(list)
    # Order the entries by the count looked up for each key, largest first.
    # NOTE(review): relies on the `sort` method for `Dict` from DataStructures — confirm it
    # is still available in the installed version.
    sort(Dict(c_list), by=i->c_list[i], rev=true)
end

# Frequency of each English entity name (first field) in the wiki alignments.
sort_count(first.(ILLs_valid))

OrderedDict{SubString{String}, Int64} with 37897 entries:
  "ZIP_Code"                                 => 8
  "Holy_Roman_Emperor"                       => 7
  "Postal_code"                              => 7
  "Independent_politician"                   => 7
  "Lutheranism"                              => 7
  "Barack_Obama"                             => 6
  "Texas"                                    => 6
  "Presbyterianism"                          => 6
  "Democratic_Party_(United_States)"         => 6
  "Republican_Party_(United_States)"         => 6
  "Electronic_Arts"                          => 5
  "Monarchy_of_the_United_Kingdom"           => 5
  "South_Korea"                              => 5
  "Dance-pop"                                => 5
  "Hip_hop_music"                            => 5
  "New_York_City"                            => 5
  "Sony_Mobile"                              => 5
  "United_States"                            => 5
  "New_South_Wales"                       

In [36]:
# Pairs of ILLs that are missing from ILLs_valid; the recorded result is empty, i.e. every
# pair of ILLs is contained in ILLs_valid.
filter(∉(Set(ILLs_valid)), ILLs) # containment relation holds

Tuple{String, String}[]

In [37]:
# Original note: 10913 entities become aligned after translation.
# Write the translated wiki alignments as "en<TAB>zh" lines, leaving out pairs whose two
# sides are identical once translated.
open("translate/ILLs_wiki.txt", "w") do out
    for pair in ILLs_valid
        en_name = get(words_dict, pair[1], pair[1])
        zh_name = get(words_dict, pair[2], pair[2])
        if en_name != zh_name
            println(out, en_name, '\t', zh_name)
        end
    end
end