## 翻译工作

### 读取数据

In [11]:
include("../src/readfiles.jl")

导入成功，内容概要：
    | 变量名 | 说明 |
    | ---- | ---- |
    | zh_triples/en_triples | 三元组 |
    | zh_subs/en_subs | 头实体 |
    | zh_objs/en_objs | 尾实体 |
    | zh_rels/en_rels | 关系 |
    | zh_entity/en_entity | 实体（头和尾） |
    | ILLs | 英文 => 中文对齐 |
    | ILLs_zh_en | 中文 => 英文对齐|
    | train_data | 训练集 |


### ILLs 翻译
1. `triple_zh` => `triple_zh_ills`
2. `triples` = `triple_en + triple_zh_ills`
3. 通过 `triples` 计算三跳信息

#### 三元组-实体对齐

In [1]:
"用 ILLs 翻译实体和关系"
function zh2en(triple)
    sub, rel, obj = triple
    sub ∈ keys(ILLs_zh_en) && (sub = ILLs_zh_en[sub])
    rel ∈ keys(ILLs_zh_en) && (rel = ILLs_zh_en[rel])
    obj ∈ keys(ILLs_zh_en) && (obj = ILLs_zh_en[obj])
    return (sub, rel, obj)
end

zh2en

In [3]:
# 处理，合并三元组
en_triples_ills = [zh2en(triple) for triple in zh_triples]
triples = unique!(vcat(en_triples, en_triples_ills))
# 存在 736 个重复
println(length.([en_triples_ills, en_triples, triples]))

[104941, 162544, 266749]


In [4]:
triple_repeat = filter(in(Set(en_triples)), en_triples_ills)
println(length(triple_repeat))
open("../data/ILLs/triple_repeat.txt", "w") do io
    for triple in triple_repeat
        println(io, join(triple, '\t'))
    end
end
# ("雷", "format", "Ogg") => ("Thunder", "format", "Ogg")

736


In [68]:
# 写入数据
# open("../data/ILLs/triple_zh_ills.txt", "w") do io
#     for triple in en_triples_ills
#         println(io, join(triple, '\t'))
#     end
# end

# open("../data/ILLs/triples.txt", "w") do io
#     for triple in triples
#         println(io, join(triple, '\t'))
#     end
# end

#### 三跳计算量

In [114]:
using DataStructures

# 初始化边集
edges = DefaultDict{String, Vector{String}}(Vector{String})
for (sub, _, obj) in triples
    push!(edges[sub], obj)
end

# 多跳函数 O(N*frequence*jump) ≈ O(N*jump)
function multi_jumps(edges::AbstractDict; num=3)
    jumps = Dict{String, Vector{Int}}(sub => [length(val)] for (sub, val) in edges)
    for i in 1:(num-1), (sub, objs) in edges
        new_num = sum(haskey(jumps, obj) ? jumps[obj][i] : 1 for obj in objs)
        push!(jumps[sub], new_num)
    end
    jumps
end

multi_jumps (generic function with 1 method)

In [115]:
# 只需要考虑头实体
# 频次高主要是尾实体形式
jumps = multi_jumps(edges, num=4)
sort(jumps; by=i->last(jumps[i]), rev=true)

OrderedDict{String, Vector{Int64}} with 123248 entries:
  "Antonio_Segni"                         => [57, 369, 2855, 16834]
  "Giovanni_Leone"                        => [34, 248, 1751, 11127]
  "Fernando_Tambroni"                     => [20, 233, 1451, 10367]
  "Giovanni_Gronchi"                      => [24, 221, 1335, 9618]
  "Ma_Ying-jeou"                          => [35, 220, 1472, 9301]
  "Louis_the_German"                      => [26, 232, 1366, 8486]
  "Koosharem,_Utah"                       => [21, 236, 1361, 7904]
  "Giuseppe_Pella"                        => [33, 189, 1111, 7649]
  "Louis_the_Pious"                       => [33, 240, 1232, 7430]
  "Maurice_Bourgès-Maunoury"              => [47, 189, 1615, 6950]
  "Lien_Chan"                             => [29, 172, 1140, 6891]
  "Itō_Hirobumi"                          => [28, 173, 1069, 6848]
  "吳敦義"                                => [20, 167, 1036, 6680]
  "Louis_the_Younger"                     => [15, 167, 1021, 6413]
  "Yam

In [87]:
# # 保存结果
# open("../data/ILLs/jumps.txt", "w") do io
#     for (sub, val) in sort(jumps; by=i->last(jumps[i]), rev=true)
#         println(io, sub, '\t', join(val, '\t'))
#     end
# end

### 机器翻译

#### 三元组翻译
1. 关系提取过程只需要子图附近的边集，不需要输入节点
2. `triple_zh_ills` => `triple_en_2.txt`
3. 汇总 `triples.txt`，格式如下

   |  subject | relation | object |
   | ------- | -------- | ------ | 
   |  头实体 | 关系 | 尾实体 | 

In [5]:
using XLSX
pure_zh = filter(!isascii, filter(!in(en_rels), zh_rels))
# # 写入文件
# XLSX.openxlsx("../data/translate/pure_zh.xlsx", mode="w") do xf
#     sheet = xf[1]
#     sheet["A1:A255"] = reshape(pure_zh, 255, 1)
# end

# 格式标准化
function rel_combine(txt::AbstractString)
    txts = split(txt, ' ')
    length(txts) == 1 && return txt
    first(txts) * join(uppercasefirst.(txts[2:end]))
end

xf = XLSX.readxlsx("../data/translate/pure_zh_translate.xlsx")
translate = rel_combine.(strip.(Vector{String}(xf[1][:][:])))

# 转化关系-注意格式
zh2en_rel = Dict(zh=>en for (zh, en) in zip(pure_zh, translate))

Dict{String, AbstractString} with 255 entries:
  "西"   => "West"
  "编剧" => "screenwriter"
  "流經" => "flowThrough"
  "軍種" => "military"
  "北"   => "north"
  "首長… => "Chief'sName"
  "分類" => "Classification"
  "頻道" => "channel"
  "樂器" => "musicalInstrument"
  "开国… => "foundingMonarch"
  "父"   => "father"
  "腳本" => "script"
  "出生… => "dateOfBirth"
  "類"   => "kind"
  "使用… => "UseThePlatform"
  "电视… => "TVStation"
  "聲優" => "seiyuu"
  "国籍" => "CountryOfCitizenship"
  "受影… => "affectedBy"
  "元配… => "FatherOfYuanPei"
  "皇居" => "ImperialPalace"
  "执政… => "rulingParty"
  "隶属" => "affiliation"
  "分封… => "Packer"
  "擁有… => "owner"
  ⋮      => ⋮

In [8]:
# 写入文件
# open("../data/translate/triple_en_2.txt", "w") do io
#     for (sub, rel, obj) in en_triples_ills
#         haskey(zh2en_rel, rel) && (rel = zh2en_rel[rel])
#         println(io, join([sub, rel, obj], '\t'))
#     end
# end

# open("../data/translate/triples.txt", "w") do io
#     for (sub, rel, obj) in unique!(vcat(en_triples, en_triples_ills))
#         haskey(zh2en_rel, rel) && (rel = zh2en_rel[rel])
#         println(io, join([sub, rel, obj], '\t'))
#     end
# end

#### 还原翻译
`triples.txt => triple_en_1.txt/triple_en_2.txt => raw_triple`

In [23]:
# 读入翻译后的
en_triples_1 = readtriples("translate/triple_en_1.txt")
en_triples_2 = readtriples("translate/triple_en_2.txt")

# 读入翻译前的数据
raw_en_triples = readtriples("raw_triple/triple_en.txt")
raw_zh_triples = readtriples("raw_triple/triple_zh.txt")

MT_en = Dict((=>).(raw_en_triples, en_triples_1))
MT_zh = Dict((=>).(raw_zh_triples, en_triples_2))
MT_en_rev = Dict((=>).(en_triples_1, raw_en_triples))
MT_zh_rev = Dict((=>).(en_triples_2, raw_zh_triples))

Dict{Tuple{String, String, String}, Tuple{String, String, String}} with 104940 entries:
  ("比拉达马特", "subdivis… => ("比拉达马特", "subdivisionName", "加泰罗尼亚")
  ("Sieniawa", "subdivisio… => ("谢尼亚瓦", "subdivisionName", "普热沃斯克县")
  ("第二次中东战争", "plac… => ("第二次中东战争", "place", "中東")
  ("小行星2048", "discover… => ("小行星2048", "discoverySite", "帕洛马山")
  ("喬·沙翠亞尼", "genre",… => ("喬·沙翠亞尼", "genre", "硬搖滾")
  ("蓬蒂布兰卡", "subdivis… => ("蓬蒂布兰卡", "subdivisionName", "马托格罗索州")
  ("湯姆·卡波", "state", "… => ("湯姆·卡波", "state", "德拉瓦州")
  ("Hisashi_Kimura", "birt… => ("木村荣", "birthPlace", "日本")
  ("古斯塔沃·卡纳莱斯", "y… => ("古斯塔沃·卡纳莱斯", "youthclubs", "拉普拉塔体操…
  ("毁灭者_(电影)", "starr… => ("毁灭者_(电影)", "starring", "凡妮莎·威廉斯")
  ("小行星8071", "discover… => ("小行星8071", "discoverySite", "可可尼诺县")
  ("韦斯利·克拉克", "alleg… => ("韦斯利·克拉克", "allegiance", "美国陆军")
  ("凌時差音樂", "keyPeopl… => ("凌時差音樂", "keyPeople", "蔡依林")
  ("埃尔南·克雷斯波", "clu… => ("埃尔南·克雷斯波", "clubs", "拉素體育會")
  ("Christian_Wulff", "off… => ("克里斯蒂安·武尔夫", "office", "德国联邦总统")
  ("蘭開斯特縣

#### 问题翻译
1. 训练集 `train_data.txt`，格式如下

   | index | question | NER | rel1 | rel2 | rel3 |
    | ----- | ---------| --- | ---- | ---- | ---- |
    | 索引 | 问题 | 命名实体 | 关系1 | 关系2 | 关系3 |

2. 测试集 `valid_data.txt` 格式如下
   | index | question | NER |
    | ----- | ---------| --- |
    | 索引 | 问题 | 命名实体 |

In [108]:
# 存在重边，但不同节点的情况，且比例不小
sub_edges = DefaultDict{String, Vector{String}}(Vector{String})
for (sub, edge, obj) in triples
    push!(sub_edges[sub], edge)
end
num = count(i->length(unique(i))!=length(i), values(sub_edges))
println(num,'\t', num/length(sub_edges))

25164	0.204173698559003


In [26]:
# 语言模式-用于区分英文，法语
txt = "U+20-5F, U+61-7A, U+7C, U+A0, U+A7, U+A9, U+AB, U+B2-B3, U+BB, U+C0, U+C2, U+C6-CB, U+CE-CF, U+D4, U+D9, U+DB-DC, U+E0, U+E2, U+E6-EB, U+EE-EF, U+F4, U+F9, U+FB-FC, U+FF, U+152-153, U+178, U+2B3, U+2E2, U+1D48-1D49, U+2010-2011, U+2013-2014, U+2019, U+201C-201D, U+2020-2021, U+2026, U+202F-2030, U+20AC, U+2212"
code_range = [strip(t)[3:end] for t in split(txt, ',')]
french = Char[]
for code in code_range
    if '-' ∉ code
        push!(french, Char(parse(Int, "0x"*code)))
    else
        st, en = split(code, '-')
        append!(french, Char.(parse(Int, "0x"*st):parse(Int, "0x"*en)))
    end
end
french = join(french)
isfrench(c::AbstractChar) = c ∈ french
isfrench(txt::AbstractString) = all(isfrench, txt)
println(french)

 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_abcdefghijklmnopqrstuvwxyz| §©«²³»ÀÂÆÇÈÉÊËÎÏÔÙÛÜàâæçèéêëîïôùûüÿŒœŸʳˢᵈᵉ‐‑–—’“”†‡… ‰€−


In [62]:
# 通过 train_data 返回问题的 NER 和关系
function get_NER_rels(que::AbstractString)
    rels = String[]
    for (i, sol) in enumerate(train_data[que])
        lang, sol = first(sol), sol[2:end]
        sol = lang == "zh" ? MT_zh[sol] : MT_en[sol]
        i == 1 && push!(rels, first(sol))
        push!(rels, sol[2])
    end
    rels
end

get_NER_rels (generic function with 2 methods)

In [107]:
# 问题分两类-英法、中文
train_ques = split(rstrip(read(open("extract/valid_ques.txt", "r"), String)), '\n')
train_ques_ner = similar(train_ques)
for (i, que) in enumerate(train_ques)
    ner = replace(train_data[que][1][2], '_'=>' ')
    train_ques_ner[i] = replace(que, ner=>"NER")
end
french_ques, chinese_ques, french_ques_ner, chinese_ques_ner = [String[] for _ in 1:4]
for (q1, q2) in zip(train_ques, train_ques_ner)
    if isfrench(q2)
        push!(french_ques, q1)
        push!(french_ques_ner, q2)
    else
        push!(chinese_ques, q1)
        push!(chinese_ques_ner, q2)
    end
end

In [100]:
# 保存为 Excel
# XLSX.openxlsx("translate/french_ques.xlsx", mode="w") do xf
#     sheet = xf[1]
#     n = length(french_ques_ner)
#     sheet["A1:A$n"] = reshape(french_ques, n, 1)
# end
# XLSX.openxlsx("translate/chinese_ques.xlsx", mode="w") do xf
#     sheet = xf[1]
#     n = length(chinese_ques_ner)
#     sheet["A1:A$n"] = reshape(chinese_ques, n, 1)
# end

4351×1 Matrix{String}:
 "NER的所属国家的立法机构属于哪个王朝"
 "NER的领导人的国籍是什么"
 "NER的位置西部的地点是什么选区"
 "NER的母校效力的总统是谁"
 "NER，一位中国政治家（1913-2010），他的头衔的提名者由谁担任"
 "a古巴的城市NER的所属群岛属于哪个区划"
 "掌管NER的人的代理人是谁"
 "日本政治家NER扮演过属于哪个党派"
 "NER的位置建立的标志事件采用哪种国家体制"
 "NER的后裔的上一代的前任是什么"
 "NER影响到的国家和地区的首都（或省会）在哪儿"
 "NER的领导人的领导机构是什么"
 "NER的最大城市东北边的海洋叫什么"
 ⋮
 "菲律宾的一个政党NER的领导人所属的党派是什么"
 "NER，一位美国律师和政治家，他的母校隶属于哪个机构"
 "NER的领导人的分支是哪个"
 "NER的镇长的所属区域是什么"
 "NER的著作是谁导演的作品"
 "法国国王（1544-1560）NER的母亲的后裔的上一代是什么"
 "NER的母校隶属于哪个机构"
 "NER的控制者用什么货币"
 "NER的郡西南部是什么地方"
 "NER的选举获胜者住过的地方有哪些"
 "越南统治者NER的父亲的下一任是谁"
 "丹麦独立视频游戏开发商和出版商NER所在国家有什么代表"

In [104]:
xf = XLSX.readxlsx("../data/translate/MT_chinese_ques.xlsx")
chinese_en_ques = Vector{String}(strip.(xf[1][:][:]))
xf = XLSX.readxlsx("../data/translate/MT_french_ques.xlsx")
french_en_ques = Vector{String}(strip.(xf[1][:][:]))

9135-element Vector{String}:
 "who is the developer of the sponsor of NER?"
 "where did the one that is after election of NER, a wahl, study?"
 "where does the event involved in NER occur?"
 "Where did Le descendant de NER," ⋯ 21 bytes ⋯ "n and general (1789-1848), die?"
 "who is the spouse of the leader of NER?"
 "who managed the tenant of the s" ⋯ 40 bytes ⋯ "voetbalcompetitie, takes place?"
 "which country does the alma mater of NER belong to?"
 "which religion does the one that is after election of NER belong to?"
 "which place is called as the su" ⋯ 35 bytes ⋯ "of NER, a bergwerk in mongolei?"
 "which city does the alma mater of NER belong to?"
 "Do you know where the religion of The leader of NER comes from"
 "which label does the author of NER belong to?"
 "Who will be The first of The title of NER next"
 ⋮
 "which sport does the tenant of the location of NER play?"
 "who is the monarch of the one that is after election of NER?"
 "where does the successor of the deputy of NER be

In [112]:
open("translate/train_data.txt", "w") do io
    for (q1, q2) in zip(chinese_en_ques, chinese_ques)
        rels = get_NER_rels(q2)
        println(io, q1,'\t', join(rels, '\t'))
    end
    for (q1, q2) in zip(french_en_ques, french_ques)
        rels = get_NER_rels(q2)
        println(io, q1,'\t',  join(rels, '\t'))
    end
end

#### 小结
1. `translate/triples.txt` 记录知识图谱
2. `translate/train_data.txt` 记录问题，NER（根节点），关系1-3
3. `translate/valid_data.txt` 记录问题，NER（根节点）

现在就差对训练数据的训练，并在测试集上验证