# 数据预处理
主要内容：
- 提取链接中的实体和关系，纠正部分单词（不影响NER）
- 分割问题与解答
- 去除重复数据
- 纠正乱码数据

数据预处理的同时，也保留原始数据，用于最后提交

### 正则表达式

In [18]:
# Extract keywords from the links: set up the working directory and load the helpers.
# NOTE(review): `cd` with a relative path is not idempotent -- re-running this
# cell moves the working directory one more level; restart from the notebook's
# original directory before re-running.
cd("../data")
# Output directory for the cleaned files written by the cells below.
mkpath("extract")
# Load the regex helpers; the path is relative, so it is meant to resolve after
# the `cd` above. Presumably this defines `line2triple`, `get_txt`,
# `rectify_que` and the `*_obj_reg` / `*_rel_reg` patterns used later -- confirm
# against regexpr.jl.
include("../src/regexpr.jl")

rectify_que (generic function with 1 method)

### 英文关系

In [24]:
# Replacement rules applied to the extracted English triples: accented
# non-English tokens (left) are rewritten to plain English spellings (right).
en_rules = (
    "dépt" => "dept",
    "rég" => "reg",
    "mittelstädte" => "medium-sized_towns",
    "président" => "president",
    "écoulement" => "flow",
    "résidenceOfficielle" => "official_residence",
    "département" => "department",
    "siège" => "headquarters")

# Extract the English triples.
# For every non-blank input line two tab-separated records are written:
#   raw_triple/triple_en.txt -- the triple exactly as returned by `line2triple`
#   extract/triple_en.txt    -- the same triple after applying `en_rules`
# All files are opened with `do` blocks so the handles are closed even when
# `line2triple` throws half-way through the file.
mkpath("raw_triple")  # the setup cell only creates "extract"; make sure this one exists too
open("raw_data/triple_en.txt", "r") do io
    open("raw_triple/triple_en.txt", "w") do raw_io
        open("extract/triple_en.txt", "w") do extract_io
            for line in eachline(io)
                # Skip blank lines instead of stopping at the first one, so a
                # stray empty line cannot silently truncate the output.
                isempty(strip(line)) && continue
                txt = join(line2triple(line), '\t')
                println(raw_io, txt)
                println(extract_io, replace(txt, en_rules...))
            end
        end
    end
end

### 中文关系

In [23]:
# Replacement rules applied to the extracted Chinese triples:
#   - strip the numeric prefix in front of 上一節目 / 下一節目
#   - rewrite two accented French tokens to their English spelling
# Both numeric rules use `\d+` so that a multi-digit prefix is removed
# completely (a bare `\d` would turn "12上一節目" into "1上一節目").
zh_rules = (
    r"\d+上一節目" => "上一節目",
    r"\d+下一節目" => "下一節目",
    "région" => "region",
    "département" => "department")

# Extract the Chinese triples.
# For every non-blank input line two tab-separated records are written:
#   raw_triple/triple_zh.txt -- the triple exactly as returned by `line2triple`
#   extract/triple_zh.txt    -- the same triple after applying `zh_rules`
# All files are opened with `do` blocks so the handles are closed even when
# `line2triple` throws half-way through the file.
mkpath("raw_triple")  # the setup cell only creates "extract"; make sure this one exists too
open("raw_data/triple_zh.txt", "r") do io
    open("raw_triple/triple_zh.txt", "w") do raw_io
        open("extract/triple_zh.txt", "w") do extract_io
            for line in eachline(io)
                # Skip blank lines instead of stopping at the first one, so a
                # stray empty line cannot silently truncate the output.
                isempty(strip(line)) && continue
                txt = join(line2triple(line; en=false), '\t')
                println(raw_io, txt)
                println(extract_io, replace(txt, zh_rules...))
            end
        end
    end
end

### 实体对齐

In [22]:
# Extract the cross-lingual entity alignment as (english, chinese) pairs.
# Each input line is split on single spaces; field 1 is matched against
# `en_obj_reg` and field 3 against `zh_obj_reg`.
# NOTE(review): field 2 is ignored -- presumably a linking predicate; confirm
# against the raw file.
doubles = Tuple{String,String}[]
open("raw_data/ILLs(zh-en).txt", "r") do io
    for line in eachline(io)
        # Skip blank lines instead of stopping at the first one, so a stray
        # empty line cannot silently drop the remaining alignments.
        isempty(strip(line)) && continue
        fields = split(line, ' ')
        push!(doubles,
            (get_txt(en_obj_reg, fields[1]), get_txt(zh_obj_reg, fields[3])))
    end
end
push!(doubles, # supplementary alignments that are added by hand
    ("Huizhou" , "惠州市"),
    ("Leonid_Brezhnev", "列昂尼德·伊里奇·勃列日涅夫"),
    ("Li_Qingzhao", "李清照"),
    ("Sun_Lianzhong", "孫連仲"))
# De-duplicate after the manual additions so that a supplementary pair which is
# already present in the raw file is not written twice. `unique!` keeps the
# first occurrence, so the order of the pairs is otherwise unchanged.
unique!(doubles)

# Write the pairs as "<en>\t<zh>", one per line.
open("extract/ILLs(zh-en).txt", "w") do io
    for (en, zh) in doubles
        println(io, en, '\t', zh)
    end
end

### 训练集

In [12]:
# Extract the training-set information
"""
    triple_info(triple)

Return one tab-separated line describing `triple`: a language tag (`"en"` or
`"zh"`) followed by the texts that `get_txt` extracts from the three elements.

The language is decided by the first element only: if it matches `en_obj_reg`
the English object/relation/object patterns are used, otherwise the Chinese ones.
"""
function triple_info(triple)
    if isnothing(match(en_obj_reg, first(triple)))
        tag = "zh"
        regs = [zh_obj_reg, zh_rel_reg, zh_obj_reg]
    else
        tag = "en"
        regs = [en_obj_reg, en_rel_reg, en_obj_reg]
    end
    # Pattern i is applied to element i of the triple.
    return string(tag, '\t', join(get_txt.(regs, triple), '\t'))
end

"""拆分问题和答案"""
function QandA(txt)
    # `txt` is one raw record: the question, a tab, then '#'-separated link
    # fields that are consumed three at a time (one triple per group).
    # Returns a multi-line string: "<question>\t<ner>" on the first line,
    # followed by one `triple_info` line per triple.
    que, sols = split(txt, '\t')
    que = rectify_que(que)
    sols = split(sols, '#')

    # The second tab-separated field of the first triple's info line (the text
    # extracted from the triple's first element) is reported next to the question.
    ner = split(triple_info(sols[1:3]), '\t')[2]

    # Float division followed by `Int` is kept on purpose: it throws when the
    # number of fields is not a multiple of three instead of dropping fields.
    ntriples = Int(length(sols) / 3)
    rows = [triple_info(sols[(3k - 2):(3k)]) for k in 1:ntriples]

    return string(que, '\t', ner, '\n', join(rows, '\n'))
end
# print(QandA("which official language is used by the country that is affected by 1957 Mongolia earthquake?	<http://dbpedia.org/resource/1957_Mongolia_earthquake>#<http://dbpedia.org/property/countriesAffected>#<http://dbpedia.org/resource/Mongolia>#<http://zh.dbpedia.org/resource/蒙古国>#<http://zh.dbpedia.org/property/官方文字>#<http://zh.dbpedia.org/resource/蒙古字母>"))

QandA

In [16]:
# Write the processed training set.
# `read(path, String)` opens and closes the file itself, so no input handle is
# left dangling. Duplicate records are dropped, keeping the first occurrence.
txts = unique!(split(strip(read("raw_data/train_data.txt", String)), '\n'))
# The `do` block closes the output file even if `QandA` throws on a malformed record.
open("extract/train_data.txt", "w") do extract_io
    for txt in txts
        # The extra '\n' leaves one blank line between consecutive records.
        println(extract_io, QandA(txt), '\n')
    end
end

### 验证集

In [27]:
# Process the validation set.
# `read(path, String)` opens and closes the file itself, so no input handle is
# left dangling.
txts = split(strip(read("raw_data/valid_data.txt", String)), '\n')
# Keep only the last tab-separated field of every line and clean it with `rectify_que`.
valid_ques = rectify_que.([last(split(strip(txt), '\t')) for txt in txts])
# Hand-written replacements for three questions, addressed by line position.
# NOTE(review): presumably these lines are still garbled after `rectify_que`;
# the indices are only valid for the current valid_data.txt -- confirm whenever
# the raw file changes.
valid_ques[233] = "which national team does Seattle Sounders FC 2's coach belong to?"
valid_ques[284] = "what is the club that Seattle Sounders FC 2's head coach belongs to?"
valid_ques[418] = "which sport does 2014–15 SC Freiburg season's coach play?"

# One question per line; the `do` block guarantees the file is closed.
open("extract/valid_data.txt", "w") do extract_io
    for que in valid_ques
        println(extract_io, que)
    end
end