# 预处理

### 概要/说明：

- 从 `raw_data` 提取数据，放在 `extract`
- 输入 `include("../src/extractdata.jl")` 导入提取内容

主要内容：
- 提取训练数据: `问题-实体\n 三元组1，2，3`
- 提取对齐信息：去除重复条目以及中英文名称完全相同的条目，并补充少量对齐（从训练集得来）
- 纠正错位问题：规则参见 `../src/tools/regexpr.jl`

### 导入正则表达式

In [6]:
# Set up the workspace and load the regex helpers (per the original note, they
# are used to extract keywords from links).
# Order matters: `mkpath` and the relative file paths used by the later cells
# depend on the `cd` having run first.
cd("../data/")
mkpath("extract")  # create the output directory if it does not exist yet
include("../src/tools/regexpr.jl")

rectify_que (generic function with 1 method)

In [7]:
# Extract the English triples: convert every line of `raw_data/triple_en.txt`
# with `line2triple` and write the resulting fields, tab-separated, one triple
# per line, to `extract/triple_en.txt`.
#
# Both files are opened with `do` blocks so they are closed even if
# `line2triple` throws part-way through the file.
open("extract/triple_en.txt", "w") do extract_io
    open("raw_data/triple_en.txt", "r") do io
        for line in eachline(io)
            # Skip blank lines instead of treating the first one as end-of-file,
            # so a stray empty line cannot silently truncate the output.
            isempty(line) && continue
            println(extract_io, join(line2triple(line), '\t'))
        end
    end
end

In [4]:
# Extract the Chinese triples: convert every line of `raw_data/triple_zh.txt`
# with `line2triple(line; en=false)` and write the resulting fields,
# tab-separated, one triple per line, to `extract/triple_zh.txt`.
#
# Both files are opened with `do` blocks so they are closed even if
# `line2triple` throws part-way through the file.
open("extract/triple_zh.txt", "w") do extract_io
    open("raw_data/triple_zh.txt", "r") do io
        for line in eachline(io)
            # Skip blank lines instead of treating the first one as end-of-file,
            # so a stray empty line cannot silently truncate the output.
            isempty(line) && continue
            println(extract_io, join(line2triple(line; en=false), '\t'))
        end
    end
end

### 实体对齐

In [5]:
# Extract the entity alignment pairs (English name, Chinese name).
#
# Each input line is split on single spaces into three fields; the first and
# third are passed through `get_txt` with `en_obj_reg` / `zh_obj_reg`, the
# middle field is ignored.
doubles = Tuple{String,String}[]
open("raw_data/ILLs(zh-en).txt", "r") do io
    for line in eachline(io)
        # Skip blank lines rather than stopping at the first one, so an empty
        # line in the middle of the file does not drop the remaining pairs.
        isempty(line) && continue
        en, _, zh = split(line, ' ')
        push!(doubles, (get_txt(en_obj_reg, en), get_txt(zh_obj_reg, zh)))
    end
end

# Supplementary knowledge-graph alignments added by hand.
push!(doubles,
    ("Huizhou" , "惠州市"),
    ("Leonid_Brezhnev", "列昂尼德·伊里奇·勃列日涅夫"),
    ("Li_Qingzhao", "李清照"),
    ("Sun_Lianzhong", "孫連仲"))

# De-duplicate only after the manual pairs are appended, so a manual pair that
# already exists in the raw data is not written twice. `unique!` keeps the
# first occurrence, so the order of the remaining pairs is unchanged.
unique!(doubles)

# Write the pairs to file, dropping those whose two names are identical.
open("extract/ILLs(zh-en).txt", "w") do io
    for (en, zh) in doubles
        en == zh && continue
        println(io, en, '\t', zh)
    end
end

### 训练集

In [1]:
# 提取训练集信息

# print(QandA("which official language is used by the country that is affected by 1957 Mongolia earthquake?	<http://dbpedia.org/resource/1957_Mongolia_earthquake>#<http://dbpedia.org/property/countriesAffected>#<http://dbpedia.org/resource/Mongolia>#<http://zh.dbpedia.org/resource/蒙古国>#<http://zh.dbpedia.org/property/官方文字>#<http://zh.dbpedia.org/resource/蒙古字母>"))

QandA

In [48]:
# Export the training set.
#
# Reads `raw_data/train_data.txt`, de-duplicates its lines, and writes two files:
#   * `extract/train_data.txt` - question and `ner` on one line, followed by
#     the triples (one per line, tab-separated) and a blank separator line;
#   * `extract/train_ques.txt` - question, `ner` and the third field of every
#     triple on a single tab-separated line.
#
# `read(path, String)` is used so that no input handle is left open.
txts = unique!(split(strip(read("raw_data/train_data.txt", String)), '\n'))

# Indices (into the de-duplicated `txts`) of the 26 garbled questions to skip.
# Named `garbled` instead of `error` so that `Base.error` is not shadowed; a
# `Set` keeps the per-line membership test cheap.
garbled = Set([2507,2961,9562,9669,10640,10699,10981,11374,11539,11730,11914,11983,11990,12042,12229,12281,12373,12629,12676,12767, 853, 1832, 2020, 2376, 2959, 2051])

# `do` blocks close BOTH outputs, including when a line fails to parse.
# Previously `train_ques_io` was never closed, so its buffered content could be
# lost.
open("extract/train_data.txt", "w") do extract_io
    open("extract/train_ques.txt", "w") do train_ques_io
        for (i, txt) in enumerate(txts)
            i ∈ garbled && continue
            # Each line is `question<TAB>solution`, the solution being a
            # '#'-separated list that is grouped three items at a time.
            rawque, rawsols = split(strip(txt), '\t')
            que = rectify_que(rawque)
            sols = split(rawsols, '#')
            triples = triple_info.(partition(sols, 3))
            # Second field of the first triple; presumably the question's
            # named entity - confirm against `triple_info`.
            ner = triples[1][2]
            # question + ner + triples
            println(extract_io, que, '\t', ner)
            println(extract_io, join(join.(triples, '\t'), '\n'), '\n')
            # question + ner + relations (third field of each triple)
            println(train_ques_io, que, '\t', ner, '\t', join(getindex.(triples, 3), '\t'))
        end
    end
end

### 验证集

In [8]:
# Process the validation set: take the last tab-separated field of every line
# of `raw_data/valid_data.txt`, run it through `rectify_que`, apply a few manual
# corrections, and write one question per line to `extract/valid_data.txt`.
#
# `read(path, String)` avoids leaving the input handle open.
txts = split(strip(read("raw_data/valid_data.txt", String)), '\n')
valid_ques = rectify_que.([last(split(strip(txt), '\t')) for txt in txts])

# Hand-written replacements for individual questions, addressed by their
# position in the file.
valid_ques[233] = "which national team does Seattle Sounders FC 2's coach belong to?"
valid_ques[284] = "what is the club that Seattle Sounders FC 2's head coach belongs to?"
valid_ques[418] = "which sport does 2014–15 SC Freiburg season's coach play?"
valid_ques[280] = "who is the predecessor of the one that is after election of Italian general election, 1874?"

# The `do` block closes the output file even if a write fails.
open("extract/valid_data.txt", "w") do extract_io
    for que in valid_ques
        println(extract_io, que)
    end
end

### 最终验证集

In [9]:
# Load the final validation set: last tab-separated field of every line of
# `raw_data/valid_data.txt`, passed through `rectify_que`.
# `read(path, String)` avoids leaving the input handle open.
# NOTE(review): this reads the same path as the validation-set cell above but
# applies none of its manual corrections, and the following cell rewrites
# `extract/valid_data.txt` from this result - confirm that the raw file was
# replaced by the final set and that dropping the corrections is intended.
txts = split(strip(read("raw_data/valid_data.txt", String)), '\n')
valid_ques = rectify_que.([last(split(strip(txt), '\t')) for txt in txts])

1500-element Vector{String}:
 "who is influenced by the origin of Cézanne_(crater)'s name?"
 "Savez-vous quel est le nom de l" ⋯ 135 bytes ⋯ "-ball universitaire américain?"
 "Hero_(Super_Junior_album)的头衔的作词者的职业是什么"
 "Savez-vous où est le berceau de" ⋯ 27 bytes ⋯ "e club de UEFA_Euro_2012_squads"
 "Qui est le prédécesseurs de Élection après Taiwan_legislative_election,_1995"
 "who is the predecessor of the o" ⋯ 53 bytes ⋯ "ction_in_Wyoming,_2000, a wahl?"
 "what is the job of the author o" ⋯ 29 bytes ⋯ "io drama written by paul magrs?"
 "和French_cruiser_Georges_Leygues名称一样的人或事物的前任是什么"
 "Savez-vous où se trouve l'adres" ⋯ 59 bytes ⋯ "14–15_Panathinaikos_F.C._season"
 "Savez-vous quelles sont les pos" ⋯ 63 bytes ⋯ "Party_leadership_election,_1997"
 "what is the office of the one t" ⋯ 66 bytes ⋯ "onstituency)_by-election,_2001?"
 "À quel parti appartient Élection après Estonian_parliamentary_election,_2003"
 "who is the predecessor of the p" ⋯ 45 bytes ⋯ "s French_ship_Alexandre_(1857)?"


In [10]:
# Write the validation questions, one per line, to `extract/valid_data.txt`.
# The `do` block closes the file even if a write fails.
open("extract/valid_data.txt", "w") do extract_io
    for que in valid_ques
        println(extract_io, que)
    end
end