This repository has been archived by the owner on Nov 3, 2023. It is now read-only.
/
build.py
114 lines (93 loc) · 3.47 KB
/
build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.
import parlai.core.build_data as build_data
import os
import json
from parlai.core.build_data import DownloadableFile
from parlai.utils.io import PathManager
# Files fetched by build(). Order matters: build() converts RESOURCES[0] into
# the 'train' split and RESOURCES[1] into the 'valid' split.
# Each entry is given a source URL, a local file name, a hex digest string
# (presumably a SHA-256 checksum -- confirm against DownloadableFile), and
# zipped=False.
RESOURCES = [
    # Source for the 'train' split.
    DownloadableFile(
        'https://s3.amazonaws.com/my89public/quac/train_v0.2.json',
        'train_v0.2.json',
        'ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a',
        zipped=False,
    ),
    # Source for the 'valid' split.
    DownloadableFile(
        'https://s3.amazonaws.com/my89public/quac/val_v0.2.json',
        'val_v0.2.json',
        '09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378',
        zipped=False,
    ),
]
# Version string that build() passes to build_data.built() / mark_done().
VERSION = '0.2'

# Tags written into the 'followup' field of each converted line.
SHOULD = '__SHOULD__'
MAYBE = '__MAYBE__'
SHOULD_NOT = '__SHOULDNOT__'

# Tags written into the 'yesno' field of each converted line.
YES = '__YES__'
NO = '__NO__'
NEITHER = '__NEITHER__'

# Single-letter 'followup' codes found in the source JSON -> output tag.
MAP_CONTINUATION = {
    'm': MAYBE,
    'f': SHOULD,
    'n': SHOULD_NOT,
}

# Single-letter 'yesno' codes found in the source JSON -> output tag.
MAP_AFFIRMATION = {
    'y': YES,
    'n': NO,
    'x': NEITHER,
}

# Template for one dialogue turn: tab-separated "name:value" fields.
# Adjacent string literals are concatenated, so this is one format string.
OUTPUT_FORMAT = (
    'text:{question}\t'
    'followup:{continuation}\t'
    'yesno:{affirmation}\t'
    'answer_starts:{start}\t'
    'labels:{labels}'
)
def _parse_answers(q_a):
starts = []
labels = []
for each in q_a['answers']:
starts.append(str(each['answer_start']))
labels.append(each['text'].replace('|', ' __PIPE__ '))
return '|'.join(starts), '|'.join(labels)
def _handle_paragraph(each):
    """Render one paragraph and its questions as a single episode string.

    Every entry of ``each['qas']`` becomes one OUTPUT_FORMAT line. Only the
    first line carries the paragraph context in front of the question. Lines
    are separated by a newline, and the episode-done marker is appended
    directly after the last line.

    Args:
        each: mapping with a 'context' string and a 'qas' list; every qa
            provides 'question', 'followup', 'yesno' and 'answers'.

    Returns:
        The episode text, ending in a newline. With an empty 'qas' list only
        the episode-done marker is returned.
    """
    # Real newlines in the context are written as the two characters
    # backslash + 'n' so that one turn stays on one physical line.
    escaped_context = each['context'].replace('\n', '\\n')

    turns = []
    for position, q_a in enumerate(each['qas']):
        prompt = q_a['question']
        if position == 0:
            # The context is only attached to the opening turn.
            prompt = escaped_context + '\\n' + prompt
        answer_starts, answer_labels = _parse_answers(q_a)
        turns.append(
            OUTPUT_FORMAT.format(
                question=prompt,
                # .get() yields None for an unmapped code, which format()
                # renders as the text 'None'.
                continuation=MAP_CONTINUATION.get(q_a['followup']),
                affirmation=MAP_AFFIRMATION.get(q_a['yesno']),
                start=answer_starts,
                labels=answer_labels,
            )
        )

    return '\n'.join(turns) + '\t\tepisode_done:True\n'
def make_parlai_format(outpath, dtype, data):
    """Write every paragraph of ``data`` to ``<outpath>/<dtype>.txt``.

    Args:
        outpath: directory that receives the output file.
        dtype: split name; used for the progress message and as the output
            file's base name.
        data: iterable of mappings, each with a 'paragraphs' list whose items
            are accepted by ``_handle_paragraph``.
    """
    print('building parlai:' + dtype)
    target = os.path.join(outpath, dtype + '.txt')
    with PathManager.open(target, 'w') as fout:
        # Lazy generator: each episode is rendered right before it is written.
        episodes = (
            _handle_paragraph(paragraph)
            for article in data
            for paragraph in article['paragraphs']
        )
        for episode in episodes:
            fout.write(episode)
def build(opt):
    """Download the QuAC files and convert them, unless already built.

    The data directory is ``<opt['datapath']>/QuAC``. When
    ``build_data.built`` does not report that directory as built for VERSION,
    the directory is (re)created, every entry of RESOURCES is downloaded, the
    two JSON files are converted to 'train' and 'valid' text files, and the
    directory is marked as done for VERSION.

    Args:
        opt: mapping that provides the 'datapath' root directory.
    """
    dpath = os.path.join(opt['datapath'], 'QuAC')
    version = VERSION

    # Nothing to do when this version is already reported as built.
    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download the data.
    for resource in RESOURCES:
        resource.download_file(dpath)

    # RESOURCES[0] feeds the 'train' split and RESOURCES[1] the 'valid' split.
    for position, dtype in enumerate(('train', 'valid')):
        json_path = os.path.join(dpath, RESOURCES[position].file_name)
        with PathManager.open(json_path) as f:
            data = json.load(f)['data']
            make_parlai_format(dpath, dtype, data)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)