ParseData.py
import re  # regex


def cornell_cleanup(sentence):
    # clean up html tags
    sentence = re.sub(r'<.*?>', '', sentence.lower())
    # clean up \n and \r
    return sentence.replace('\n', '').replace('\r', '')
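

# Illustrative example (not from the original file):
#   cornell_cleanup("<u>Hello</u> there!\n")  ->  "hello there!"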


def load_cornell(path_conversations, path_lines):
    # Map movie id -> {line id: (character id, raw sentence)}.
    movie_lines = {}
    with open(path_lines, 'r', encoding="iso-8859-1") as lines_file:
        for line in lines_file:
            parts = line.split(" +++$+++ ")
            line_number = parts[0]
            character = parts[1]
            movie = parts[2]
            sentence = parts[-1]
            if movie not in movie_lines:
                movie_lines[movie] = {}
            movie_lines[movie][line_number] = (character, sentence)

    questions = []
    responses = []
    with open(path_conversations, 'r', encoding="iso-8859-1") as conversations_file:
        for line in conversations_file:
            parts = line.split(" +++$+++ ")
            movie = parts[2]
            line_numbers = []
            for num in parts[3][1:-2].split(", "):
                line_numbers.append(num[1:-1])

            # Not used since the cornell data set already placed
            # the lines of the same character together
            #
            # lines = []
            # tmp = []
            # teacher = movie_lines[movie][line_numbers[0]][0]
            # # teacher is the one that speaks first
            # was_teacher = True
            # for num in line_numbers:
            #     line = movie_lines[movie][num]
            #     if line[0] == teacher:
            #         if not was_teacher:  # was the bot
            #             lines.append([True, tmp])  # append previous conversation and mark as "is bot"
            #             tmp = []
            #         tmp.append(cornell_cleanup(line[1]))
            #         was_teacher = True
            #     else:  # bot speaking
            #         if was_teacher:  # was teacher
            #             lines.append([False, tmp])  # append previous conversation and mark "is not bot"
            #             tmp = []
            #         tmp.append(cornell_cleanup(line[1]))
            #         was_teacher = False
            # if len(tmp) > 0:
            #     lines.append([not was_teacher, tmp])  # append the last response (not b/c of the inverse)
            # conversations.append(lines)

            # Each consecutive pair of lines in a conversation becomes a
            # (question, response) training pair.
            for i in range(len(line_numbers) - 1):
                questions.append(cornell_cleanup(movie_lines[movie][line_numbers[i]][1]))
                responses.append(cornell_cleanup(movie_lines[movie][line_numbers[i + 1]][1]))
    return questions, responses
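

# The parsing above assumes the Cornell corpus's " +++$+++ " field separator:
# movie_lines.txt rows such as
#   L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
# and movie_conversations.txt rows whose last field is a bracketed list of
# line ids such as ['L194', 'L195', 'L196', 'L197'] (examples shown here for
# illustration; check your copy of the data set for the exact layout).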


def load_twitter(path):
    # Lines alternate: question, response, question, response, ...
    lines_x = []
    lines_y = []
    with open(path, 'r', encoding='utf-8') as lines:
        is_x = True
        for line in lines:
            if is_x:
                lines_x.append(line.lower())
            else:
                lines_y.append(line.lower())
            is_x = not is_x
    return lines_x, lines_y
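

# load_twitter expects a plain text file whose lines alternate between a
# question and its response, e.g. (illustrative only):
#   how are you?
#   i'm fine, thanks.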


def split_sentence(sentence):
    # collect independent words
    result = re.findall(r"[\w]+|[.,!?;'\"]+", sentence)
    return result


def split_data(data):
    result = []
    for line in data:
        result.append(split_sentence(line))
    return result


def sentence_to_index(sentence, word_to_index, target=False):
    if not target:
        result = [word_to_index["<GO>"]]
        length = 1
    else:
        result = []
        length = 0
    unk = 0
    for word in sentence:
        length += 1
        if word in word_to_index:
            result.append(word_to_index[word])
        else:
            result.append(word_to_index["<UNK>"])
            unk += 1
    # max sequence length of 20
    if length < 20:
        result.append(word_to_index["<EOS>"])
        length += 1
        # EOS also used as padding
        result.extend([word_to_index["<EOS>"]] * (20 - length))
    else:
        # result = result[:19]
        # result.append(word_to_index["<EOS>"])
        # length = 19
        result = result[:20]
        length = 20
    return result, length, unk
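

# Illustration with a hypothetical vocabulary (indices are made up):
#   word_to_index = {"<GO>": 0, "<EOS>": 1, "<UNK>": 2, "hi": 3}
#   sentence_to_index(["hi", "bob"], word_to_index)
# returns ([0, 3, 2, 1, 1, ..., 1], 4, 1): <GO>, "hi", <UNK> for the
# out-of-vocabulary word, then <EOS> plus <EOS>-padding out to 20 tokens,
# with a reported length of 4 and one unknown word.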


def data_to_index(data_x, data_y, word_to_index):
    result_x = []
    result_y = []
    lengths_x = []
    lengths_y = []
    result_y_target = []
    index = 0
    while index < len(data_x):
        x, x_length, x_unk = sentence_to_index(data_x[index], word_to_index)
        y, y_length, y_unk = sentence_to_index(data_y[index], word_to_index)
        index += 1
        # skip pairs with too many out-of-vocabulary words
        if x_unk > 1 or y_unk > 0:
            continue
        result_x.append(x)
        result_y.append(y)
        lengths_x.append(x_length)
        lengths_y.append(y_length)
        # decoder target: the decoder input shifted left by one, with <EOS> appended
        y_target = y[1:]
        y_target.append(word_to_index["<EOS>"])
        result_y_target.append(y_target)
    return result_x, result_y, lengths_x, lengths_y, result_y_target
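

# A minimal usage sketch (not part of the original module). It assumes the
# standard corpus file names "movie_conversations.txt" and "movie_lines.txt"
# sit next to this script, and it builds a toy vocabulary on the fly so that
# sentence_to_index finds the <GO>/<EOS>/<UNK> entries it expects. Adjust the
# paths and vocabulary construction to your own setup.
if __name__ == "__main__":
    questions, responses = load_cornell("movie_conversations.txt", "movie_lines.txt")
    data_x = split_data(questions)
    data_y = split_data(responses)

    # Word -> index mapping; the special tokens must be present because
    # sentence_to_index looks them up directly.
    word_to_index = {"<GO>": 0, "<EOS>": 1, "<UNK>": 2}
    for sentence in data_x + data_y:
        for word in sentence:
            if word not in word_to_index:
                word_to_index[word] = len(word_to_index)

    xs, ys, len_x, len_y, y_targets = data_to_index(data_x, data_y, word_to_index)
    print("encoded pairs:", len(xs))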