# coding: utf-8
# process_data.py
import codecs
import os

import numpy
import torch


def pad_tgt_seq(seq, max_length):
    """Pad a 1-D target (label) sequence with zeros up to max_length."""
    pad = numpy.zeros((max_length - seq.size,), dtype='int32')
    return numpy.hstack((seq, pad))


def pad_src_seq(seq, max_length):
    """Pad a 2-D source feature matrix with zero rows up to max_length rows."""
    pad = numpy.zeros((max_length - seq.shape[0], seq.shape[1]), dtype='int32')
    return numpy.vstack((seq, pad))
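

# Behavior sketch for the padding helpers (illustrative values, not taken
# from the original file):
#
#   >>> pad_tgt_seq(numpy.array([3, 1, 4], dtype='int32'), 5)
#   array([3, 1, 4, 0, 0], dtype=int32)
#   >>> pad_src_seq(numpy.ones((2, 3), dtype='int32'), 4).shape
#   (4, 3)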


class AnnotationBatchGetter(object):
    """Featurizes annotated sentences and serves them as padded batches."""

    def __init__(self, config, batch_size):
        self.cursor = 0
        self.all_samples = []
        self.sample_num = len(self.all_samples)
        self.config = config
        self.batch_size = batch_size
        self._vocabs = config['Vocabs']
        self._outtag_voc = config['BioOutTags']
        self._word_pos = config['WordPos']
        self._fea_pos = config['fea_pos']
        self._vocab_char = config['CharVoc']
        self._vocab_word = config['WordId']
        self._max_char_len = config['max_char']
        self._use_char_conv = config['use_char_conv']
        self._use_gaz = config['use_gaz']
        if self._use_gaz:
            gazdir = config['GazetteerDir']
            gaz_names = config['Gazetteers']
            self._gazetteers = []
            for gaz in gaz_names:
                gazfile = os.path.join(gazdir, gaz)
                self._gazetteers.append(self._load_gaz_list(gazfile))

    def _load_gaz_list(self, path):
        """Read a gazetteer file (one UTF-8 entry per line) into a set."""
        words = set()
        with codecs.open(path, mode='r', encoding='utf-8') as f:
            for line in f:
                words.add(line.strip())
        return words

    def get_feature(self, tokens):
        """Build an int32 feature matrix of shape (len(tokens), fea_len).

        Column layout: one id per vocabulary, then one binary flag per
        gazetteer, then (for Chinese) a combined-word id, then char ids
        followed by a char mask when char convolutions are enabled."""
        fea_len = len(self._vocabs)
        if self._use_gaz:
            fea_len += len(self._gazetteers)
        if self._use_char_conv:
            fea_len += self._max_char_len * 2
        if self.config['lang'] == 'cmn':
            fea_len += 1
        feaMat = numpy.zeros((len(tokens), fea_len), dtype='int32')
        for (lid, token) in enumerate(tokens):
            parts = [token['word_lower'], token['word'],
                     token['caps'], token['pos'],
                     token['ner']]
            if 'comb-word' in token:  # has_key() was removed in Python 3
                parts.append(token['comb-word'])
            for (i, voc) in enumerate(self._vocabs):
                fpos = self._fea_pos[i]
                wid = voc.getID(parts[fpos])
                feaMat[lid, i] = wid
            curr_end = len(self._vocabs)
            if self._use_gaz:
                gazStart = len(self._vocabs)
                for (gid, gaz) in enumerate(self._gazetteers):
                    # Chinese matches on the combined word, other languages
                    # on the lowercased surface form.
                    if self.config['lang'] == 'cmn':
                        if parts[5] in gaz:
                            feaMat[lid, gid + gazStart] = 1
                    elif parts[0] in gaz:
                        feaMat[lid, gid + gazStart] = 1
                curr_end += len(self._gazetteers)
            if self.config['lang'] == 'cmn':
                feaMat[lid, curr_end] = self._vocab_word.getID(parts[5])
                curr_end += 1
            if self._use_char_conv:
                word = parts[self._word_pos]
                chStart = curr_end
                chMaskStart = chStart + self._max_char_len
                for i in range(len(word)):
                    if i >= self._max_char_len:
                        break
                    feaMat[lid, chStart + i] = self._vocab_char.getID(word[i])
                    feaMat[lid, chMaskStart + i] = 1
        return feaMat

    def use_annotation(self, text_spans):
        """Featurize every sentence in text_spans, reset the cursor, and
        return all samples as one padded LongTensor together with the true
        (unpadded) sequence lengths."""
        all_samples = []
        for sentence in text_spans:
            feaMat = self.get_feature(sentence['tokens'])
            all_samples.append(feaMat)
        self.cursor = 0
        self.all_samples = all_samples
        self.sample_num = len(self.all_samples)
        input_seqs = self.all_samples
        input_seqs_length = [s.shape[0] for s in input_seqs]
        seqs_padded = [pad_src_seq(s, max(input_seqs_length))[numpy.newaxis, ...]
                       for s in input_seqs]
        seq_tensor = torch.from_numpy(
            numpy.concatenate(seqs_padded, axis=0)).type(torch.LongTensor)
        return seq_tensor, 0, [0], input_seqs_length

    def next(self):
        """Return the next padded batch; raises StopIteration once the
        samples set by use_annotation() are exhausted."""
        if self.cursor < self.sample_num:
            required_batch = self.all_samples[self.cursor:self.cursor + self.batch_size]
            self.cursor += self.batch_size
            # To sort sentences by length in descending order:
            # required_batch.sort(key=lambda x: x[0].shape[0], reverse=True)
            input_seqs = required_batch
            input_seqs_length = [s.shape[0] for s in input_seqs]
            seqs_padded = [pad_src_seq(s, max(input_seqs_length))[numpy.newaxis, ...]
                           for s in input_seqs]
            # (batch, max_seq, len(embnames) + len(gazs) + max_char + max_char)
            seq_tensor = torch.from_numpy(
                numpy.concatenate(seqs_padded, axis=0)).type(torch.LongTensor)
            return seq_tensor, 0, [0], input_seqs_length
        else:
            raise StopIteration("no more batches")
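

if __name__ == '__main__':
    # Minimal, self-contained sketch of the padding/batching path used by
    # next() above. AnnotationBatchGetter itself is not exercised here since
    # it needs real vocab objects (with getID()) and a full config dict; all
    # values below are illustrative.
    tgt = numpy.array([3, 1, 4], dtype='int32')
    print(pad_tgt_seq(tgt, 5))          # [3 1 4 0 0]

    src = numpy.ones((2, 3), dtype='int32')
    padded = pad_src_seq(src, 4)
    print(padded.shape)                 # (4, 3)

    # Stack padded matrices and convert to a LongTensor of shape
    # (batch, max_seq, fea_len), mirroring seq_tensor in next().
    batch = numpy.concatenate([padded[numpy.newaxis, ...]] * 2, axis=0)
    seq_tensor = torch.from_numpy(batch).type(torch.LongTensor)
    print(seq_tensor.shape)             # torch.Size([2, 4, 3])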