
Commit

optimize the way of appending. (#402)
Co-authored-by: wheatxzhang <wheatxzhang@tencent.com>
Winter523 and wheatxzhang committed Apr 25, 2024
1 parent 4d31484 commit 072b50b
Showing 11 changed files with 60 additions and 72 deletions.
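All of the changes follow the same pattern: padding token by token in a while loop is replaced by computing the shortfall once and extending the list in a single step. The sketch below contrasts the two forms; the seq_length and pad id values are illustrative stand-ins, not values taken from this repository.

import timeit

SEQ_LENGTH = 512   # illustrative; the scripts read this from args.seq_length
PAD_ID = 0         # illustrative; the scripts look this up via the tokenizer

def pad_while(src, seg):
    # old style: grow the lists one element at a time
    while len(src) < SEQ_LENGTH:
        src.append(PAD_ID)
        seg.append(0)
    return src, seg

def pad_extend(src, seg):
    # new style: compute the shortfall once and extend in a single step
    if len(src) < SEQ_LENGTH:
        src += [PAD_ID] * (SEQ_LENGTH - len(src))
        seg += [0] * (SEQ_LENGTH - len(seg))
    return src, seg

# rough timing on a short sequence that needs heavy padding
print(timeit.timeit(lambda: pad_while(list(range(10)), [1] * 10), number=20000))
print(timeit.timeit(lambda: pad_extend(list(range(10)), [1] * 10), number=20000))

Both forms produce identical lists; the extension form avoids one Python-level call per padding token, which is where the speedup comes from.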
8 changes: 4 additions & 4 deletions finetune/run_c3.py
@@ -102,10 +102,10 @@ def read_dataset(args, path):
 if len(src) > args.seq_length:
     src = src[: args.seq_length]
     seg = seg[: args.seq_length]
-PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-while len(src) < args.seq_length:
-    src.append(PAD_ID)
-    seg.append(0)
+if len(src) < args.seq_length:
+    PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+    src += [PAD_ID] * (args.seq_length - len(src))
+    seg += [0] * (args.seq_length - len(seg))
 
 dataset[-1][0].append(src)
 dataset[-1][2].append(seg)
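One behavioral note on the new form (a general Python point, not something the diff itself states): the augmented assignment src += [...] extends the existing list object in place, exactly like repeated append calls, so any other name bound to the same list still sees the padding; writing src = src + [...] would instead build a new list and rebind the name. A small sketch:

src = [101, 2023, 102]
alias = src                 # second reference to the same list object

src += [0] * 3              # in-place extension, like list.extend
print(alias)                # [101, 2023, 102, 0, 0, 0]: alias sees the padding

src = src + [0] * 2         # rebinding: builds a new list and reassigns the name
print(alias)                # unchanged: [101, 2023, 102, 0, 0, 0]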
6 changes: 3 additions & 3 deletions finetune/run_chid.py
@@ -109,9 +109,9 @@ def read_dataset(args, data_path, answer_path):
 src = args.tokenizer.convert_tokens_to_ids(tokens)[: args.seq_length]
 seg = [0] * len(src)
 
-while len(src) < args.seq_length:
-    src.append(0)
-    seg.append(0)
+if len(src) < args.seq_length:
+    src += [0] * (args.seq_length - len(src))
+    seg += [0] * (args.seq_length - len(seg))
 
 dataset[-1][0].append(src)
 dataset[-1][2].append(seg)
8 changes: 4 additions & 4 deletions finetune/run_classifier.py
@@ -160,10 +160,10 @@ def read_dataset(args, path):
 if len(src) > args.seq_length:
     src = src[: args.seq_length]
     seg = seg[: args.seq_length]
-PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-while len(src) < args.seq_length:
-    src.append(PAD_ID)
-    seg.append(0)
+if len(src) < args.seq_length:
+    PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+    src += [PAD_ID] * (args.seq_length - len(src))
+    seg += [0] * (args.seq_length - len(seg))
 if args.soft_targets and "logits" in columns.keys():
     dataset.append((src, tgt, seg, soft_tgt))
 else:
8 changes: 4 additions & 4 deletions finetune/run_classifier_multi_label.py
@@ -105,10 +105,10 @@ def read_dataset(args, path):
 if len(src) > args.seq_length:
     src = src[: args.seq_length]
     seg = seg[: args.seq_length]
-PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-while len(src) < args.seq_length:
-    src.append(PAD_ID)
-    seg.append(0)
+if len(src) < args.seq_length:
+    PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+    src += [PAD_ID] * (args.seq_length - len(src))
+    seg += [0] * (args.seq_length - len(seg))
 
 dataset.append((src, tgt, seg))
 
8 changes: 4 additions & 4 deletions finetune/run_classifier_prompt.py
@@ -104,10 +104,10 @@ def read_dataset(args, path):
 src = src[: args.seq_length]
 seg = seg[: args.seq_length]
 
-PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-while len(src) < args.seq_length:
-    src.append(PAD_ID)
-    seg.append(0)
+if len(src) < args.seq_length:
+    PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+    src += [PAD_ID] * (args.seq_length - len(src))
+    seg += [0] * (args.seq_length - len(seg))
 tgt = [0] * len(src)
 # Ignore the sentence which the answer is not in a sequence
 if mask_position >= args.seq_length:
8 changes: 4 additions & 4 deletions finetune/run_cmrc.py
@@ -116,10 +116,10 @@ def convert_examples_to_dataset(args, examples):
 src_b = args.tokenizer.convert_tokens_to_ids(args.tokenizer.tokenize(span_context) + [SEP_TOKEN])
 src = src_a + src_b
 seg = [1] * len(src_a) + [2] * len(src_b)
-PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-while len(src) < args.seq_length:
-    src.append(PAD_ID)
-    seg.append(0)
+if len(src) < args.seq_length:
+    PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+    src += [PAD_ID] * (args.seq_length - len(src))
+    seg += [0] * (args.seq_length - len(seg))
 
 dataset.append((src, seg, start_position, end_position, answers, question_id, len(question), doc_span_index, start_offset))
 return dataset
8 changes: 4 additions & 4 deletions finetune/run_dbqa.py
@@ -41,10 +41,10 @@ def read_dataset(args, path):
 if len(src) > args.seq_length:
     src = src[: args.seq_length]
     seg = seg[: args.seq_length]
-PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-while len(src) < args.seq_length:
-    src.append(PAD_ID)
-    seg.append(0)
+if len(src) < args.seq_length:
+    PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+    src += [PAD_ID] * (args.seq_length - len(src))
+    seg += [0] * (args.seq_length - len(seg))
 dataset.append((src, tgt, seg, qid))
 
 return dataset
10 changes: 5 additions & 5 deletions finetune/run_ner.py
@@ -110,11 +110,11 @@ def read_dataset(args, path):
 src = src[: args.seq_length]
 tgt = tgt[: args.seq_length]
 seg = seg[: args.seq_length]
-PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-while len(src) < args.seq_length:
-    src.append(PAD_ID)
-    tgt.append(args.labels_num - 1)
-    seg.append(0)
+if len(src) < args.seq_length:
+    PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+    src += [PAD_ID] * (args.seq_length - len(src))
+    tgt += [args.labels_num - 1] * (args.seq_length - len(tgt))
+    seg += [0] * (args.seq_length - len(seg))
 dataset.append([src, tgt, seg])
 
 return dataset
8 changes: 4 additions & 4 deletions finetune/run_regression.py
@@ -73,10 +73,10 @@ def read_dataset(args, path):
 if len(src) > args.seq_length:
     src = src[: args.seq_length]
     seg = seg[: args.seq_length]
-PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
-while len(src) < args.seq_length:
-    src.append(PAD_ID)
-    seg.append(0)
+if len(src) < args.seq_length:
+    PAD_ID = args.tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
+    src += [PAD_ID] * (args.seq_length - len(src))
+    seg += [0] * (args.seq_length - len(seg))
 dataset.append((src, tgt, seg))
 
 return dataset
14 changes: 7 additions & 7 deletions finetune/run_text2text.py
@@ -95,13 +95,13 @@ def read_dataset(args, path):
 tgt_seg = tgt_seg[: args.tgt_seq_length]
 tgt_out = tgt_in[1:] + [PAD_ID]
 
-while len(src) < args.seq_length:
-    src.append(PAD_ID)
-    seg.append(0)
-while len(tgt_in) < args.tgt_seq_length:
-    tgt_in.append(PAD_ID)
-    tgt_out.append(PAD_ID)
-    tgt_seg.append(0)
+if len(src) < args.seq_length:
+    src += [PAD_ID] * (args.seq_length - len(src))
+    seg += [0] * (args.seq_length - len(seg))
+if len(tgt_in) < args.tgt_seq_length:
+    tgt_in += [PAD_ID] * (args.tgt_seq_length - len(tgt_in))
+    tgt_out += [PAD_ID] * (args.tgt_seq_length - len(tgt_out))
+    tgt_seg += [0] * (args.tgt_seq_length - len(tgt_seg))
 
 dataset.append((src, tgt_in, tgt_out, seg, tgt_seg))
 
46 changes: 17 additions & 29 deletions uer/utils/dataloader.py
@@ -73,8 +73,7 @@ def __iter__(self):
 
 for ins in instances:
     src_single, pad_num = ins[0]
-    for _ in range(pad_num):
-        src_single.append(self.vocab.get(PAD_TOKEN))
+    src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
     if len(ins) == 4:
         src.append(src_single)
@@ -123,8 +122,7 @@ def __iter__(self):
 
 for ins in instances:
     src_single, pad_num = ins[0]
-    for _ in range(pad_num):
-        src_single.append(self.vocab.get(PAD_TOKEN))
+    src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
     if len(ins) == 3:
         src.append(src_single)
@@ -175,8 +173,7 @@ def __iter__(self):
 
 for ins in instances:
     src_single, pad_num = ins[0]
-    for _ in range(pad_num):
-        src_single.append(self.vocab.get(PAD_TOKEN))
+    src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
     src.append(src_single[:-1])
     tgt.append(src_single[1:])
     seg.append([1] * ins[1][0] + [0] * (len(src_single) - 1 - ins[1][0]))
@@ -206,10 +203,9 @@ def __iter__(self):
 for ins in instances:
     src_single, pad_num = ins[0]
     tgt_forward_single, tgt_backward_single = ins[1], ins[2]
-    for _ in range(pad_num):
-        src_single.append(self.vocab.get(PAD_TOKEN))
-        tgt_forward_single.append(self.vocab.get(PAD_TOKEN))
-        tgt_backward_single.append(self.vocab.get(PAD_TOKEN))
+    src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+    tgt_forward_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+    tgt_backward_single += [self.vocab.get(PAD_TOKEN)] * pad_num
     src.append(src_single)
     tgt_forward.append(tgt_forward_single)
     tgt_backward.append(tgt_backward_single)
@@ -241,11 +237,9 @@ def __iter__(self):
 
 for ins in instances:
     src_single, pad_num = ins[0]
-    for _ in range(pad_num):
-        src_single.append(self.vocab.get(PAD_TOKEN))
+    src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
     tgt_single, pad_num = ins[1]
-    for _ in range(pad_num):
-        tgt_single.append(self.vocab.get(PAD_TOKEN))
+    tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
     src.append(src_single)
     tgt_in.append(tgt_single[:-1])
@@ -283,8 +277,7 @@ def __iter__(self):
 
 for _, ins in enumerate(instances):
     src_single, pad_num = ins[0]
-    for _ in range(pad_num):
-        src_single.append(self.vocab.get(PAD_TOKEN))
+    src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
     if len(ins) == 3:
         tgt_single = ins[1]
@@ -370,11 +363,9 @@ def __iter__(self):
 
 for _, ins in enumerate(instances):
     src_single, pad_num = ins[0]
-    for _ in range(pad_num):
-        src_single.append(self.vocab.get(PAD_TOKEN))
+    src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
     tgt_single, pad_num = ins[1]
-    for _ in range(pad_num):
-        tgt_single.append(self.vocab.get(PAD_TOKEN))
+    tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
 
     src_single, _ = mask_seq(src_single, self.tokenizer, self.whole_word_masking, self.span_masking,
                              self.span_geo_prob, self.span_max_length)
@@ -436,9 +427,8 @@ def __iter__(self):
 elif len(seg_pos_single) == 2:
     seg_single = [1] * seg_pos_single[0] + [2] * seg_pos_single[1]
 
-for _ in range(pad_num):
-    src_single.append(self.vocab.get(PAD_TOKEN))
-    seg_single.append(0)
+src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+seg_single += [0] * pad_num
 
 src.append(src_single)
 tgt.append(ins[1])
@@ -468,9 +458,8 @@ def __iter__(self):
 for ins in instances:
     src_single, pad_num = ins[0]
     tgt_single = ins[1]
-    for _ in range(pad_num):
-        src_single.append(self.vocab.get(PAD_TOKEN))
-        tgt_single.append(self.vocab.get(PAD_TOKEN))
+    src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+    tgt_single += [self.vocab.get(PAD_TOKEN)] * pad_num
     src.append(src_single)
     tgt.append(tgt_single)
     seg.append([1] * ins[2][0] + [2] * (ins[2][1] - ins[2][0]) + [0] * (len(src_single) - ins[2][1]))
@@ -509,9 +498,8 @@ def __iter__(self):
 elif len(seg_pos_single) == 2:
     seg_single = [1] * seg_pos_single[0] + [2] * seg_pos_single[1]
 
-for _ in range(pad_num):
-    src_single.append(self.vocab.get(PAD_TOKEN))
-    seg_single.append(0)
+src_single += [self.vocab.get(PAD_TOKEN)] * pad_num
+seg_single += [0] * pad_num
 seg.append(seg_single)
 
 if len(ins) == 4 :
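In uer/utils/dataloader.py the shortfall is already carried around as pad_num next to each instance, so the new form extends each list once per instance instead of looping pad_num times. A minimal standalone sketch of that pattern follows; the vocab dict and the sample ids are made up for illustration and are not the module's actual objects.

PAD_TOKEN = "[PAD]"
vocab = {PAD_TOKEN: 0}           # stand-in for self.vocab in the real dataloaders

instances = [
    ([101, 7592, 102], 3),       # (src_single, pad_num) pairs, as in ins[0]
    ([101, 2088, 2003, 102], 2),
]

src = []
for src_single, pad_num in instances:
    # one extension per instance instead of pad_num separate appends
    src_single += [vocab.get(PAD_TOKEN)] * pad_num
    src.append(src_single)

print(src)  # [[101, 7592, 102, 0, 0, 0], [101, 2088, 2003, 102, 0, 0]]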
