-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_cls_sep_to_sections.py
45 lines (30 loc) · 1.24 KB
/
add_cls_sep_to_sections.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import re
from tqdm import tqdm
from transformers import BertTokenizer

# Split each section .txt file into naive sentences and append BERT special
# tokens ("[CLS] [SEP]") after every sentence, writing the result to a
# parallel directory tree. The tokenizer is only used for its special-token
# strings, not for actual tokenization.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

txt_files_dir_path = 'LongSumm-data/extractive_summaries/combined_sections_txt/'
save_path = 'LongSumm-data/extractive_summaries/combined_sections_txt_with_cls_sep/'

p_dirs = os.listdir(txt_files_dir_path)
if '.DS_Store' in p_dirs:  # macOS Finder metadata, not a paper directory
    p_dirs.remove('.DS_Store')

for p in tqdm(p_dirs):
    # exist_ok=True: don't crash when re-running after a partial pass
    # (the original os.mkdir raised FileExistsError on re-runs).
    os.makedirs(save_path + p, exist_ok=True)
    section_txt_files = os.listdir(txt_files_dir_path + p)
    for s_file in section_txt_files:
        with open(txt_files_dir_path + p + '/' + s_file, 'r') as f:
            txt = ' '.join(f.readlines())
        # Drop the period of "et al." so the sentence split on '.' below does
        # not cut citations in half. BUG FIX: the original pattern 'et al.'
        # left '.' unescaped, so it matched "et al" followed by ANY character
        # (e.g. "et als", "et al,"); r'et al\.' matches only the literal dot.
        txt = re.sub(r'et al\.', 'et al', txt)
        txt = re.sub('\n', '', txt)
        # Naive sentence split on '.'. Filter on the *stripped* segment so
        # whitespace-only fragments don't produce empty ". [CLS] [SEP]"
        # sentences (the original filtered before stripping).
        valid_lines = [seg.strip() for seg in txt.split('.') if seg.strip() != '']
        valid_lines = [
            line + '.' + ' ' + tokenizer.cls_token + ' ' + tokenizer.sep_token
            for line in valid_lines
        ]
        with open(save_path + p + '/' + s_file, 'w') as f:
            f.write(' '.join(valid_lines))