# 将每个数据集文件拆分成多个小文件，每个小文件包含一个故事的内容和标题，以 ID 命名

In [53]:
import time
import os

# Directory holding the raw dataset files (one JSON object per line).
stories_dir = "/media/tao/文件和数据/dataset/bytecup2018"
# Directory receiving one plain-text file per story, named by the story id.
split_stories_dir = "/media/tao/文件和数据/dataset/bytecup2018/split_stories"
# Directory receiving the tokenized version of every split story file.
tokenized_stories_dir = "/media/tao/文件和数据/dataset/bytecup2018/tokenized_stories"
# Directory receiving the serialized .bin output.
finished_files_dir = "/media/tao/6F0855440D2070BB/code/python/Headline_Generation/Get to the point/cnn-dailymail-master/bytecup/finished_files"
# Directory receiving the fixed-size chunks cut from the .bin output.
chunks_dir = os.path.join(finished_files_dir, "chunked")
# Dataset files (relative to stories_dir) to process.
file_names = ["for_test.txt"]
# file_names = ["bytecup.corpus.train.{}.txt".format(i) for i in range(8,9)]

# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the existence check and the makedirs call.
os.makedirs(split_stories_dir, exist_ok=True)
os.makedirs(tokenized_stories_dir, exist_ok=True)
os.makedirs(finished_files_dir, exist_ok=True)
os.makedirs(chunks_dir, exist_ok=True)


In [15]:
import json
import os

# Split every dataset file into one text file per story. Each output file is
# named after the story id and holds the content, a "@title." marker line,
# and then the title.
for name in file_names:
    file_dir = os.path.join(stories_dir, name)

    print('Opening file: %s...' % name)
    with open(file_dir, 'r') as load_f:
        # Stream the file line by line instead of loading it all with
        # readlines(); dataset files can be large.
        for line in load_f:
            if not line.strip():
                continue  # tolerate blank lines instead of crashing in json.loads

            txt_dic = json.loads(line)

            split_file_dir = os.path.join(split_stories_dir, str(txt_dic['id']))
            with open(split_file_dir, "w") as f:
                f.write(txt_dic['content'])
                # Terminate the content and add the marker that get_art_abs
                # later uses to separate article text from the title.
                f.write(" . \n\n@title.\n")
                f.write(txt_dic['title'])

    print("completed\n")

Opening file: for_test.txt...
completed



# 使用 Stanford CoreNLP 对每个文件按句、按词进行切分（tokenize），并将结果存放在另一个文件夹中

DIVIDE_SENTENCE 为 True 表示分句子，否则不分句子

In [36]:
# Tokenization mode switch: when False, the PTBTokenizer branch runs (word
# tokenization only); when True, the DocumentPreprocessor branch runs instead
# (sentence splitting plus word tokenization).
DIVIDE_SENTENCE = False

### 只分词，不分句（先生成映射文件 mapping，再使用 PTBTokenizer 对文件进行直接操作）

In [31]:
import subprocess

if not DIVIDE_SENTENCE:

    time_start = time.time()

    stories = os.listdir(split_stories_dir)
    # Write mapping.txt: one "input path <tab> output path" line per story,
    # consumed by PTBTokenizer through -ioFileList.
    print("Making list of files to tokenize...")
    with open("mapping.txt", "w") as f:
        for s in stories:
            f.write("%s \t %s\n" % (os.path.join(split_stories_dir, s), os.path.join(tokenized_stories_dir, s)))
    print("completed\n")

    # NOTE: edu.stanford.nlp.process.DocumentPreprocessor was tried with the
    # same -ioFileList/-preserveLines arguments (via both subprocess and
    # os.system) and does not work, hence PTBTokenizer.
    command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-ioFileList', '-preserveLines', 'mapping.txt']
    try:
        return_code = subprocess.call(command)
    finally:
        # Always remove the temporary mapping file, even if launching java fails.
        os.remove("mapping.txt")

    # Surface tokenizer failures instead of silently ignoring the exit status.
    if return_code != 0:
        print('PTBTokenizer exited with status %d' % return_code)

    time_end = time.time()
    print('Consume time:', time_end - time_start)

### 分句加分词（无法利用映射文件进行分词，有BUG，故只能一个一个文件分词）

In [32]:
import os
if DIVIDE_SENTENCE:
    # Local import so this cell does not depend on another cell's imports.
    import subprocess

    time_start = time.time()

    # The mapping-file approach is buggy for sentence splitting, so each story
    # is tokenized by its own DocumentPreprocessor invocation.
    command = ['java', 'edu.stanford.nlp.process.DocumentPreprocessor', '-preserveLines']

    stories = os.listdir(split_stories_dir)
    for s in stories:
        # Feed the files through stdin/stdout handles instead of building a
        # shell command line: paths containing spaces or shell metacharacters
        # would break (or be interpreted by) the shell redirection.
        with open(os.path.join(split_stories_dir, s), 'rb') as story_in, \
                open(os.path.join(tokenized_stories_dir, s), 'wb') as story_out:
            subprocess.call(command, stdin=story_in, stdout=story_out)

    time_end = time.time()
    print('Consume time:', time_end - time_start)

Consume time: 1.1641461849212646


# 写入 bin 文件

### 根据文件地址读文件，返回行列表

In [33]:
def read_text_file(text_file):
    """Return the lines of ``text_file`` as a list.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; blank lines are kept as empty strings.
    """
    with open(text_file, "r") as handle:
        return [raw_line.strip() for raw_line in handle]

### 根据文件内容获得文章和摘要

In [34]:
# Tags wrapped around every title line in the returned abstract string.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'


def get_art_abs(story_file):
    """Extract the article text and the tagged title from a story file.

    The file is read with read_text_file and lowercased. Non-empty lines
    before the first line starting with "@title" form the article; non-empty
    lines after it form the title. Marker lines themselves are dropped.

    Returns:
        (article, title): the article lines joined by single spaces, and the
        title lines each wrapped in SENTENCE_START / SENTENCE_END and joined
        by single spaces.
    """
    lowered = [raw.lower() for raw in read_text_file(story_file)]

    body_parts = []
    title_parts = []
    in_title = False
    for text in lowered:
        if not text:
            continue  # skip empty lines
        if text.startswith("@title"):
            # Everything after the marker belongs to the title.
            in_title = True
            continue
        if in_title:
            title_parts.append(text)
        else:
            body_parts.append(text)

    # Collapse the article into one string.
    article = ' '.join(body_parts)

    # Collapse the title into one string with <s> ... </s> around each line.
    tagged = ["%s %s %s" % (SENTENCE_START, part, SENTENCE_END) for part in title_parts]
    title = ' '.join(tagged)

    return article, title

### 写入 bin 文件

In [51]:
from tensorflow.core.example import example_pb2
import struct

def write_to_bin(src_dir, out_file):
    """Serialize every story file found in ``src_dir`` into ``out_file``.

    Each story is converted with get_art_abs into an (article, title) pair and
    stored as an ``example_pb2.Example`` with the byte features 'article' and
    'abstract'. Every record is written as an 8-byte length (struct format
    'q') followed by the serialized example bytes.

    Args:
        src_dir: directory whose files are read as tokenized stories.
        out_file: path of the binary file to create (overwritten if present).
    """
    with open(out_file, 'wb') as writer:

        for s in os.listdir(src_dir):
            # Read from the directory that was listed; the previous code joined
            # the name with the global tokenized_stories_dir and therefore
            # ignored src_dir whenever the two differed.
            story_file = os.path.join(src_dir, s)
            article, title = get_art_abs(story_file)

            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([article.encode()])
            tf_example.features.feature['abstract'].bytes_list.value.extend([title.encode()])
            tf_example_str = tf_example.SerializeToString()

            # Length-prefixed record: 8-byte length, then the payload.
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))

In [52]:
# Serialize all tokenized stories into test.bin inside finished_files_dir.
write_to_bin(tokenized_stories_dir, os.path.join(finished_files_dir, "test.bin"))

### 将文件分块，每个块的文章个数为 CHUNK_SIZE

In [57]:
CHUNK_SIZE = 1000 # num examples per chunk, for the chunked data

def chunk_file(set_name, in_dir=None, out_dir=None, chunk_size=None):
    """Split ``<set_name>.bin`` into numbered chunk files.

    The input is a sequence of records, each an 8-byte length (struct format
    'q') followed by that many payload bytes. Records are copied unchanged
    into files named ``<set_name>_NNN.bin``, at most ``chunk_size`` records
    per file. A chunk file is only created when at least one record is left
    to write, so no trailing empty chunk is produced when the record count is
    an exact multiple of the chunk size or the input is empty.

    Args:
        set_name: base name of the data set, e.g. 'test'.
        in_dir: directory containing ``<set_name>.bin``; defaults to
            finished_files_dir.
        out_dir: directory receiving the chunk files; defaults to chunks_dir.
        chunk_size: maximum number of records per chunk; defaults to
            CHUNK_SIZE.

    Raises:
        ValueError: if the chunk size is smaller than 1.
        struct.error: if the input ends in the middle of a record.
    """
    if in_dir is None:
        in_dir = finished_files_dir
    if out_dir is None:
        out_dir = chunks_dir
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_size < 1:
        # A non-positive size would never consume input and loop forever.
        raise ValueError("chunk_size must be at least 1, got %r" % (chunk_size,))

    in_file = os.path.join(in_dir, '%s.bin' % set_name)
    with open(in_file, 'rb') as reader:
        chunk = 0
        # Read one length prefix ahead so a new chunk file is opened only
        # when there is a record to put into it.
        len_bytes = reader.read(8)
        while len_bytes:
            chunk_fname = os.path.join(out_dir, '%s_%03d.bin' % (set_name, chunk)) # new chunk
            with open(chunk_fname, 'wb') as writer:
                for _ in range(chunk_size):
                    str_len = struct.unpack('q', len_bytes)[0]
                    # unpack raises struct.error on a short read, so a
                    # truncated record is never copied silently.
                    example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
                    writer.write(struct.pack('q', str_len))
                    writer.write(struct.pack('%ds' % str_len, example_str))
                    len_bytes = reader.read(8)
                    if not len_bytes:
                        break  # end of input
            chunk += 1
            
def chunk_all():
    """Chunk each selected data set with chunk_file and report the output directory."""
    # Only the 'test' set is chunked here; 'train' and 'val' are not processed.
    selected_sets = ('test',)
    for set_name in selected_sets:
        print("Splitting %s data into chunks..." % set_name)
        chunk_file(set_name)
    print("Saved chunked data in %s" % chunks_dir)
    
# Split the serialized data into chunk files.
chunk_all()

Splitting test data into chunks...
Saved chunked data in /media/tao/6F0855440D2070BB/code/python/Headline_Generation/Get to the point/cnn-dailymail-master/bytecup/finished_files/chunked
