In [2]:
import codecs
import collections
from operator import itemgetter

#### 将词汇映射成词汇表

In [6]:
# Raw training corpus (plain text)
RAW_DATA="data/simple-examples/data/ptb.train.txt"
# Where the generated vocabulary is written, one word per line
VOCAB_OUTPUT="data/simple-examples/data/ptb.vocab"

# Tally how often each whitespace-separated token occurs in the corpus
counter=collections.Counter()
with codecs.open(RAW_DATA,"r","utf-8") as corpus:
    for sentence in corpus:
        counter.update(sentence.split())


# Order the (word, count) pairs from most to least frequent
sorted_word_to_cnt=sorted(counter.items(),key=itemgetter(1),reverse=True)

# The end-of-sentence marker "<eos>" goes first, then the words in frequency order
sorted_words=["<eos>"]
sorted_words.extend(word for word, _ in sorted_word_to_cnt)

# Emit the vocabulary: every entry on its own newline-terminated line
with codecs.open(VOCAB_OUTPUT,'w','utf-8') as file_output:
    file_output.write('\n'.join(sorted_words)+'\n')

#### 将训练文件，测试文件等根据词汇文件转换为单词编号
- 每个单词的编号就是它在词汇文件中的行号（从 0 开始）

In [13]:
import sys
VOCAB="data/simple-examples/data/ptb.vocab"
OUTPUT_DATA="data/simple-examples/data/ptb.train"

# Load the vocabulary (one word per line) and give each word the 0-based
# index of the line it appears on
with codecs.open(VOCAB, "r", "utf-8") as f_vocab:
    vocab = [entry.strip() for entry in f_vocab.readlines()]
word_to_id = {token: index for index, token in enumerate(vocab)}


# Words missing from the vocabulary (low-frequency words) map to the "<unk>" id
def get_id(word):
    """Return the vocabulary id of ``word``.

    Looks ``word`` up in the module-level ``word_to_id`` mapping. A word that
    is not a key of the mapping gets the id stored under ``"<unk>"``; if
    ``"<unk>"`` itself is absent in that case, a ``KeyError`` is raised.
    """
    if word in word_to_id:
        return word_to_id[word]
    return word_to_id["<unk>"]


# Rewrite the raw corpus as word ids: one output line per input line, ids
# separated by single spaces. Both files are opened in a `with` statement so
# they are flushed and closed even if a lookup or write raises part-way through.
with codecs.open(RAW_DATA, "r", "utf-8") as fin, \
        codecs.open(OUTPUT_DATA, 'w', 'utf-8') as fout:
    for line in fin:
        # Tokenise on whitespace and terminate every sentence with "<eos>"
        words = line.strip().split() + ["<eos>"]
        # Replace each word with its vocabulary id
        out_line = ' '.join([str(get_id(w)) for w in words]) + '\n'
        fout.write(out_line)


#### PTB数据的BATCHING
- 该数据集不大，一次性读入内存



In [10]:
import tensorflow as tf
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [12]:
# Path of the id-encoded training file that main() hands to read_data()
TRAIN_DATA="data/simple-examples/data/ptb.train"
# Passed by main() as make_batch()'s batch_size: the number of rows in each batch array
TRAIN_BATCH_SIZE=20
# Passed by main() as make_batch()'s num_step: the number of columns in each batch array
TRAIN_NUM_STEP=35

# Load a file of whitespace-separated word ids into one flat list of ints
def read_data(file_path):
    """Read every id in ``file_path`` and return them as a list of ints.

    The whole file is read at once; line breaks are treated like any other
    whitespace, so ids from all lines end up in a single sequence in file
    order. ``int()`` raises ``ValueError`` on a token that is not an integer.
    """
    with open(file_path,'r') as fin:
        tokens=fin.read().split()
    return [int(token) for token in tokens]

def make_batch(id_list,batch_size,num_step):
    """Cut a flat sequence of word ids into (input, label) batch pairs.

    Args:
        id_list: sequence of integer word ids.
        batch_size: number of rows in every batch array.
        num_step: number of columns (consecutive ids per row) in every batch array.

    Returns:
        A list of ``(data, label)`` tuples. Both elements are NumPy arrays of
        shape ``[batch_size, num_step]``; ``label`` holds the ids that follow
        the corresponding ``data`` ids by one position in ``id_list``. Trailing
        ids that do not fill a whole batch are dropped. An empty list is
        returned when ``id_list`` is too short to form a single batch.
    """
    # Each batch consumes batch_size*num_step ids. One id is held back (the
    # "-1") so the label stream, shifted right by one, never runs off the end.
    words_per_batch=batch_size*num_step
    num_batches=(len(id_list)-1)//words_per_batch
    if num_batches<=0:
        # Not enough ids for even one batch: nothing to split.
        return []
    total=num_batches*words_per_batch

    # Lay the ids out as batch_size long rows, then slice the rows column-wise
    # into num_batches chunks of num_step columns each.
    data=np.array(id_list[:total])
    data=np.reshape(data,[batch_size,num_batches*num_step])
    data_batches=np.split(data,num_batches,axis=1)

    # Same layout for the targets, built from the sequence shifted by one so
    # that label[i][j] is the id following data[i][j].
    label=np.array(id_list[1:total+1])
    label=np.reshape(label,[batch_size,num_batches*num_step])
    label_batches=np.split(label,num_batches,axis=1)

    return list(zip(data_batches,label_batches))
def main():
    """Read the training ids from TRAIN_DATA and cut them into batches.

    The batches are only bound to a local name; nothing is returned.
    """
    id_list=read_data(TRAIN_DATA)
    train_batches=make_batch(id_list,TRAIN_BATCH_SIZE,TRAIN_NUM_STEP)

# Run main() only when this code executes as the top-level program (script or
# notebook kernel). In that case __name__ is '__main__' -- with both pairs of
# underscores; a bare 'main' never matches.
if __name__=='__main__':
    main()