In [27]:
import numpy as np
import os
from keras.layers import Embedding
from seq2seq import SimpleSeq2Seq, Seq2Seq, AttentionSeq2Seq
import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.layers import Input, Dense ,Dropout
from keras.models import Model,load_model
#############
# 写表文件
stem_tab_file = '../table/stem.txt'
dbn_tab_file = '../table/dbn.txt'
flag_tab_file = '../table/flag.txt'
#
#############
# 训练文件和测试文件
train_folder = '../data/train/'
test_folder = '../data/test/'
vaild_folder = '../data/vaild/'

dbn_file_in ='dbn.in.txt'
dbn_file_out ='dbn.out.txt'

flag_file_in ='flag.in.txt'
flag_file_out = 'flag.out.txt'
#
#############
# 保存模型
model_file ='./save_model/keras_seq2seq_embedding.h5'
if not os.path.exists('./save_model'):
    os.makedirs('./save_model')

In [2]:
# 解决keras 显存问题
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5
set_session(tf.Session(config=config)) 

In [3]:

class Base:
    def __init__(self,filename,num_word_threshold):
        self._bash_to_id = {}
        self._unk = -1
        self._pad = -1
        self._go   = -1
        self._eos  = -1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)
    
    def _read_dict(self,filename):
        with open(filename,'r') as f:
            lines = f.readlines()
        for line in lines:
            # print(line)
            word,frequency = line.strip('\r\n').split('\t')
            frequency = int(frequency)
            if frequency < self._num_word_threshold:
                continue
            idx = len(self._bash_to_id)
            if word == '<UNK>':
                self._unk = idx
            elif word == '<PAD>':
                self._pad = idx
#             elif word == '<GO>':
#                 self._go = idx
#             elif word == '<EOS>':
#                 self._eos = idx
            self._bash_to_id[word] = idx
            
    @property
    def unk(self):
         return self._unk
    @property
    def pad(self):
         return self._pad    
    @property
    def go(self):
         return self._go
    @property
    def eos(self):
         return self._eos 
        
    def size(self):
         return len(self._bash_to_id)
    def base_to_id(self,word):
        #完成
         return self._bash_to_id.get(word,self._unk)
    def sequence_to_id(self,sequeuce):
        word_ids = [self.base_to_id(cur_word) for cur_word in sequeuce.split()]
        return word_ids
    
    ### 
    # 不需要id2base 所以没写
    ###
num_word_threshold = 5000 # 频率低于5000的不要
base =Base(stem_tab_file,num_word_threshold)

In [4]:
# base.sequence_to_id("A G C U A C G G C C A U A C A U A G A U G A A A A U A C C G G A U C C C G U C C G A U C U C C G A A G U C A A G C A U C U A A U G G C G A C G U C A G U A C U G U G A U G G G G G A C C G C A C G G G A A U A C G U C G U G C U G U A G U")

In [5]:
class CategoryDict:
    def __init__(self,filename):
        self._category_to_id = {}
        self._id_to_category = {}
        with open(filename , 'r') as f:
            lines = f.readlines()
        for line in lines:
            category,_ = line.strip('\r\n').split('\t')
            idx = len(self._category_to_id)
            self._category_to_id[category] = idx
            self._id_to_category[idx] = category
    def size(self):
        return len(self._category_to_id)
    def category_to_id(self,category):
        if category not in self._category_to_id:
            raise Exception(
                    "%s is not in our category list" % category)
        return self._category_to_id[category]  
    def id_to_category(self,id):
        if id not in self._id_to_category:
            raise Exception(
                    "%s is not in our id list" % id)
        return self._id_to_category[id]  
    
    def sequence_to_id(self,sequeuce):
        
        word_ids = [self.category_to_id(cur_word) for cur_word in sequeuce.split()]
        return word_ids
    
    def id_to_sequence(self,ids):
        print(self._id_to_category)
        category = ' '.join([self.id_to_category(_id) for _id in ids])
        return category
    
category_base = CategoryDict(dbn_tab_file)
test_str = '. ( )'
test_str2 = [0,1,2,3]
print(category_base.id_to_sequence(test_str2))

{0: '<PAD>', 1: '.', 2: ')', 3: '('}
<PAD> . ) (


In [6]:
# 构建数据库 对其，shuffle，batch

num_timesteps = 300

class TextDataSet:
    def __init__(self, folder, vocab, category_vocab, num_timesteps,):
        infile = folder + dbn_file_in
        outfile = folder + dbn_file_out
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        # matrix 矩阵
        self._inputs = []# 所有数据的集合
        # vector 列表
        self._target = []# 
#         self._target_out = []# 
        self._label_len = []
        self._feature_len = []
        self._indicator = 0# batch的位置
        
        self._parse_file(infile,outfile) # 解析⽂件

        
        
    def _parse_file(self,infile,outfile): 
        """ 解析文件 """
        
        print('Loading data from %s',infile)
        print('Loading data from %s',outfile)
        with open(infile,'r') as fi:
            inlines = fi.readlines()
        with open(outfile,'r')as fo:
            outlines = fo.readlines()
            
        for line in inlines:
            line = line.strip('\r\n').strip()
            id_feature = self._vocab.sequence_to_id(line)
            id_feature = id_feature[0:self._num_timesteps] #保证不会过长
            feature_len = len(id_feature)
            self._feature_len.append(feature_len)
            padding_num = self._num_timesteps - feature_len # 如果id_words⼩于这个num_timesteps
            id_feature = id_feature + list(self._vocab.pad for i in range(padding_num))
            self._inputs.append(id_feature)
            
        for line in outlines:
            line = line.strip('\r\n')
            id_label = self._vocab.sequence_to_id(line)
            id_label = id_label[0:self._num_timesteps]#600
            label_len = len(id_label)
            self._label_len.append(label_len)
            padding_num = self._num_timesteps - len(id_label) # 如果id_words⼩于这个num_timesteps
            id_label = id_label + list(self._vocab.pad for i in range(padding_num))
            self._target.append(id_label)
            
        self._inputs = np.asarray(self._inputs, dtype = np.int32)
        self._target = np.asarray(self._target, dtype = np.int32)
#         self._target_out = np.asarray(self._target_out, dtype = np.int32)
        self._label_len = np.asarray(self._label_len, dtype = np.int32)
        self._feature_len = np.asarray(self._feature_len, dtype = np.int32)
        self._random_shuffle()

    def _random_shuffle(self):
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._target = self._target[p]
        self._label_len = self._label_len[p]
        self._feature_len = self._feature_len[p]

    def next_batch(self,batch_size):
        end_indicator = self._indicator +batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        if end_indicator > len(self._inputs):# 则说明batchsize ⽐样本空间还要⼤
            raise Exception("batch_size:%d is too large"% batch_size)
        
        batch_inputs = self._inputs[self._indicator:end_indicator]
        batch_target = self._target[self._indicator:end_indicator]
        inputs_len = self._feature_len[self._indicator:end_indicator]
        outputs_len =  self._label_len[self._indicator:end_indicator]
        
        
        self._indicator = end_indicator
        return (batch_inputs,inputs_len),(batch_target,outputs_len)
train_dataset = TextDataSet(
    train_folder, base, category_base, num_timesteps)
print('done')
val_dataset = TextDataSet(
    vaild_folder, base, category_base, num_timesteps)
print('done')
test_dataset = TextDataSet(
    test_folder, base, category_base, num_timesteps)
print('done')


Loading data from %s ../data/train/dbn.in.txt
Loading data from %s ../data/train/dbn.out.txt
done
Loading data from %s ../data/vaild/dbn.in.txt
Loading data from %s ../data/vaild/dbn.out.txt
done
Loading data from %s ../data/test/dbn.in.txt
Loading data from %s ../data/test/dbn.out.txt
done


In [7]:
a = train_dataset.next_batch(2)
type(a)
# print(val_dataset.next_batch(2)) 
# print(test_dataset.next_batch(2))

tuple

In [8]:
# 写数据
def write_log(callback, names, logs, batch_no):

    for (name, value) in zip(names, logs):
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.simple_value = value
        summary_value.tag = name
        callback.writer.add_summary(summary, batch_no)
        callback.writer.flush()
        
def write_log_batch(callback, name, log, batch_no):
    value = log
    summary = tf.Summary()
    summary_value = summary.value.add()
    summary_value.simple_value = value
    summary_value.tag = name
    callback.writer.add_summary(summary, batch_no)
    callback.writer.flush()

In [48]:
input_length = 300
input_dim = 600

output_length = 300
output_dim = 3

samples = 16
hidden_dim = 24

seq_embedding = 5



encoder_embed = Embedding(seq_embedding, input_dim, mask_zero=True, name='encoder_embed',input_length=300)


embed = Dropout(0.4)(encoder_embed)
print(embed.shape)
output = AttentionSeq2Seq(output_dim=output_dim, hidden_dim=hidden_dim, output_length=output_length,
                          input_shape=(input_length,input_dim))(embed)

model = Model(e_input, output)
if os.path.exists(model_file):
    model.load_weights(model_file)
    print("load weight from ",model_file)
model.compile(loss='mse', optimizer='sgd',metrics=['accuracy'])


log_path = './graph'
callback = TensorBoard(log_path)
callback.set_model(model)
train_names = 'train_loss'
val_names = 'val_loss'
for batch_no in range(100000):
    #####
    # 处理数据
    X_train_all, Y_train_all = train_dataset.next_batch(16)
    X_train,length = X_train_all
    Y_train,length = Y_train_all
    X_train = X_train[:,:,np.newaxis]
    Y_train = Y_train[:,:,np.newaxis]
    # 处理数据
    #######
#   (samples, input_length, input_dim) 16,300,1
    logs = model.train_on_batch(X_train, Y_train)

    print("[train]:step:%d,loss:%f,acc:%f "%(batch_no,logs[0],logs[1]))
    write_log_batch(callback, train_names, logs[0], batch_no)
    if batch_no % 10 == 0:
        model.save_weights(model_file)
        #####
        # 处理数据
        X_val_all, Y_val_all = val_dataset.next_batch(16)
        X_val,length = X_val_all
        Y_val,length = Y_val_all
        X_val = X_val[:,:,np.newaxis]
        Y_val = Y_val[:,:,np.newaxis]
        # 处理数据
        #######
#         logs = model.test_on_batch(X_val, Y_val)
        score = model.evaluate(X_val, Y_val, verbose=0)

        print("[vaild]:step:%d,loss:%f,acc:%f "%(batch_no,score[0],score[1]))
        
        write_log_batch(callback, val_names, logs[0], batch_no//10)

ValueError: Layer dropout_38 was called with an input that isn't a symbolic tensor. Received type: <class 'keras.layers.embeddings.Embedding'>. Full input: [<keras.layers.embeddings.Embedding object at 0x7f1beccbad30>]. All inputs to the layer should be tensors.

In [22]:
# Test
X_test_all,Y_test_all = test_dataset.next_batch(1)
X_test,length = X_test_all
Y_test,length = Y_test_all
X_test = X_test[:,:,np.newaxis]
Y_test = Y_test[:,:,np.newaxis]
Y_hat = model.predict(X_test)
Y_hat = np.squeeze(Y_hat)
print(Y_hat)

[ 1.39549881e-01  7.17206597e-02  1.91469491e-02 -1.15999440e-02
 -2.58289631e-02 -2.94791106e-02 -2.69863177e-02 -2.13196594e-02
 -1.43572958e-02 -7.24183768e-03 -6.45056251e-04  5.05234487e-03
  9.64918546e-03  1.30563406e-02  1.52599439e-02  1.63010228e-02
  1.62643008e-02  1.52714597e-02  1.34751387e-02  1.10527994e-02
  8.19986593e-03  5.12337685e-03  2.03495938e-03 -8.53746897e-04
 -3.33608943e-03 -5.21383947e-03 -6.30143890e-03 -6.43619942e-03
 -5.49441995e-03 -3.42649361e-03 -3.06189060e-04  3.60551407e-03
  7.81962741e-03  1.16519425e-02  1.43788233e-02  1.54796988e-02
  1.48286978e-02  1.27052087e-02  9.63252783e-03  6.17038831e-03
  2.77342671e-03 -2.59965658e-04 -2.77598971e-03 -4.72863950e-03
 -6.14094455e-03 -7.07413396e-03 -7.60536129e-03 -7.81414378e-03
 -7.77498633e-03 -7.55256973e-03 -7.20157707e-03 -6.76606689e-03
 -6.28032489e-03 -5.77112567e-03 -5.25746867e-03 -4.75304713e-03
 -4.26736847e-03 -3.80561897e-03 -3.37125571e-03 -2.96511385e-03
 -2.58701458e-03 -2.23581