In [1]:
import os
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Input, LSTM, RepeatVector, Embedding
from keras.layers.core import Activation, Dense, Masking
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

Using TensorFlow backend.


In [2]:
class Autoencoder(object):
    """LSTM sequence autoencoder assembled with the Keras functional API.

    Workflow: build the graph with ``model`` (plain input) or ``modelMasking``
    (input passed through a ``Masking`` layer), then call ``compile``, ``fit``
    and ``predict``.

    Note: building stores the Keras model in ``self.model``. On the instance
    this attribute shadows the ``model`` method, so ``model`` can only be
    called once per instance; external code reads ``autoencoder.model.layers``
    and relies on this attribute name.
    """

    def __init__(self):
        # Training settings consumed by ``compile`` and ``fit``.
        self.nb_epoch = 10  # epochs
        self.batch_size = 256
        self.shuffle = True
        self.validation_split = 0.05
        self.optimizer = 'adadelta'
        self.loss = 'mse'

    def _build_lstm_autoencoder(self, codeLayerType, inputDim, codeDim, use_masking):
        """Shared graph builder behind ``model`` and ``modelMasking``.

        Encoder: one LSTM per entry of ``codeDim``; every LSTM except the last
        returns sequences. Decoder: ``RepeatVector(inputDim[0])`` followed by
        LSTMs sized like ``codeDim[:-1]`` in reverse order, then a final LSTM
        with ``inputDim[-1]`` units; all decoder LSTMs return sequences.
        """
        self.codeLayerType = codeLayerType
        assert len(codeDim) > 0

        if self.codeLayerType != 'lstm':
            # Previously this case was silently ignored, leaving no Keras
            # model behind and deferring the failure to ``compile``.
            raise ValueError(
                "Unsupported codeLayerType %r; only 'lstm' is implemented." % (codeLayerType,))

        assert len(inputDim) == 2
        inputData = Input(shape=(inputDim[0], inputDim[1]))
        if use_masking:
            encoded = Masking(mask_value=0.)(inputData)
        else:
            encoded = inputData

        # Encoder stack: only the last LSTM collapses the sequence to a vector.
        last_index = len(codeDim) - 1
        for i, units in enumerate(codeDim):
            encoded = LSTM(units, return_sequences=(i != last_index))(encoded)

        # Decoder stack: repeat the code once per time step, then mirror the
        # encoder widths (excluding the code width itself).
        decoded = RepeatVector(inputDim[0])(encoded)
        for units in reversed(codeDim[:-1]):
            decoded = LSTM(units, return_sequences=True)(decoded)
        decoded = LSTM(inputDim[-1], return_sequences=True)(decoded)

        self.model = Model(inputData, decoded)

    # Model for fixed-length sequences.
    def model(self, codeLayerType, inputDim, codeDim):
        """Build the autoencoder for fixed-length input.

        :param codeLayerType: layer family; only ``'lstm'`` is implemented
        :param inputDim: two-element sequence, used as ``Input(shape=(inputDim[0], inputDim[1]))``
        :param codeDim: non-empty sequence of LSTM unit counts for the encoder
        :raises ValueError: if ``codeLayerType`` is not ``'lstm'``
        :raises AssertionError: if ``codeDim`` is empty or ``len(inputDim) != 2``
        """
        self._build_lstm_autoencoder(codeLayerType, inputDim, codeDim, use_masking=False)

    def modelMasking(self, codeLayerType, inputDim, codeDim):
        """Build the autoencoder with a ``Masking(mask_value=0.)`` layer after the input.

        Parameters and errors are the same as for ``model``. The resulting
        layer order starts with input, masking, first encoder LSTM.
        """
        self._build_lstm_autoencoder(codeLayerType, inputDim, codeDim, use_masking=True)

    def compile(self, *args):
        """Compile ``self.model`` with the configured optimizer.

        * no argument: loss ``self.loss``
        * ``'temporal'``: loss ``self.loss`` with ``sample_weight_mode='temporal'``
        * ``'customFunction'``: loss ``self.weighted_vector_mse``

        :raises ValueError: for any other argument or for more than one argument
        """
        if len(args) == 0:
            self.model.compile(optimizer=self.optimizer, loss=self.loss)
        elif len(args) == 1:
            mode = args[0]
            if mode == 'temporal':
                self.sample_weight_mode = mode
                self.model.compile(optimizer=self.optimizer,
                                   loss=self.loss,
                                   sample_weight_mode=self.sample_weight_mode)
            elif mode == 'customFunction':
                self.model.compile(optimizer=self.optimizer, loss=self.weighted_vector_mse)
            else:
                # The message now names the values this method really accepts.
                raise ValueError(
                    "Invalid maskType %r, please input 'temporal' or 'customFunction'" % (mode,))
        else:
            raise ValueError("argument # must be 0 or 1.")

    def fit(self, *args):
        """Train the autoencoder to reconstruct its input.

        Call as ``fit(data, order)`` or ``fit(data, order, sample_weights)``.

        :param data: training input passed to Keras as ``x``
        :param order: ``'nor'`` to reconstruct ``data`` as is, ``'rev'`` to
            reconstruct ``np.flip(data, 1)``
        :param sample_weights: optional; stored in ``self.sampleWeights`` and
            forwarded as ``sample_weight``. The weights are not flipped here,
            so for ``'rev'`` they must already line up with the flipped target.
        :raises ValueError: for a wrong number of arguments or an unknown ``order``
        """
        if len(args) not in (2, 3):
            # Previously any other argument count was a silent no-op.
            raise ValueError(
                "fit expects (data, 'nor' or 'rev') or (data, 'nor' or 'rev', sample_weights); "
                "got %d argument(s)." % len(args))

        data, order = args[0], args[1]
        extra = {}
        if len(args) == 3:
            self.sampleWeights = args[2]
            extra['sample_weight'] = self.sampleWeights

        if order == 'nor':
            target = data
        elif order == 'rev':
            target = np.flip(data, 1)
        else:
            raise ValueError("decoding sequence type must be 'nor' or 'rev', got %r." % (order,))

        self.model.fit(data,
                       target,
                       epochs=self.nb_epoch,
                       batch_size=self.batch_size,
                       shuffle=self.shuffle,
                       validation_split=self.validation_split,
                       **extra)

    def predict(self, data):
        """Return ``self.model.predict(data)``."""
        return self.model.predict(data)

    def weighted_vector_mse(self, y_true, y_pred):
        """Custom loss: squared error weighted element-wise by ``ceil(y_true)``.

        Entries where ``y_true`` is exactly 0 get weight 0 and therefore do not
        contribute. NOTE(review): presumably targets are scaled into [0, 1] so
        that every non-zero entry gets weight 1 - confirm with the data.
        """
        # Kept as attributes because the original exposed them on the instance.
        self.y_true = y_true
        self.y_pred = y_pred
        weight = tf.ceil(self.y_true)  # round up
        loss = tf.square(weight * (self.y_true - self.y_pred))
        # Sum over axis 1, then average the remaining entries.
        return tf.reduce_mean(tf.reduce_sum(loss, axis=1))

In [3]:
class LSTM_Autoencoder(object):
    """Wrapper that builds a masked LSTM ``Autoencoder`` and exposes its encoder.

    The wrapped autoencoder is built with ``modelMasking``, compiled in
    ``'temporal'`` mode and fitted against the time-reversed input. One of the
    ``get_hidden_layer_*`` methods must be called before
    ``get_hidden_representation``.
    """

    def __init__(self, input_dim, time_step, hidden_dim):
        """Store the dimensions and build the masked autoencoder graph.

        :param input_dim: feature count per time step
        :param time_step: number of time steps per sequence
        :param hidden_dim: sequence of LSTM unit counts passed on as ``codeDim``
        """
        self.input_dim, self.time_step, self.hidden_dim = input_dim, time_step, hidden_dim
        wrapped = Autoencoder()
        self.autoencoder = wrapped
        wrapped.modelMasking('lstm', [self.time_step, self.input_dim], self.hidden_dim)

    def compile(self):
        """Compile the wrapped autoencoder in ``'temporal'`` mode."""
        self.autoencoder.compile('temporal')

    def fit(self, data, weights):
        """Fit the wrapped autoencoder on ``data`` with the ``'rev'`` target and ``weights``."""
        self.autoencoder.fit(data, 'rev', weights)

    def get_hidden_layer_last_step(self):
        """Set ``hidden_representation`` to a Sequential of the first three trained layers."""
        self.hidden_representation = extractor = Sequential()
        for position in range(3):
            extractor.add(self.autoencoder.model.layers[position])

    def get_hidden_layer_sequence(self):
        """Set ``hidden_representation`` to a model returning the first LSTM's output per step.

        A fresh input -> masking -> LSTM graph is created; the LSTM is
        initialised with the weights of layer 2 of the trained model and has
        ``return_sequences=True``.
        """
        trained_weights = self.autoencoder.model.layers[2].get_weights()
        seq_input = Input(shape=(self.time_step, self.input_dim))
        masked = Masking(mask_value=0.)(seq_input)
        first_lstm = LSTM(self.hidden_dim[0],
                          return_sequences=True,
                          weights=trained_weights)
        self.hidden_representation = Model(seq_input, first_lstm(masked))

    def get_hidden_representation(self, data):
        """Return ``hidden_representation.predict(data)``."""
        extractor = self.hidden_representation
        return extractor.predict(data)

In [4]:
def sample_shuffle(X):
    """Return a new array holding the entries of ``X`` in random order.

    The permutation is drawn from NumPy's global random state; ``X`` itself is
    indexed with an index array, so it is expected to be a NumPy array.
    """
    order = np.arange(len(X))
    np.random.shuffle(order)  # shuffles the index vector in place
    shuffled = X[order]
    return np.array(shuffled)

def seq_padding(sample_sequence, max_length, padding_type):
    """Forward to Keras ``pad_sequences`` with ``dtype='float'``.

    :param sample_sequence: sequences to pad
    :param max_length: passed as ``maxlen``
    :param padding_type: passed as ``padding``
    :return: whatever ``pad_sequences`` returns for these arguments
    """
    options = dict(maxlen=max_length, dtype='float', padding=padding_type)
    return pad_sequences(sample_sequence, **options)

In [5]:
def _leading_layers_model(trained_model, n_layers):
    """Stack the first ``n_layers`` layers of ``trained_model`` into a new Sequential model."""
    extractor = Sequential()
    for layer in trained_model.layers[:n_layers]:
        extractor.add(layer)
    return extractor


def _fixed_length_hid_repre(fea_dim, hid_dim):
    """Fixed-length case: train on 3000 shuffled x_ben samples with 20 time steps."""
    # Load data
    # NOTE(review): both paths name a directory, not a .npy file; the file
    # names appear to be missing and must be filled in before this can run.
    x_ben = np.load('data/wiki/', encoding='bytes')  # fixed-length sequence
    x_van = np.load('data/wiki/', encoding='bytes')
    print(x_ben.shape, x_van.shape)
    x_ben = sample_shuffle(x_ben)[0:6000]
    x_van = sample_shuffle(x_van)[0:3000]
    train_ben = x_ben[0:3000]

    # Fit model
    timesteps = 20
    autoencoder = Autoencoder()
    autoencoder.model('lstm', [timesteps, fea_dim], hid_dim)
    autoencoder.compile()
    autoencoder.fit(train_ben, "rev")

    # Input layer + first LSTM form the encoder that is evaluated.
    hidModel = _leading_layers_model(autoencoder.model, 2)
    return hidModel.predict(x_ben), hidModel.predict(x_van)


def _variable_length_hid_repre(ben_path, van_path, n_train, max_len, fea_dim, hid_dim):
    """Variable-length case: pad to ``max_len`` and train a masked autoencoder.

    NOTE(review): if the .npy files hold object arrays of variable-length
    sequences, recent NumPy releases need ``allow_pickle=True`` to load them -
    confirm against the NumPy version in use.
    """
    x_ben = np.load(ben_path, encoding='bytes')
    x_van = np.load(van_path, encoding='bytes')
    x_ben = sample_shuffle(x_ben)
    x_van = sample_shuffle(x_van)
    train_ben = x_ben[0:n_train]

    # One weight of 1 per real time step of every training sequence.
    sampleWeights = [np.ones(len(seq)) for seq in train_ben]

    train_ben_P = pad_sequences(train_ben, maxlen=max_len, dtype='float')
    x_ben_P = pad_sequences(x_ben, maxlen=max_len, dtype='float')
    x_van_P = pad_sequences(x_van, maxlen=max_len, dtype='float')

    # The decoding target is the reversed sequence, so the weights are padded
    # at the end ('post') while the data above uses the default padding side.
    sampleWeights = pad_sequences(sampleWeights, maxlen=max_len, dtype='float', padding='post')

    autoencoder = Autoencoder()
    autoencoder.modelMasking('lstm', [max_len, fea_dim], hid_dim)
    autoencoder.compile('temporal')
    autoencoder.fit(train_ben_P, 'rev', sampleWeights)

    # Input layer + masking + first LSTM form the encoder that is evaluated.
    hidModel = _leading_layers_model(autoencoder.model, 3)
    return hidModel.predict(x_ben_P), hidModel.predict(x_van_P)


def gen_hid_repre(fea_dim, hid_dim, fix_or_var, step_length):
    """Train an LSTM autoencoder on x_ben and return encoder outputs for x_ben and x_van.

    The embeddings are only returned; saving them (e.g. with ``np.save``) is
    left to the caller.

    :param fea_dim: input dimension of LSTM-AE model
    :param hid_dim: sequence of LSTM unit counts; the output is taken from the
        first encoder LSTM, so its dimension is ``hid_dim[0]``
    :param fix_or_var: editing sequence is fixed-length (1) or variant-length (0).
    :param step_length: padded sequence length for the variant-length case,
        20 or 50; not used when ``fix_or_var == 1`` (20 time steps are used)
    :return: tuple ``(ben_hid_emd, van_hid_emd)`` with the hidden
        representations of the two data sets
    :raises ValueError: if ``fix_or_var`` is neither 0 nor 1, or if
        ``step_length`` is neither 20 nor 50 in the variant-length case
    """
    # Fixed-length sequence handling
    if fix_or_var == 1:
        return _fixed_length_hid_repre(fea_dim, hid_dim)

    # Variable-length sequence handling
    if fix_or_var == 0:
        if step_length == 20:
            # NOTE(review): both paths name a directory, not a .npy file; the
            # file names appear to be missing and must be filled in.
            return _variable_length_hid_repre(
                'data/wiki/', 'data/wiki/', 10000, 20, fea_dim, hid_dim)
        if step_length == 50:
            return _variable_length_hid_repre(
                'data/wiki/X_v8_4_50_Ben.npy', 'data/wiki/X_v8_4_50_Van.npy',
                7000, 50, fea_dim, hid_dim)
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError("step_length must be 20 or 50, got %r." % (step_length,))

    # Previously this fell through and failed with UnboundLocalError.
    raise ValueError("fix_or_var must be 1 (fixed) or 0 (variable), got %r." % (fix_or_var,))

In [6]:
# Variable-length run: 8 input features, a single 200-unit LSTM code layer,
# sequences padded to 50 steps.
ben_hid_emd, van_hid_emd = gen_hid_repre(
    fea_dim=8,
    hid_dim=[200],
    fix_or_var=0,
    step_length=50,
)

Train on 6650 samples, validate on 350 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# Display the embeddings returned for x_ben (NumPy abbreviates the printed array).
ben_hid_emd

array([[ 0.05116682, -0.07898297,  0.05688819, ...,  0.09775726,
        -0.03702514, -0.1548021 ],
       [ 0.06015232, -0.14864229,  0.08259501, ...,  0.28237966,
        -0.15756416, -0.26166925],
       [ 0.06606249, -0.10603029,  0.0479022 , ...,  0.1518603 ,
        -0.08845998, -0.19822004],
       ...,
       [ 0.06719301, -0.10862384,  0.05089102, ...,  0.14848088,
        -0.09090625, -0.19644676],
       [-0.00965766, -0.15456763,  0.12148448, ...,  0.14247192,
        -0.1070011 , -0.19472712],
       [ 0.0288748 , -0.1534694 ,  0.17773622, ...,  0.18878806,
        -0.10143352, -0.26629752]], dtype=float32)