In [207]:
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input
from keras.models import Model

from keras import backend as K
from keras.layers import LSTM, Bidirectional, concatenate, Add, Lambda, RepeatVector, TimeDistributed
from attention_decoder import AttentionDecoder

import numpy as np

import re
import os
import pickle

In [2]:
# Filesystem locations. These are machine-specific absolute Windows paths; adjust them before running elsewhere.
GLOVE_DIR = r'D:\projects\nlp\embeddings'  # directory holding the glove.6B.<dim>d.txt file read below
SAVE_BIN_DIR = r'D:\projects\nlp\summarization\stories\all_bin'  # directory of pickled story/highlight batch files
VOCAB_DIR = r'D:\projects\nlp\summarization\stories\vocab'  # directory containing vocab.pkl

# Width of the GloVe vectors; also selects which glove.6B.<dim>d.txt file is opened.
EMBEDDING_DIM = 100
# Alias used by the model-definition cells.
EMBED_SIZE = EMBEDDING_DIM

In [3]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(EMBEDDING_DIM))
# The context manager closes the file even when a line fails to parse.
with open(glove_path, 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of raising IndexError
        # First token is the word, the remaining tokens are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [4]:
# Load the vocabulary. Judging by its use below, it maps word -> integer row index.
# NOTE: pickle.load can execute arbitrary code; only load vocab files you created yourself.
with open(os.path.join(VOCAB_DIR, 'vocab.pkl'), 'rb') as vocab_file:
    word_index = pickle.load(vocab_file)

# Mean GloVe vector: the fallback for vocabulary words without a pre-trained embedding.
emb_mat = np.array(list(embeddings_index.values()))
ave_emb = emb_mat.mean(axis=0)
# Row 0 is all zeros; every other row starts out as the mean vector.
emb_matrix = np.array([np.zeros(len(ave_emb))] + [ave_emb] * len(word_index)) # 0 reserved for padding

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from the GloVe index keep the mean vector assigned above
        # (they are NOT zeroed; only the padding row 0 is all zeros).
        emb_matrix[i] = embedding_vector

In [5]:
# Number of embedding rows: one per word_index entry plus the padding row 0.
VOC_SIZE = len(emb_matrix)

# Design Model

In [6]:
# Hidden size of each encoder LSTM direction; the decoder LSTM and the projected encoder states use 2*LATENT_DIM.
LATENT_DIM = 100

In [176]:
# Clear the Keras backend session before (re)building the graph in this and the following cells.
K.clear_session()

# Encoder embedding
# Variable-length sequence of integer word ids.
enc_input = Input(shape=(None,), name='EncoderInputLayer') 
# Frozen embedding initialised from emb_matrix.
# NOTE(review): labelled "Shared Embedding", but the decoder cell builds its own Embedding layer
# from the same matrix rather than reusing this layer object.
enc_emb_layer = Embedding(VOC_SIZE, EMBED_SIZE, weights=[emb_matrix], trainable=False, name='EncEmbLayer') # Shared Embedding
enc_emb_output = enc_emb_layer(enc_input) # [batchSize, NumOfWords_Enc, EMBED_SIZE]

#BiLSTM encoder
# return_sequences=True keeps the per-timestep outputs; return_state=True additionally returns the
# final hidden (h) and cell (c) state of the forward and of the backward LSTM.
enc_lstm_layer = Bidirectional(LSTM(LATENT_DIM, return_sequences=True, return_state = True, dropout = 0.3), name='EncBiLSTMLayer')
enc_output, fw_h, fw_c, bw_h, bw_c = enc_lstm_layer(enc_emb_output)

In [177]:
# Join the forward and backward final encoder states (width 2*LATENT_DIM each after the concat).
# These feed the decoder's initial state below.
enc_state_h = concatenate([fw_h, bw_h], name='EncConcatH')
enc_state_c = concatenate([fw_c, bw_c], name='EncConcatC')

# Learned linear projections (no activation) of the joined states; width stays 2*LATENT_DIM.
enc_state_h = Dense(LATENT_DIM*2, name='EncDenseH')(enc_state_h)
enc_state_c = Dense(LATENT_DIM*2, name='EncDenseC')(enc_state_c)

# dec initial state
# [h, c] pair handed to the decoder LSTM as initial_state.
dec_ini_state = [enc_state_h, enc_state_c]

In [178]:
# Decoder
# Variable-length sequence of integer word ids fed to the decoder.
dec_input = Input(shape=(None,), name='DecoderInputLayer')
# Separate frozen embedding layer, initialised from the same emb_matrix as the encoder's.
dec_emb_layer = Embedding(VOC_SIZE, EMBED_SIZE, weights=[emb_matrix], trainable=False, name='DecoderEmbedLayer')
dec_emb_output = dec_emb_layer(dec_input)

# decoder LSTM
# Width is 2*LATENT_DIM, the same width as the projected encoder states used as initial_state.
dec_lstm_layer = LSTM(LATENT_DIM*2, return_sequences=True, return_state=True, dropout=0.3, name='DecLSTMLayer')
dec_output, state_h, state_c = dec_lstm_layer(dec_emb_output, initial_state=dec_ini_state)
# Final decoder hidden and cell state side by side (width 4*LATENT_DIM).
dec_state = concatenate([state_h, state_c], name='DecConcatHC')

# decoder feature
# Built from the decoder's FINAL state only, so there is one feature vector per sample, not per timestep.
dec_feat = Dense(LATENT_DIM, name='DecFeat1')(dec_state) # [batchSize, LATENT_DIM]
#dec_feat = Lambda(lambda x: K.expand_dims(x, axis=1), name='DecFeat2')(dec_feat)
#dec_feat = Lambda(lambda x: K.expand_dims(x, axis=1), name='DecFeat3')(dec_feat)

# decoder Distribution
# Softmax over the vocabulary at every decoder timestep.
dec_dist = Dense(VOC_SIZE, activation='softmax', name='DecDistribution')(dec_output) # [batchSize, NumOfWord_Dec, VOC_SIZE]

In [214]:
# Attention Mechanism
# Work-in-progress: builds per-encoder-position scores and a context vector; the context vector is
# not yet connected to the output distribution.

# reshape enc_output
#enc_feat = Lambda(lambda x: K.expand_dims(x, axis=1), name='EncFeat1')(enc_output)
# Dense acts on the last axis, so the time axis of enc_output (return_sequences=True) is kept.
enc_feat = Dense(LATENT_DIM, name='EncFeat2')(enc_output) # [batchSize, NumOfWords_Enc, LATENT_DIM]

# attention distribution
# dec_feat is [batchSize, LATENT_DIM]; presumably Add broadcasts it across the encoder time axis — TODO confirm.
attn_dist = Add(name='AttnDist1')([enc_feat, dec_feat])
attn_dist = Dense(LATENT_DIM, name='AttnDist2')(attn_dist) # [batchSize, NumOfWords_Enc, LATENT_DIM]
# Collapse the feature axis to one score per encoder position: [batchSize, NumOfWords_Enc].
# NOTE(review): no softmax or other normalisation is applied to these scores in this cell.
attn_dist = Lambda(lambda x: K.sum(x, axis=2), name='AttnDist3')(attn_dist)
# Re-add a trailing axis and tile it EMBED_SIZE times so the scores line up element-wise with
# enc_emb_output: [batchSize, NumOfWords_Enc, EMBED_SIZE].
attn_dist = Lambda(lambda x: K.expand_dims(x, axis=2))(attn_dist)
attn_dist = Lambda(lambda x: K.repeat_elements(x, rep=EMBED_SIZE, axis=2))(attn_dist)

# context vector
# Scores weight the encoder word EMBEDDINGS (enc_emb_output), not the BiLSTM outputs.
cntxt_vec = Lambda(lambda x: x[0] * x[1], name='ContextVect1')([attn_dist, enc_emb_output])
# Per-position projection to LATENT_DIM, then a sum over encoder positions: [batchSize, LATENT_DIM].
cntxt_vec = TimeDistributed(Dense(LATENT_DIM), name='ContextVect2')(cntxt_vec)
cntxt_vec = Lambda(lambda x: K.sum(x, axis=1), name='ContextVect3')(cntxt_vec)

# TODO: open question — how should the context vector be used?

#enc_dist = Dense(VOC_SIZE, activation='softmax', name='EncDist2')(enc_dist)

In [196]:
# pgen
# Scalar sigmoid gate per sample, computed from the decoder's initial state (encoder-derived h and c)
# concatenated with the decoder's final state.
pgen = concatenate([concatenate(dec_ini_state), dec_state], name='PGen1')
pgen = Dense(1, activation='sigmoid', name='PGen2')(pgen)
# [batchSize, 1] -> [batchSize, 1, 1] so the gate can broadcast over (timestep, vocab) axes.
pgen = Lambda(lambda x: K.expand_dims(x, axis=1))(pgen)
# Complement of the gate. NOTE(review): not used in the visible cells; Output_b recomputes 1 - pgen itself.
pgen_c =Lambda(lambda x: 1-x)(pgen)

# overall distribution
# NOTE(review): enc_dist is never assigned in the visible notebook (its only definition is a
# commented-out line in the attention cell), so Output_a raises NameError on a fresh kernel.
# NOTE(review): here pgen scales enc_dist and (1 - pgen) scales dec_dist — confirm this is the intended weighting.
overall_dist_a = Lambda(lambda x: x[0]*x[1], name='Output_a')([enc_dist, pgen])
overall_dist_b = Lambda(lambda x: x[0]*(1-x[1]), name='Output_b')([dec_dist, pgen])
#overall_dist = Add()([overall_dist_a, overall_dist_b])

In [213]:
# Debug model: expose the intermediate context vector so its shape can be inspected.
model = Model(inputs=[enc_input, dec_input], outputs=cntxt_vec)
# NOTE(review): x and y0 are only assigned in a later cell ([x, y0], y1 = xy(1, VOC_SIZE)),
# so this cell fails on a fresh top-to-bottom run unless that cell has been executed first.
res = model.predict(x=[x,y0])
#model.compile(loss="categorical_crossentropy", 
#              optimizer='rmsprop',
#             metrics=["accuracy"])
#model.fit(x=[x,y0], y = y1, epochs=1, verbose=1)
# Displayed as the cell result.
res.shape

(5, 100)

In [65]:
# NOTE(review): indexing res[0] / res[1] treats res as a pair of model outputs. The shapes recorded
# under this cell do not match the single (5, 100) context-vector output of the debug model, so this
# looks like a stale cell from an earlier two-output model — confirm or remove.
print(res[0].shape)
print(res[1].shape)

(5, 1511, 50001)
(5, 53, 50001)


In [None]:
# Intended final model: encoder and decoder inputs -> combined output distribution.
# NOTE(review): overall_dist is never assigned in the visible notebook (its only definition is
# commented out in the pgen cell), so this line raises NameError until that line is restored.
model = Model(inputs=[enc_input, dec_input], outputs=overall_dist)

# Grab Data

In [20]:
# Every directory entry under SAVE_BIN_DIR is counted as one batch file.
num_batches = len(os.listdir(SAVE_BIN_DIR))

def pad(x, max_len, p_type='pre'):
    """Force a token list to exactly ``max_len`` items.

    Input longer than ``max_len`` keeps only its first ``max_len`` items.
    Shorter input is filled with 0: in front when ``p_type`` is ``'pre'``,
    at the end for any other value of ``p_type``.
    """
    missing = max_len - len(x)
    if missing < 0:
        return x[:max_len]

    filler = [0] * missing
    return filler + x if p_type == 'pre' else x + filler

def tmp_cat(idx, VOCAB_SIZE):
    """Return a float vector of length ``VOCAB_SIZE`` that is 1 at ``idx`` and 0 everywhere else."""
    one_hot_vec = np.zeros(VOCAB_SIZE)
    one_hot_vec[idx] = 1
    return one_hot_vec
    
def to_categorical(yi, VOCAB_SIZE):
    """One-hot encode a sequence of token ids in a single vectorised step.

    Args:
        yi: 1-D sequence of integer token ids, each a valid index into ``VOCAB_SIZE``.
        VOCAB_SIZE: width of every one-hot row.

    Returns:
        float64 array of shape ``(len(yi), VOCAB_SIZE)``; row ``k`` is 1 at
        column ``yi[k]`` and 0 elsewhere. Empty input gives shape ``(0, VOCAB_SIZE)``.

    Raises:
        IndexError: if an id lies outside the vocabulary range.
    """
    # Explicit integer dtype so an empty list still yields a valid index array.
    ids = np.asarray(yi, dtype=np.intp)
    # One allocation for the whole matrix instead of one temporary array per token.
    one_hot = np.zeros((len(ids), VOCAB_SIZE))
    one_hot[np.arange(len(ids)), ids] = 1
    return one_hot
    
def gen_xy(num_batches, VOCAB_SIZE=None):
    """Endlessly yield ``([x, y0], y1)`` training batches read from SAVE_BIN_DIR.

    Batch files ``story_highlight_1.bin`` .. ``story_highlight_<num_batches>.bin``
    are visited in order and then cycled again from 1.

    Args:
        num_batches: number of batch files to cycle through.
        VOCAB_SIZE: width of the one-hot targets. Defaults to the notebook-level
            ``VOC_SIZE`` when omitted.

    Yields:
        ``([x, y0], y1)`` where ``x`` holds the stories left-padded with 0,
        ``y0`` holds each highlight without its last token (right-padded), and
        ``y1`` holds each highlight without its first token, right-padded and
        one-hot encoded.
    """
    if VOCAB_SIZE is None:
        # The original body read an undefined VOCAB_SIZE; fall back to the notebook global.
        VOCAB_SIZE = VOC_SIZE

    b = 0
    while True:
        # Cycle 1, 2, ..., num_batches, 1, ...
        b = b % num_batches + 1

        filename = os.path.join(SAVE_BIN_DIR, 'story_highlight_{}.bin'.format(b))
        # NOTE: pickle can execute arbitrary code; only load batch files you produced yourself.
        with open(filename, "rb") as batch_file:
            stories, highlights = pickle.load(batch_file)

        # Shift by one token: y0 is what the decoder is fed, y1 is what it must predict.
        highlights_teacher = [h[:-1] for h in highlights]
        highlights_answer = [h[1:] for h in highlights]

        # batch padding (to the longest sequence in this batch)
        b_max_len_x = max(len(s) for s in stories)
        x = [pad(s, b_max_len_x, p_type='pre') for s in stories]

        b_max_len_y0 = max(len(h) for h in highlights_teacher)
        y0 = [pad(h, b_max_len_y0, p_type='post') for h in highlights_teacher]

        # Same value as b_max_len_y0: both slices drop exactly one token.
        b_max_len_y1 = max(len(h) for h in highlights_answer)
        y1 = [pad(h, b_max_len_y1, p_type='post') for h in highlights_answer]

        print('padding done {}'.format(b))

        # numpy array
        x = np.array(x)
        y0 = np.array(y0)
        y1 = np.array([to_categorical(yi, VOCAB_SIZE) for yi in y1])

        print('np done {}'.format(b))

        # Must be inside the loop: outside it, the infinite loop never reaches the yield.
        yield ([x, y0], y1)

In [21]:
def xy(b_num, VOCAB_SIZE, n_samples=5):
    """Load one batch file and return its first ``n_samples`` examples as arrays.

    Args:
        b_num: batch number; selects ``story_highlight_<b_num>.bin`` in SAVE_BIN_DIR.
        VOCAB_SIZE: width of the one-hot targets.
        n_samples: how many leading examples to keep (default 5, the value that
            was previously hard-coded); ``None`` keeps the whole batch.

    Returns:
        ``([x, y0], y1)`` where ``x`` holds the stories left-padded with 0,
        ``y0`` holds each highlight without its last token (right-padded), and
        ``y1`` holds each highlight without its first token, right-padded and
        one-hot encoded.
    """
    filename = os.path.join(SAVE_BIN_DIR, 'story_highlight_{}.bin'.format(b_num))
    # NOTE: pickle can execute arbitrary code; only load batch files you produced yourself.
    with open(filename, "rb") as batch_file:
        stories, highlights = pickle.load(batch_file)

    stories = stories[:n_samples]
    highlights = highlights[:n_samples]

    # Shift by one token: y0 is what the decoder is fed, y1 is what it must predict.
    highlights_teacher = [h[:-1] for h in highlights]
    highlights_answer = [h[1:] for h in highlights]

    # batch padding (to the longest sequence among the kept examples)
    b_max_len_x = max(len(s) for s in stories)
    x = [pad(s, b_max_len_x, p_type='pre') for s in stories]

    b_max_len_y0 = max(len(h) for h in highlights_teacher)
    y0 = [pad(h, b_max_len_y0, p_type='post') for h in highlights_teacher]

    # Same value as b_max_len_y0: both slices drop exactly one token.
    b_max_len_y1 = max(len(h) for h in highlights_answer)
    y1 = [pad(h, b_max_len_y1, p_type='post') for h in highlights_answer]

    print('padding done')

    # numpy array
    x = np.array(x)
    y0 = np.array(y0)
    y1 = np.array([to_categorical(yi, VOCAB_SIZE) for yi in y1])

    print('np done')

    return ([x, y0], y1)

In [None]:
# Configure training: categorical cross-entropy loss, RMSprop optimizer, accuracy reported as a metric.
model.compile(loss="categorical_crossentropy", 
              optimizer='rmsprop',
              metrics=["accuracy"])

In [22]:
# Build a small in-memory batch from batch file 1 and compare its array shapes with the
# model's declared input/output shapes.
[x, y0], y1 = xy(1, VOC_SIZE)
print([x.shape, y0.shape])
print(y1.shape)
print(model.input_shape)
print(model.output_shape)

padding done
np done
[(5, 1511), (5, 53)]
(5, 53, 50001)
[(None, None), (None, None)]
(None, None, 50001)


In [None]:
# One-epoch smoke-test fit on the small in-memory batch returned by xy(...).
model.fit(x=[x,y0], y = y1, epochs=1, verbose=1)

In [None]:
# Train from the batch generator; steps_per_epoch=num_batches draws one generator item per batch file each epoch.
# NOTE(review): newer Keras releases deprecate fit_generator in favour of fit — confirm against the installed version.
model.fit_generator(
    gen_xy(num_batches),
    steps_per_epoch=num_batches,
    epochs=1,
    verbose=1)