In [541]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras.backend as K

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from flair.embeddings import WordEmbeddings


In [631]:
# load 
data_path = '../data/context_task_trim.tsv'
data = pd.read_csv(data_path, sep='\t')

# Add start and end tokens
BOS, EOS, OOV, PAD = '_start_', '_end_', '_unk_', '_pad_'
data.Summary = BOS + ' ' + data.Summary + ' ' + EOS

In [497]:
# test/train split
x_train, x_test, y_train, y_test = train_test_split(
    data.TaskSentence.values, 
    data.Summary.values,
    test_size=0.2,
    random_state=0,
    shuffle=True
)

In [618]:
# fit tokenizer
MAXLEN = 20

tokenizer = Tokenizer(filters='', lower=True, oov_token = OOV)
tokenizer.fit_on_texts(list(x_train)+list(y_train))

x = tokenizer.texts_to_sequences(x_train)
y = tokenizer.texts_to_sequences(y_train)

x_pad = pad_sequences(x, maxlen=MAXLEN, padding='post')
y_pad = pad_sequences(y, maxlen=MAXLEN, padding='post')

x_val = pad_sequences(
    tokenizer.texts_to_sequences(x_test), 
    maxlen=MAXLEN, 
    padding='post')

y_val = pad_sequences(
    tokenizer.texts_to_sequences(y_test), 
    maxlen=MAXLEN, 
    padding='post')

V = len(tokenizer.word_index) + 1

In [619]:
# word embeddings
glove_embedding = WordEmbeddings('glove')
EMBEDDING_DIM = glove_embedding.embedding_length


In [620]:
embedding_matrix = np.zeros((V, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    try:
        word_sent = Sentence(word)
        glove_embedding.embed(word_sent)
        embedding_vector = word_sent[0].embedding.cpu().detach().numpy()
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25), EMBEDDING_DIM)

In [621]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention, Concatenate, TimeDistributed
from attention import AttentionLayer

In [622]:
from keras import backend as K 
K.clear_session() 
latent_dim = 100

# make embedding 
# embedding = Embedding(V, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)

# Encoder 
encoder_inputs = Input(shape=(MAXLEN,)) 
# enc_emb = Embedding(V, latent_dim, trainable=True)(encoder_inputs) 
enc_emb = embedding(encoder_inputs)

#LSTM 1 
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb) 

#LSTM 2 
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1) 

#LSTM 3 
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2) 

# Set up the decoder. 
decoder_inputs = Input(shape=(None,)) 
# dec_emb = Embedding(V, latent_dim, trainable=True)(decoder_inputs) 
dec_emb = embedding(decoder_inputs)

#LSTM using encoder_states as initial state
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True) 
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c]) 

#Attention Layer
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs]) 

# Concat attention output and decoder LSTM output 
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

#Dense layer
decoder_dense = TimeDistributed(Dense(V, activation='softmax')) 
# decoder_dense = Dense(V, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input) 

# Define the model
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs) 
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             180800      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 20, 100), (N 80400       embedding[7][0]              

In [623]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


In [624]:
model.fit(x=[x_pad, y_pad],
          y=np.expand_dims(y_pad, axis=2), 
          batch_size = 128, 
          epochs=20)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1cb3b9bd0>

In [625]:
# encoder inference
encoder_model = tf.keras.Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# decoder inference
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(MAXLEN,latent_dim))

# Get the embeddings of the decoder sequence
# dec_emb2 = dec_emb_layer(decoder_inputs)
dec_emb2 = embedding(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

#attention inference
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Final decoder model
decoder_model = tf.keras.Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

In [626]:
# def decode_sequence(input_seq):
#     # Encode the input as state vectors.
#     e_out, e_h, e_c = encoder_model.predict(input_seq)

#     # Generate empty target sequence of length 1.
#     target_seq = np.zeros((1,1))

#     # Chose the 'start' word as the first word of the target sequence
#     target_seq[0, 0] = tokenizer.word_index[BOS]

#     stop_condition = False
#     decoded_sentence = ''
#     while not stop_condition:
#         output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

#         # Sample a token
#         sampled_token_index = np.argmax(output_tokens[0, -1, :])
#         sampled_token = tokenizer.index_word[sampled_token_index]
        
#         if(sampled_token is not EOS):
#             decoded_sentence += ' '+sampled_token

#             # Exit condition: either hit max length or find stop word.
#             if (sampled_token == 'end' or len(decoded_sentence.split()) >= (MAXLEN-1)):
#                 stop_condition = True

#         # Update the target sequence (of length 1).
#         target_seq = np.zeros((1,1))
#         target_seq[0, 0] = sampled_token_index

#         # Update internal states
#         e_h, e_c = h, c

#     return decoded_sentence

In [627]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Chose the 'start' word as the first word of the target sequence
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = tokenizer.word_index[BOS]

    sent = ''
    while True:
        
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Sample a token
        token_index = np.argmax(output_tokens[0, -1, :])
        token = tokenizer.index_word[token_index]
        
        if token == EOS or len(sent.split()) >= MAXLEN-1:
            break
        
        # append 
        sent += ' '+token

        # Update the target sequence (of length 1).
        target_seq[0, 0] = token_index

        # Update internal states
        e_h, e_c = h, c

    return sent

In [628]:
def seq2summary(input_seq):
    newString=''
    for i in input_seq:
        if((i!=0 and i!=tokenizer.word_index[BOS]) and i!=tokenizer.word_index[EOS]):
            newString = newString + tokenizer.index_word[i]+' '
    return newString

def seq2text(input_seq):
    newString=''
    for i in input_seq:
        if(i!=0):
            newString = newString + tokenizer.index_word[i]+' '
    return newString

In [632]:
tokenizer.index_word[0] = PAD

In [652]:
seq2text(x_pad[300])

'please destroy your current card and replace it with the new wallet card upon receipt. '

In [633]:
decode_sequence(np.expand_dims(x_pad[300], axis=0))

' _start_ _start_ _start_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_ _pad_'

In [643]:
a =  embedding_matrix[tokenizer.word_index['send']]
b =  embedding_matrix[tokenizer.word_index['give']]
np.dot(a,b) / (np.linalg.norm(a)*np.linalg.norm(b))

0.7272412382771437

In [399]:
# tokenizer.index_word

In [328]:
for i in range(len(x_pad)):
    print("TaskSentence: ", seq2text(x_pad[i]))
    print("Original summary:" , seq2summary(y_pad[i]))
    print("Predicted summary:", decode_sequence(x_val[i].reshape(1,max_len_text)))
    print("\n")

TaskSentence:  Please respond to Chris Germany@enron.com 
Original summary: Reply to Chris at Germany@enron.com 


TaskSentence:  Please take a look at this and give me a call. 
Original summary: Examine this and call SENDER 


TaskSentence:  Please take a few minutes to completely fill out the following survey. 
Original summary: Complete survey 


TaskSentence:  Please review and send comments to Mollie Lampi mlampi@nyiso.com by 10:00 AM Friday morning 12/1 . 
Original summary: Review filing with the FERC and send coments to Mollie Lampi 


TaskSentence:  Please make review and if you have any modifications, please get with John as soon as possible. 
Original summary: Review Alberta PPA presentation 


TaskSentence:  And lastly, can you put me on cc: list for Wellesley info. 
Original summary: Put SENDER on cc list for Wellesley 


TaskSentence:  To view the Group members, please open the ECT Address book, click on Groups, then double-click on the name of the group. 
Original summary

Original summary: Review latest revision for Fountain Valley Project 


TaskSentence:  Please bring your adapter. 
Original summary: Bring adapter 


TaskSentence:  Please provide feedback on the employees listed below by accessing the Performance Management System PEP and completing an online feedback form as described in the Performance Management Quick Reference Guide . 
Original summary: Provide feedback 


TaskSentence:  Please acknowledge receipt. 
Original summary: Acknowledge receipt 


TaskSentence:  Gerald, could you please verify the entities & other missing information highlighted in the document. 
Original summary: Verify entities in document 


TaskSentence:  Please fill in where it is pertinent and monitor word count to 550. 
Original summary: Edit draft of Executive Letter and monitor word count 


TaskSentence:  Please make sure to write El Salvador on your check. 
Original summary: Write El Salvador on check 


TaskSentence:  Please contact him and arrange his travel.