In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import pickle
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import tensorflow as tf
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Concatenate, TimeDistributed
from keras.models import Model, load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

# import dataset

In [3]:
# dictionaries, pretrained embeddings
with open('code/data/glv_w2idx.pkl', 'rb') as f:
    w2idx = pickle.load(f)
with open('code/data/glv_embed_matrix.pkl', 'rb') as f:
    embedding = pickle.load(f)
    
# need to append BOS ('\t') and EOS ('\n') tokens to embeddings
# give (consistently) random initialization since they don't actually mean anything
# padding already exists as '' at the end of the embedding

# we want to mask_zero, so we need to:
# remove 0 key from dict and put its content at the end
# replace '' entry in embedding with the 0-key element

embedding[embedding.shape[0] - 1] = embedding[0]
assert w2idx['the'] == 0
w2idx['the'] = embedding.shape[0] - 1
del w2idx['']

pad = 0

w2idx['\t'] = embedding.shape[0]
np.random.seed(1)
embedding = np.append(embedding, np.random.rand(1, 300), axis=0)

w2idx['\n'] = embedding.shape[0]
np.random.seed(2)
embedding = np.append(embedding, np.random.rand(1, 300), axis=0)

In [4]:
idx2w = dict((i, word) for word, i in w2idx.items())

In [80]:
# import acrolinx blog post dataset - but with contractions only just to test

def check_df(string):
    # basic check for contractions
    item_list = ['have', 'are', 'is', 'not', 'will']
    for item in item_list:
        if item in string:
            return True
    return False

df = pd.read_pickle('code/data/acrolinx_blog/acrolinx_blog_annotated_df.pkl')
df = df[df['Original'].apply(lambda x: check_df(x)) == True].reset_index(drop=True)

In [81]:
# preprocessing
# change from text to indices

# NOTE: there is word lowering in this because the pretrained word vectors, GloVe, only include
# lowercase tokens

def seq_to_idx(string):
    # turns sequence of tokens to sequence of indices
    seq = word_tokenize(string)
    idx = []
    for word in seq:
        word = word.lower()
        if word in w2idx:
            idx.append(w2idx[word])
        #else: #unknown tokens?
    return idx

def make_span(start, end, seq_len):
    # takes start and end of span and returns 0/1 output for the given sequence length
    new = [0] * seq_len
    new[start : end] = [1] * (end - start)
    return new

def reduce_fragments(orig, repl):
    # removes repeated sections of original and replacement texts (e.g. minimizes length)
    # unless it would make one of the sections empty
    
    # add start-of-sequence ('\t') and end-of-sequence ('\n') markers to these
    # this, the replacement/target text, will be used in the decoder step of training only
    
    return orig, repl

def preprocess(df):
    x_token = []
    span = []
    dec_input = []
    dec_target = []
    y_repl = []
    x_orig = []
    
    for idx, row in tqdm(df.iterrows(), total = df.shape[0]):
        
        # Converting sentence strings to lists of indices.
        sent = seq_to_idx(row['Sentence'])
        orig = seq_to_idx(row['Original'])
        repl = seq_to_idx(row['Replacement'])
        
        if len(sent) == 0 or len(orig) == 0 or len(repl) == 0:
            x_token.append(np.nan)
            span.append(np.nan)
            y_repl.append(np.nan)
            dec_input.append(np.nan)
            dec_target.append(np.nan)
            x_orig.append(np.nan)
            print('Empty sentence or fragment: ' + row['Sentence'])
            continue
            
        x_token.append(sent)
        x_orig.append(orig)
        dec_input.append([w2idx['\t']] + repl)
        dec_target.append(repl + [w2idx['\n']])
        y_repl.append(repl)
                
        # take indices and find the 1st occurrence of the slice in the whole sentence
        starts = [i for i, x in enumerate(sent) if x == orig[0]]
        current_span = []
        y_s = np.nan
        y_e = np.nan
        for potential_start in starts:
            potential_slice = sent[potential_start : potential_start + len(orig)]
            if (potential_slice == np.array(orig)).all():
                y_s = potential_start
                y_e = potential_start + len(orig) + 1
                break
        if np.isnan(y_s) or np.isnan(y_e):
            print('Original not found in sentence.')
            print(row['Sentence'])
            print(row['Original'])
            span.append(np.nan)
        else:
            span.append(make_span(int(y_s), int(y_e - 1), len(sent)))
                
    df['x_token'] = x_token
    df['span'] = span
    df['dec_input'] = dec_input
    df['dec_target'] = dec_target
    df['y_repl'] = y_repl
    df['x_orig'] = x_orig
    
    return df

In [83]:
df = preprocess(df)
df = df.dropna()
#df = df.drop_duplicates(subset = ['Original'])

HBox(children=(IntProgress(value=0, max=3164), HTML(value='')))




In [84]:
df.sample(5) #NOTE: clean the longer ones later

Unnamed: 0,Sentence,Original,Replacement,x_token,span,dec_input,dec_target,y_repl,x_orig
211,Beware and be careful .,Beware and be careful .,Beware !,"[22056, 5, 30, 5604, 2]","[1, 1, 1, 1, 1]","[400001, 22056, 805]","[22056, 805, 400002]","[22056, 805]","[22056, 5, 30, 5604, 2]"
130,"Because it is your customers , and not you , w...",Because it is your,Since it 's your,"[113, 20, 14, 392, 1661, 1, 5, 36, 81, 1, 38, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[400001, 108, 20, 9, 392]","[108, 20, 9, 392, 400002]","[108, 20, 9, 392]","[113, 20, 14, 392]"
197,"If you can answer yes to all of them , you wil...",you will,you 'll,"[83, 81, 86, 2168, 2772, 4, 64, 3, 101, 1, 81,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[400001, 81, 769]","[81, 769, 400002]","[81, 769]","[81, 43]"
153,"If you do not want to get left behind , you ne...",If you do not want,If you do n't want,"[83, 81, 88, 36, 303, 4, 169, 218, 561, 1, 81,...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[400001, 83, 81, 88, 70, 303]","[83, 81, 88, 70, 303, 400002]","[83, 81, 88, 70, 303]","[83, 81, 88, 36, 303]"
44,They are all great resources that we have foun...,we have found,we 've found,"[39, 32, 64, 353, 1540, 12, 53, 33, 238, 191, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[400001, 53, 462, 238]","[53, 462, 238, 400002]","[53, 462, 238]","[53, 33, 238]"


In [86]:
# extract data to arrays from df, add POST-padding

X = pad_sequences(df['x_token'], maxlen = 20, value = pad, padding = 'post').astype('int64')
y_span = pad_sequences(df['span'], maxlen = 20, value = 0, padding = 'post').astype('int64')
X_orig = pad_sequences(df['x_orig'], value = pad, padding = 'post').astype('int64')
dec_input = pad_sequences(df['dec_input'], value = pad, padding = 'post').astype('int64')
dec_target = pad_sequences(df['dec_target'], value = pad, padding = 'post').astype('int64')

y_span_cat = np.zeros((y_span.shape[0], y_span.shape[1], 2))
for idx_1 in range(y_span.shape[0]):
    for idx_2 in range(y_span.shape[1]):
        y_span_cat[idx_1][idx_2] = to_categorical(y_span[idx_1][idx_2], num_classes = 2)

# set up target data from output sequence, 1 timestep off from y_repl
#y_repl_cat = np.array([to_categorical(x, num_classes = embedding.shape[0]) for x in y_repl]) 

input_len = X.shape[1]

orig_len = X_orig.shape[1]
assert dec_input.shape[1] == dec_target.shape[1]
repl_len = dec_input.shape[1]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y_span_cat, test_size=0.1, random_state=42)

# model: find spans

In [233]:
K.clear_session()

In [234]:
main_input = Input(shape = (input_len,), dtype = 'int64', name = 'main_input')

with tf.device('/cpu:0'):
    embedding_layer = Embedding(input_dim = embedding.shape[0],
                          output_dim = embedding.shape[1],
                          weights = [embedding],
                          trainable = False, 
                          mask_zero = True,
                          name = 'embedding_layer')
    input_embed = embedding_layer(main_input)

In [235]:
num_units = 128

bi_lstm = Bidirectional(LSTM(return_sequences = True, units = num_units), name='bi-lstm')(input_embed)
dropout_lstm = Dropout(rate = 0.25, name = 'dropout_lstm')(bi_lstm)
dense = TimeDistributed(Dense(num_units, activation = 'relu'), name = 'dense')(dropout_lstm)
dropout_dense = Dropout(rate = 0.25, name = 'dropout_dense')(dense)
# is timedistributed even needed anymore? dense can handle 3D input now?
output = TimeDistributed(Dense(2, activation = 'softmax'), name = 'output')(dropout_dense)

In [236]:
model = Model(inputs = main_input, outputs = output)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 20)                0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 20, 300)           120000900 
_________________________________________________________________
bi-lstm (Bidirectional)      (None, 20, 256)           439296    
_________________________________________________________________
dropout_lstm (Dropout)       (None, 20, 256)           0         
_________________________________________________________________
dense (TimeDistributed)      (None, 20, 128)           32896     
_________________________________________________________________
dropout_dense (Dropout)      (None, 20, 128)           0         
_________________________________________________________________
output (TimeDistributed)     (None, 20, 2)             258       
Total para

In [237]:
model.compile(optimizer = 'adam',
             loss = 'binary_crossentropy',
             metrics = ['binary_accuracy'])

history = model.fit(X_train, y_train, epochs = 5, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [238]:
def evaluate(X, y):
    true = np.argmax(y, axis = 2)
    pred = np.argmax(model.predict(X), axis = 2)
    total = float(y.shape[0])
    total_correct = 0
    indiv_wrong = 0
    for i in range(y.shape[0]):
        if (true[i] == pred[i]).all():
            total_correct += 1
        for j in range(y.shape[1]):
            if true[i][j] != pred[i][j]:
                indiv_wrong += 1
            
    print('Absolute accuracy (all correct):\t\t' + str(total_correct / total))
    
    print('Average number of incorrect labels per answer:\t' + str(indiv_wrong / total))
    
    return pred

In [239]:
results = evaluate(X_test, y_test)

Absolute accuracy (all correct):		0.6277602523659306
Average number of incorrect labels per answer:	0.7886435331230284


In [240]:
def predict(X):
    pad_X = pad_sequences([X], value = pad, padding = 'post', maxlen = input_len).astype('int64')
    pred = np.argmax(model.predict([pad_X], batch_size = 1), axis = 2)
    result = ''
    for i in range(len(X)):
        result += str(pred[0][i]) + '\t' + idx2w[X[i]] + '\n'
    print(result)

In [241]:
predict(seq_to_idx('Are you not going?'))
predict(seq_to_idx('If you do not go I will not either.'))

1	are
1	you
1	not
0	going
0	?

0	if
0	you
1	do
0	not
0	go
1	i
1	will
1	not
0	either
0	.



# step 2: turning binary output to slice

# step 3: nmt

In [87]:
K.clear_session()

In [88]:
orig_input = Input(shape = (orig_len,), dtype = 'int64', name = 'orig_input')
repl_input = Input(shape = (repl_len,), dtype = 'int64', name = 'repl_input')

with tf.device('/cpu:0'):
    # note for later: can use mask_zero parameter in embedding layer, but would need to go back and change some indices
    embedding_layer = Embedding(input_dim = embedding.shape[0],
                          output_dim = embedding.shape[1],
                          weights = [embedding],
                          trainable = False, 
                          mask_zero = True,
                          name = 'embedding_layer')
    orig_embed = embedding_layer(orig_input)
    repl_embed = embedding_layer(repl_input)

In [89]:
### feed encoder input (main_input), decoder input (repl_input) and sliced replacement text to enc-dec system

# these should change later to some sort of context-based or conditional model
# also with attention

# decoder given 2*units to accept bidirectional outputs
num_units = 128

encoder = Bidirectional(LSTM(return_state = True, units = num_units), name = "encoder")
decoder = LSTM(return_sequences = True, return_state = True, name = "decoder", units = 2 * num_units)

# sequence is unnecessary for the encoder - just states, to start the decoder correctly
# state and sequence for decoder will be necessary in inference, but not right now
enc_output, enc_h_forward, enc_c_forward, enc_h_backward, enc_c_backward = encoder(orig_embed)
enc_h = Concatenate()([enc_h_forward, enc_h_backward])
enc_c = Concatenate()([enc_c_forward, enc_c_backward])
dec_output, _, _ = decoder(repl_embed, initial_state = [enc_h, enc_c])

# Dropout?
# between enc-dec

dense = TimeDistributed(Dense(num_units, activation = 'relu'), name = 'dense_layer')
dec_tdd = TimeDistributed(Dense(embedding.shape[0], activation='softmax'), name = 'dense_output')

dec_dense = dense(dec_output)
repl_output = dec_tdd(dec_dense)

In [90]:
nmt_model = Model(inputs = [orig_input, repl_input], outputs = repl_output)

nmt_model.compile(optimizer = 'adam',
             loss = 'categorical_crossentropy',
             metrics = ['accuracy'])

nmt_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
repl_input (InputLayer)         (None, 11)           0                                            
__________________________________________________________________________________________________
orig_input (InputLayer)         (None, 9)            0                                            
__________________________________________________________________________________________________
embedding_layer (Embedding)     multiple             120000900   orig_input[0][0]                 
                                                                 repl_input[0][0]                 
__________________________________________________________________________________________________
encoder (Bidirectional)         [(None, 256), (None, 439296      embedding_layer[0][0]            
__________

In [91]:
def suggest_generator_train(data, batch_size):
    X_orig_whole = data[0]
    dec_input_whole = data[1]
    dec_target_whole = data[2]
    
    X_orig_whole, dec_input_whole, dec_target_whole = shuffle(X_orig_whole, 
                                                              dec_input_whole, 
                                                              dec_target_whole)
    
    i = 0
            
    while True:
        if i + batch_size > len(X_orig_whole):
            X_orig_batch = X_orig[i:]
            dec_input_batch = dec_input_whole[i:]
            dec_target_batch = dec_target_whole[i:]
            i = 0
        else:
            X_orig_batch = X_orig[i:i+batch_size]
            dec_input_batch = dec_input_whole[i:i+batch_size]
            dec_target_batch = dec_target_whole[i:i+batch_size]
            i += batch_size
        
        inputs = [X_orig_batch, dec_input_batch]
        targets = np.array([to_categorical(x, num_classes = embedding.shape[0]) for x in dec_target_batch])
        
        yield inputs, targets

In [92]:
batch_size = 16
nmt_history = nmt_model.fit_generator(suggest_generator_train([X_orig, dec_input, dec_target], batch_size),
                                  steps_per_epoch=len(X_orig) // batch_size,
                                  epochs = 5,
                                  verbose = 1)#,
                                  #validation_data = (x_val, y_val),
                                  #use_multiprocessing = True,
                                  #workers = 6)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [None]:
#model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
#model = load_model('my_model.h5')

## inference mode for nmt

In [93]:
# redefine encoder model: takes original input, outputs the states
enc_model = Model(inputs = orig_input, outputs = [enc_h, enc_c])

# define the states to input into the decoder (this is what you get from the encoder)
inf_dec_h_input = Input(shape=(num_units * 2,)) #enc_h
inf_dec_c_input = Input(shape=(num_units * 2,)) #enc_c
inf_dec_states_input = [inf_dec_h_input, inf_dec_c_input]

# these are the outputs you get when you run the decoder, set them up matching the original model
# repl_embed is more of a placeholder - of course you won't actually have the answer when you infer
inf_dec_main, inf_dec_h, inf_dec_c = decoder(repl_embed, initial_state = inf_dec_states_input)
inf_dec_states = [inf_dec_h, inf_dec_c]
inf_dec_dense = dense(inf_dec_main)
inf_dec_output = dec_tdd(inf_dec_dense)

# define decoder model
dec_model = Model([repl_input] + inf_dec_states_input, [inf_dec_output] + inf_dec_states)

In [94]:
def decode(input_seq):
    # takes input sequence in form of seq of token indices
    states = enc_model.predict(input_seq)
    
    # begin output sequence, use start character
    target_seq = np.zeros((1, repl_len))
    target_seq[0, 0] = w2idx['\t']
    
    # using batch_size = 1, sample in a loop
    stop = False
    decoded = []
    while not stop:
        output_tok, h, c = dec_model.predict([target_seq] + states)
        states = [h, c] # update states
        
        # sample a token
        sample_idx = np.argmax(output_tok[0, -1, :]) # takes the last one in output
        sample_tok = idx2w[sample_idx]
        
        # exit if maxlen is reached or stop character is found
        if (sample_tok == '\n' or len(decoded) > repl_len):
            stop = True
        else:
            # update target_seq
            decoded.append(sample_tok)
            target_seq = np.zeros((1, repl_len))
            target_seq[0, 0] = sample_idx
    
    return decoded

In [95]:
test = 'is not'

decode(pad_sequences([seq_to_idx(test)], value = pad, padding = 'post', maxlen = orig_len).astype('int64'))

['you']

In [98]:
for idx in np.random.choice(len(X_orig), 10):
    sent = ''
    for x in X_orig[idx]:
        if x != 0:
            sent += idx2w[x] + ' '
    print(sent)
    print(decode(np.array([X_orig[idx]])))

that is easy 
['you']
would you not 
['you']
can not always be easy 
['you']
we have shared 
['you']
discussing 
['you']
. it is 
['you']
your audience will not do anything 
['you']
they have 
['you']
they are unfortunately 
['you']
they are 
['you']
