In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import pickle
from nltk import word_tokenize
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Concatenate, TimeDistributed
from keras.models import Model, load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Sanity check: list the GPU devices visible to the Keras TensorFlow backend.
# NOTE(review): `_get_available_gpus` is a private backend helper (leading underscore) —
# confirm it exists in the installed Keras version before relying on it.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

# import dataset

In [2]:
# Load the word -> index dictionary and the matching pretrained embedding matrix.
# NOTE(review): pickle.load executes arbitrary code from the file — only use trusted pickles.
with open('data/glv_w2idx.pkl', 'rb') as w2idx_file:
    w2idx = pickle.load(w2idx_file)
with open('data/glv_embed_matrix.pkl', 'rb') as embed_file:
    embedding = pickle.load(embed_file)

# The padding token '' is already the last entry of the loaded vocabulary,
# so its index must be recorded BEFORE any new tokens are appended.
pad = len(w2idx) - 1

# BOS ('\t') and EOS ('\n') carry no pretrained meaning, so each gets one extra
# embedding row with a reproducible (seeded) random initialization.

# start-of-sequence marker
w2idx['\t'] = embedding.shape[0]
np.random.seed(1)
embedding = np.concatenate((embedding, np.random.rand(1, 300)), axis=0)

# end-of-sequence marker
w2idx['\n'] = embedding.shape[0]
np.random.seed(2)
embedding = np.concatenate((embedding, np.random.rand(1, 300)), axis=0)

In [3]:
# reverse lookup: index -> token
idx2w = {index: token for token, index in w2idx.items()}

In [4]:
# import acrolinx blog post dataset - but with contractions only just to test

def check_df(string):
    """Return True when `string` contains any contraction marker word.

    This is a plain substring test (so 'This' matches because it contains 'is');
    it is only meant as a coarse pre-filter for contraction examples.
    """
    markers = ('have', 'are', 'is', 'not', 'will')
    return any(marker in string for marker in markers)

# Read the annotated blog-post frame, then keep only the rows whose 'Original'
# fragment passes the contraction pre-filter (index renumbered from 0).
df = pd.read_pickle('data/acrolinx_blog/acrolinx_blog_annotated_df.pkl')
df = df.loc[df['Original'].map(check_df) == True].reset_index(drop=True)

In [5]:
# preprocessing
# change from text to indices

# NOTE: there is word lowering in this because the pretrained word vectors, GloVe, only include
# lowercase tokens

def seq_to_idx(string):
    """Convert a text string into a list of vocabulary indices.

    The string is tokenized with `word_tokenize`, every token is lowercased,
    and each token is looked up in `w2idx`. Tokens that are not in `w2idx`
    are silently dropped (there is no unknown-token index yet).
    """
    lowered = (token.lower() for token in word_tokenize(string))
    # TODO: decide how unknown tokens should be represented instead of skipping them
    return [w2idx[token] for token in lowered if token in w2idx]

def make_span(start, end, seq_len):
    """Build a 0/1 mask of length `seq_len` with ones on positions [start, end).

    Args:
        start: first position of the span (inclusive).
        end: position one past the last span element (exclusive).
        seq_len: length of the returned list.

    Returns:
        A list of exactly `seq_len` ints. Bounds are clamped to [0, seq_len], so an
        out-of-range span can no longer lengthen the result (the previous slice
        assignment grew the list when `end > seq_len` or `start < 0`). An empty or
        inverted span yields all zeros.
    """
    start = min(max(start, 0), seq_len)
    end = min(max(end, start), seq_len)
    return [1 if start <= position < end else 0 for position in range(seq_len)]

def reduce_fragments(orig, repl):
    """Placeholder for trimming shared material from the original/replacement pair.

    Planned behavior (not implemented yet):
      * remove sections repeated in both texts to minimize their length, unless
        that would leave one of them empty;
      * add start-of-sequence ('\\t') and end-of-sequence ('\\n') markers to the
        replacement/target text, which is used only in the decoder step of training.

    Currently both arguments are returned unchanged as a tuple.
    """
    return (orig, repl)

def _find_sublist(haystack, needle):
    """Return the first index where list `needle` occurs contiguously in `haystack`, else None."""
    width = len(needle)
    for start in range(len(haystack) - width + 1):
        # plain list comparison: never matches a truncated tail slice
        if haystack[start : start + width] == needle:
            return start
    return None


def preprocess(df):
    """Add index-encoded model inputs/targets to `df` and return it.

    For every row the 'Sentence', 'Original' and 'Replacement' strings are converted
    with `seq_to_idx`. Four columns are written onto `df` (in place):
      * 'x_token': sentence indices
      * 'x_orig' : original-fragment indices
      * 'y_repl' : replacement indices wrapped in BOS ('\\t') / EOS ('\\n') indices
      * 'span'   : 0/1 mask marking the first occurrence of the fragment in the sentence

    Rows where any of the three sequences is empty get NaN in all four columns; rows
    where the fragment cannot be located get NaN in 'span' only. Both cases are
    reported with print so the caller can drop them afterwards (e.g. `dropna`).

    The previous implementation compared a list slice against `np.array(orig)`; for a
    slice cut short by the end of the sentence that comparison either broadcast (a
    one-token tail could falsely match a fragment of repeated tokens) or failed,
    depending on the NumPy version. Matching is now done on plain lists.
    """
    x_token = []
    span = []
    y_repl = []
    x_orig = []

    for idx, row in tqdm(df.iterrows(), total = df.shape[0]):

        # Converting sentence strings to lists of indices.
        sent = seq_to_idx(row['Sentence'])
        orig = seq_to_idx(row['Original'])
        repl = seq_to_idx(row['Replacement'])

        if len(sent) == 0 or len(orig) == 0 or len(repl) == 0:
            x_token.append(np.nan)
            span.append(np.nan)
            y_repl.append(np.nan)
            x_orig.append(np.nan)
            print('Empty sentence or fragment: ' + row['Sentence'])
            continue

        x_token.append(sent)
        x_orig.append(orig)
        y_repl.append([w2idx['\t']] + repl + [w2idx['\n']])

        # locate the 1st occurrence of the fragment inside the whole sentence
        start = _find_sublist(sent, orig)
        if start is None:
            print('Original not found in sentence.')
            print(row['Sentence'])
            print(row['Original'])
            span.append(np.nan)
        else:
            span.append(make_span(start, start + len(orig), len(sent)))

    df['x_token'] = x_token
    df['span'] = span
    df['y_repl'] = y_repl
    df['x_orig'] = x_orig

    return df

In [6]:
# encode every row, then discard the rows preprocess marked with NaN
df = preprocess(df).dropna()

HBox(children=(IntProgress(value=0, max=3169), HTML(value='')))

Original not found in sentence.
Now you have seen all of the problem words that we have collectedd .
that we have collected
Original not found in sentence.
It does not mean… We 're rigid or uptight .
It does not mean
Original not found in sentence.
It is a great event and one that you should definitely check out if you have not't before ( by the way , you can still register for it by clicking here ) .
have not
Original not found in sentence.
Thmay be conference is all about being smarter with your content — whether you 're a marketer or in tech docs — and following the lead of pioneering companies such as Google , IBM , and Cisco Systems .
This



In [7]:
# Eyeball five random preprocessed rows (a different sample on every run: no seed is passed).
df.sample(5) #NOTE: clean the longer ones later

Unnamed: 0,Sentence,Original,Replacement,x_token,span,y_repl,x_orig
752,"If you do not embrace most of them , you will ...",do not,do n't,"[83, 81, 88, 36, 7444, 96, 3, 101, 1, 81, 43, ...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[400001, 88, 70, 400002]","[88, 36]"
2916,"It is fine to be creative , but do not dial up...",do not,do n't,"[20, 14, 1695, 4, 30, 4069, 1, 34, 88, 36, 125...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[400001, 88, 70, 400002]","[88, 36]"
2795,"Not all music will do this , but scientists ha...",you are,you 're,"[36, 64, 403, 43, 88, 37, 1, 34, 2154, 33, 238...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[400001, 81, 267, 400002]","[81, 32]"
343,You have always got to create your content wit...,You have,You 've,"[81, 33, 690, 405, 4, 1210, 392, 2768, 17, 7, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[400001, 81, 462, 400002]","[81, 33]"
282,"With so many options , it is easy to get distr...",it is,it 's,"[17, 100, 109, 2780, 1, 20, 14, 1673, 4, 169, ...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[400001, 20, 9, 400002]","[20, 14]"


In [8]:
# extract data to arrays from df, add POST-padding
# NOTE: pad_sequences truncates from the FRONT by default (truncating='pre'), so sentences
# longer than maxlen lose their first tokens; X and y_span are cut identically and stay aligned.

X = pad_sequences(df['x_token'], maxlen = 20, value = pad, padding = 'post').astype('int64')
y_span = pad_sequences(df['span'], maxlen = 20, value = 0, padding = 'post').astype('int64')
X_orig = pad_sequences(df['x_orig'], value = pad, padding = 'post').astype('int64')
y_repl = pad_sequences(df['y_repl'], value = pad, padding = 'post').astype('int64')

# one-hot encode the 0/1 span labels: (samples, steps) -> (samples, steps, 2)
y_span_cat = np.eye(2)[y_span]

# set up target data from output sequence, 1 timestep off from y_repl:
# the decoder is fed y_repl[t] and must predict y_repl[t + 1]; the freed last slot is padding.
# (Previously the target was built from the unshifted y_repl, so the decoder only learned
# to copy its own input.)
# NOTE(review): a dense one-hot over the whole vocabulary is very large in memory —
# consider integer targets with a sparse categorical loss.
y_repl_target = np.full_like(y_repl, pad)
y_repl_target[:, :-1] = y_repl[:, 1:]
y_repl_cat = np.array([to_categorical(x, num_classes = embedding.shape[0]) for x in y_repl_target])

input_len = X.shape[1]
orig_len = X_orig.shape[1]
repl_len = y_repl.shape[1]

In [None]:
# hold out 10% of the sentences for evaluating the span tagger (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_span_cat,
    test_size = 0.1,
    random_state = 42,
)

# model: find spans

In [10]:
# Reset the Keras backend session before building the span-tagging model.
K.clear_session()

In [11]:
# token-index input of the span tagger: one padded sentence of input_len indices per sample
main_input = Input((input_len,), name = 'main_input', dtype = 'int64')

# the embedding lookup is placed on the CPU
with tf.device('/cpu:0'):
    # note for later: can use mask_zero parameter in embedding layer, but would need to go back and change some indices
    # pretrained vectors are loaded as the initial weights and kept frozen (trainable = False)
    embedding_layer = Embedding(embedding.shape[0],
                                embedding.shape[1],
                                name = 'embedding_layer',
                                trainable = False,
                                weights = [embedding])
    input_embed = embedding_layer(main_input)

Instructions for updating:
Colocations handled automatically by placer.


In [12]:
# hidden size of each LSTM direction
num_units = 128

# per-token tagger: BiLSTM over the embedded sentence -> dropout -> 2-way softmax per timestep
bi_lstm = Bidirectional(
    LSTM(num_units, return_sequences = True),
    name = 'bi-lstm',
)(input_embed)
dropout = Dropout(0.25)(bi_lstm)
output = Dense(2, activation = 'softmax')(dropout)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [13]:
# wrap the tagger graph into a model and print its layer/parameter overview
model = Model(outputs = output, inputs = main_input)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 20)                0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 20, 300)           120000900 
_________________________________________________________________
bi-lstm (Bidirectional)      (None, 20, 256)           439296    
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 256)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 20, 2)             514       
Total params: 120,440,710
Trainable params: 439,810
Non-trainable params: 120,000,900
_________________________________________________________________


In [14]:
# train the span tagger on the one-hot in-span / out-of-span labels
model.compile(
    loss = 'binary_crossentropy',
    metrics = ['binary_accuracy'],
    optimizer = 'adam',
)

history = model.fit(x = X_train, y = y_train, batch_size = 32, epochs = 5)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
def evaluate(X, y):
    """Score the global `model` on (X, y) and return its predicted label matrix.

    `y` holds one-hot labels and is reduced with argmax over axis 2, as is the model
    output. Two figures are printed:
      * the fraction of samples whose whole label sequence is predicted correctly;
      * the average number of wrongly labelled positions per sample.

    Returns the argmax predictions for X.
    """
    gold = np.argmax(y, axis = 2)
    pred = np.argmax(model.predict(X), axis = 2)
    n_samples = float(y.shape[0])

    mismatches = gold != pred
    # a sample counts as correct only if none of its positions differ
    exact_matches = int(np.sum(~mismatches.any(axis = 1)))
    wrong_labels = int(np.sum(mismatches))

    print('Absolute accuracy (all correct):\t\t' + str(exact_matches / n_samples))

    print('Average number of incorrect labels per answer:\t' + str(wrong_labels / n_samples))

    return pred

In [16]:
# Score the span tagger on the held-out split; `results` keeps the predicted label matrix.
results = evaluate(X_test, y_test)

Absolute accuracy (all correct):		0.61198738170347
Average number of incorrect labels per answer:	0.8201892744479495


In [17]:
# Display the predicted label matrix returned by evaluate.
results

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
def predict(X):
    """Print the span tagger's 0/1 label next to each token of one sentence.

    Args:
        X: sequence of token indices for a single sentence (e.g. from `seq_to_idx`).

    The sentence is post-padded to `input_len` and run through the global `model`.
    When X is longer than `input_len`, `pad_sequences` drops tokens from the FRONT
    (its default truncating='pre'), so labels are paired with the last `input_len`
    tokens only. Previously the loop indexed the prediction with positions of the
    untruncated input, which misaligned the labels and raised IndexError for long
    inputs.
    """
    # the tokens that actually reach the model after truncation
    kept = list(X)[-input_len:]
    pad_X = pad_sequences([X], value = pad, padding = 'post', maxlen = input_len).astype('int64')
    pred = np.argmax(model.predict([pad_X], batch_size = 1), axis = 2)
    # zip stops at the real tokens, so padded positions are not printed
    result = ''.join(str(label) + '\t' + idx2w[token] + '\n' for label, token in zip(pred[0], kept))
    print(result)

In [20]:
# Smoke test: tag one hand-written sentence and print a label per token.
predict(seq_to_idx('Do you have a moment?'))

1	do
1	you
1	have
0	a
0	moment
0	?



# step 2: turning binary output to slice

# step 3: nmt

In [None]:
# Reset the Keras backend session before building the encoder-decoder model.
K.clear_session()

In [9]:
# encoder input: padded original fragment; decoder input: padded BOS/EOS-wrapped replacement
orig_input = Input((orig_len,), name = 'orig_input', dtype = 'int64')
repl_input = Input((repl_len,), name = 'repl_input', dtype = 'int64')

# the embedding lookup is placed on the CPU
with tf.device('/cpu:0'):
    # note for later: can use mask_zero parameter in embedding layer, but would need to go back and change some indices
    # one frozen pretrained embedding layer is shared by both inputs
    embedding_layer = Embedding(embedding.shape[0],
                                embedding.shape[1],
                                name = 'embedding_layer',
                                trainable = False,
                                weights = [embedding])
    orig_embed = embedding_layer(orig_input)
    repl_embed = embedding_layer(repl_input)

Instructions for updating:
Colocations handled automatically by placer.


In [10]:
### feed encoder input (orig_input), decoder input (repl_input) and sliced replacement text to enc-dec system

# these should change later to some sort of context-based or conditional model
# also with attention

# decoder given 2*units to accept bidirectional outputs
# (the forward and backward encoder states are concatenated below, doubling their size)
num_units = 128

# The layer objects are kept in variables because the inference models reuse them later.
encoder = Bidirectional(LSTM(return_state = True, units = num_units), name = "encoder")
decoder = LSTM(return_sequences = True, return_state = True, name = "decoder", units = 2 * num_units)

# sequence is unnecessary for the encoder - just states, to start the decoder correctly
# state and sequence for decoder will be necessary in inference, but not right now
enc_output, enc_h_forward, enc_c_forward, enc_h_backward, enc_c_backward = encoder(orig_embed)
# merge both directions into a single hidden state and a single cell state
enc_h = Concatenate()([enc_h_forward, enc_h_backward])
enc_c = Concatenate()([enc_c_forward, enc_c_backward])
# training-time decoder: consumes the embedded replacement sequence, starting from the encoder states
dec_output, _, _ = decoder(repl_embed, initial_state = [enc_h, enc_c])

# Dropout?

# per-timestep softmax over every row of the embedding matrix (i.e. the whole vocabulary)
dec_tdd = TimeDistributed(Dense(embedding.shape[0], activation='softmax'), name = 'dense_output')
repl_output = dec_tdd(dec_output)

In [11]:
# training model: (original fragment, decoder input) -> per-timestep vocabulary distribution
nmt_model = Model(outputs = repl_output, inputs = [orig_input, repl_input])

nmt_model.compile(
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
    optimizer = 'adam',
)

nmt_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
repl_input (InputLayer)         (None, 12)           0                                            
__________________________________________________________________________________________________
orig_input (InputLayer)         (None, 9)            0                                            
__________________________________________________________________________________________________
embedding_layer (Embedding)     multiple             120000900   orig_input[0][0]                 
                                                                 repl_input[0][0]                 
__________________________________________________________________________________________________
encoder (Bidirectional)         [(None, 256), (None, 439296      embedding_layer[0][0]            
__________

In [12]:
# Train the encoder-decoder: X_orig feeds the encoder, y_repl feeds the decoder,
# and y_repl_cat is the one-hot target the decoder output is compared against.
nmt_history = nmt_model.fit([X_orig, y_repl], y_repl_cat, epochs = 5, batch_size = 16)

Instructions for updating:
Use tf.cast instead.
Epoch 1/5

KeyboardInterrupt: 

In [None]:
#model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
#model = load_model('my_model.h5')

## inference mode for nmt

In [None]:
# redefine encoder model: takes original input, outputs the states
# (the concatenated forward/backward hidden and cell states of the trained encoder)
enc_model = Model(inputs = [orig_input], outputs = [enc_h, enc_c])

# define the states to input into the decoder (this is what you get from the encoder)
# each state has num_units * 2 features, matching the decoder's unit count
inf_dec_h_input = Input(shape=(num_units * 2,)) #enc_h
inf_dec_c_input = Input(shape=(num_units * 2,)) #enc_c
inf_dec_states_input = [inf_dec_h_input, inf_dec_c_input]

# these are the outputs you get when you run the decoder, set them up matching the original model
# repl_embed is more of a placeholder - of course you won't actually have the answer when you infer
# NOTE: because the decoder is applied to repl_embed, dec_model takes a full repl_input-shaped
# sequence, and the returned h/c are the states after the last timestep of that sequence.
inf_dec_main, inf_dec_h, inf_dec_c = decoder(repl_embed, initial_state = inf_dec_states_input)
inf_dec_states = [inf_dec_h, inf_dec_c]
# reuse the trained output projection
inf_dec_output = dec_tdd(inf_dec_main)

# define decoder model
# inputs: [token sequence, h, c]; outputs: [per-timestep distributions, h, c]
dec_model = Model([repl_input] + inf_dec_states_input, [inf_dec_output] + inf_dec_states)

In [None]:
# Inspect the first padded decoder sequence.
y_repl[0]

In [None]:
def decode(input_seq, dec_maxlen = 5):
    """Greedily decode a replacement for one encoded original fragment.

    Args:
        input_seq: array of token indices shaped like the encoder input, batch size 1.
        dec_maxlen: decoding stops once more than this many tokens were produced
            (default 5, the value that used to be hard-coded).

    Returns:
        List of decoded token strings; it ends with '\\n' when the model emitted
        the end-of-sequence token before the length limit was hit.

    `dec_model` only accepts sequences of length `repl_len`, so each step re-feeds
    the prefix decoded so far (the rest is padding) from the ORIGINAL encoder states
    and reads the distribution at the current position. Since the decoder LSTM runs
    left to right, that position depends only on the real prefix. The previous
    version put the last token at position 0, read the LAST timestep (after
    repl_len - 1 index-0 inputs) and carried over the states reached after those
    filler steps.
    """
    # takes input sequence in form of seq of token indices
    enc_states = list(enc_model.predict(input_seq))

    # begin output sequence, use start character
    target_seq = np.full((1, repl_len), pad, dtype = 'int64')
    target_seq[0, 0] = w2idx['\t']

    # at most dec_maxlen + 1 tokens (as before), and never more than the decoder can hold
    max_steps = min(dec_maxlen + 1, repl_len)

    decoded = []
    for step in range(max_steps):
        output_tok, _, _ = dec_model.predict([target_seq] + enc_states)

        # greedy choice at the current position
        sample_idx = int(np.argmax(output_tok[0, step, :]))
        sample_tok = idx2w[sample_idx]
        decoded.append(sample_tok)

        # exit if the stop character is found
        if sample_tok == '\n':
            break

        # extend the prefix with the sampled token for the next step
        if step + 1 < repl_len:
            target_seq[0, step + 1] = sample_idx

    return decoded

In [None]:
# smoke test: encode one hand-written fragment, pad it to the encoder length, and decode it
test = 'why'

decode(
    pad_sequences(
        [seq_to_idx(test)],
        maxlen = orig_len,
        padding = 'post',
        value = pad,
    ).astype('int64')
)

In [None]:
# print the decoded replacement for the first 100 original fragments
for idx in range(100):
    tokens = decode(np.array([X_orig[idx]]))
    print(' '.join(tokens))