In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import pickle
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import tensorflow as tf
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Concatenate, TimeDistributed
from keras.models import Model, load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# import dataset

In [5]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Vocabulary artefacts: the embedding weights and the two index <-> word lookups.
embedding = _load_pickle('data/lexical_repl/embedding.pkl')
idx2w = _load_pickle('data/lexical_repl/idx2w.pkl')
w2idx = _load_pickle('data/lexical_repl/w2idx.pkl')

# Sanity checks on the vocabulary before anything is built on top of it.
assert 0 not in idx2w       # index 0 must stay free: it is the padding value used below
assert '\t' in w2idx        # SOS
assert '\n' in w2idx        # EOS
assert '[UNK]' in w2idx

In [3]:
# Sentence DataFrame; the data-preparation cell below reads its `idx` and `binary` columns.
df = pd.read_pickle('data/lexical_repl/sents_marked_df.zip')

In [23]:
def seq_to_idx(string, unk_token = None):
    """Turn raw text into a list of vocabulary indices.

    The text is tokenized with `word_tokenize`; every token is lower-cased and
    looked up in the module-level `w2idx` mapping.

    Args:
        string: the text to convert.
        unk_token: optional key of `w2idx` (for example '[UNK]') whose index is
            substituted for tokens that are not in the vocabulary. With the
            default `None`, such tokens are dropped, so the result may be
            shorter than the token sequence.

    Returns:
        A list with one index per kept token, in token order.

    Raises:
        KeyError: if `unk_token` is given but is not a key of `w2idx`.
    """
    # Resolve the fallback once, so a bad `unk_token` fails before tokenizing.
    unk_idx = None if unk_token is None else w2idx[unk_token]

    idx = []
    for word in word_tokenize(string):
        word = word.lower()
        if word in w2idx:
            idx.append(w2idx[word])
        elif unk_idx is not None:
            idx.append(unk_idx)
    return idx

In [4]:
# Extract data to arrays from df, adding POST-padding with 0 up to 50 positions.
X = pad_sequences(df.idx, maxlen = 50, value = 0, padding = 'post').astype('int64')
y_span = pad_sequences(df.binary, maxlen = 50, value = 0, padding = 'post').astype('int64')

# One-hot encode every label in a single vectorized lookup: row k of the 2x2
# identity matrix is the one-hot vector for class k. The result has shape
# y_span.shape + (2,) and dtype float64, the same as the element-by-element
# loop it replaces, without calling to_categorical once per cell.
y_span_cat = np.eye(2)[y_span]

# Fixed seed so that a fresh kernel reproduces the same ordering and split.
X, y_span_cat = shuffle(X, y_span_cat, random_state = 42)

X_train, X_test, y_train, y_test = train_test_split(X, y_span_cat, test_size=0.1, random_state=42)

HBox(children=(IntProgress(value=0, max=509285), HTML(value='')))




# model: find spans

In [14]:
def build_find_model(input_len, embedding, num_units, dropout_rate):
    """Assemble the (uncompiled) per-token span classifier.

    The Keras session is cleared first. The network is: embedding (weights
    taken from `embedding`, not trainable, `mask_zero = True`) -> bidirectional
    LSTM returning the full sequence -> dropout -> time-distributed ReLU dense
    -> dropout -> time-distributed 2-way softmax.

    Args:
        input_len: number of token indices per input sequence.
        embedding: embedding weights; `shape[0]` is used as the input dimension
            and `shape[1]` as the output dimension of the embedding layer.
        num_units: units of each LSTM direction and of the hidden dense layer.
        dropout_rate: rate of both dropout layers.

    Returns:
        A `Model` mapping `main_input` to a 2-class softmax per position.
    """
    K.clear_session()

    main_input = Input(shape = (input_len,), dtype = 'int64', name = 'main_input')

    # The embedding lookup is pinned to the CPU.
    with tf.device('/cpu:0'):
        lookup = Embedding(
            input_dim = embedding.shape[0],
            output_dim = embedding.shape[1],
            weights = [embedding],
            trainable = False,
            mask_zero = True,
            name = 'embedding_layer',
        )
        x = lookup(main_input)

    # Sequence tagger stack; each step consumes the previous step's output.
    x = Bidirectional(LSTM(units = num_units, return_sequences = True), name = 'bi-lstm')(x)
    x = Dropout(rate = dropout_rate, name = 'dropout_lstm')(x)
    x = TimeDistributed(Dense(num_units, activation = 'relu'), name = 'dense')(x)
    x = Dropout(rate = dropout_rate, name = 'dropout_dense')(x)
    # Open question from the author: TimeDistributed may be redundant here,
    # since Dense might handle 3D input on its own.
    span_probs = TimeDistributed(Dense(2, activation = 'softmax'), name = 'output')(x)

    return Model(inputs = main_input, outputs = span_probs)

In [19]:
# Build the span tagger for the padded sequence length of X and list its layers.
find_model = build_find_model(
    input_len = X.shape[1],
    embedding = embedding,
    num_units = 128,
    dropout_rate = 0.25,
)
find_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      (None, 50)                0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 50, 100)           14830400  
_________________________________________________________________
bi-lstm (Bidirectional)      (None, 50, 256)           234496    
_________________________________________________________________
dropout_lstm (Dropout)       (None, 50, 256)           0         
_________________________________________________________________
dense (TimeDistributed)      (None, 50, 128)           32896     
_________________________________________________________________
dropout_dense (Dropout)      (None, 50, 128)           0         
_________________________________________________________________
output (TimeDistributed)     (None, 50, 2)             258       
Total para

In [20]:
find_model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['binary_accuracy'],
)

# One epoch over the training split; the returned history is kept for inspection.
find_history = find_model.fit(X_train, y_train, batch_size = 32, epochs = 1)

Epoch 1/1


In [21]:
def evaluate_find(X, y, model = None):
    """Score span predictions against one-hot labels and print two summaries.

    Prints the fraction of sequences whose labels are all predicted correctly,
    and the average number of wrongly labelled positions per sequence. Every
    position of every sequence is compared, padded positions included.

    Args:
        X: model input, handed unchanged to `model.predict`.
        y: one-hot labels indexed as (sequence, position, class).
        model: object exposing `predict(X)`; defaults to the module-level
            `find_model`.

    Returns:
        The predicted class index for every position, indexed as
        (sequence, position).
    """
    if model is None:
        model = find_model

    true = np.argmax(y, axis = 2)
    pred = np.argmax(model.predict(X), axis = 2)
    total = float(y.shape[0])

    # Vectorized comparison instead of a Python loop over every position.
    mismatches = true != pred
    total_correct = int((~mismatches).all(axis = 1).sum())  # sequences with no error
    indiv_wrong = int(mismatches.sum())                     # wrong positions overall

    print('Absolute accuracy (all correct):\t\t' + str(total_correct / total))

    print('Average number of incorrect labels per answer:\t' + str(indiv_wrong / total))

    return pred

# Score the held-out split; `results` keeps the predicted label of every position.
results = evaluate_find(X_test, y_test)

HBox(children=(IntProgress(value=0, max=50929), HTML(value='')))


Absolute accuracy (all correct):		0.8779084607983664
Average number of incorrect labels per answer:	0.5282844744644505


In [48]:
def predict_find(X, maxlen = 50, model = None):
    """Print the predicted span label next to each token of one sequence.

    Args:
        X: list of token indices for a single sequence.
        maxlen: positive length the sequence is padded / truncated to; it has
            to match the input length the model was built with.
        model: object exposing `predict`; defaults to the module-level
            `find_model`.

    Prints one `label<TAB>token` line per token that was fed to the model.
    """
    if model is None:
        model = find_model

    pad_X = pad_sequences([X], value = 0, padding = 'post', maxlen = maxlen).astype('int64')
    pred = np.argmax(model.predict([pad_X], batch_size = 1), axis = 2)

    # pad_sequences truncates from the FRONT by default, so for over-long input
    # the model sees the last `maxlen` tokens. Pair the labels with exactly those
    # tokens; pairing them with the first `maxlen` tokens mislabels the output.
    tokens = X[-maxlen:]

    # zip stops at the shorter side, so padded positions are not printed.
    result = ''.join(str(label) + '\t' + idx2w[tok] + '\n'
                     for label, tok in zip(pred[0], tokens))
    print(result)
    
# Qualitative check: tag one hand-written sentence and print the per-token labels.
text = """
He instilled upon her a love for animals.
"""
predict_find(seq_to_idx(text))

0	he
0	instilled
0	upon
0	her
0	a
0	love
0	for
0	animals
0	.



# model: suggest replacements for spans

In [None]:
def build_suggest_model(orig_len, repl_len, embedding, num_units, dropout_rate):
    """Assemble the (uncompiled) encoder-decoder model that suggests replacements.

    The Keras session is cleared first. Both inputs go through one shared
    embedding layer (weights from `embedding`, not trainable, `mask_zero = True`).
    A bidirectional LSTM encodes the original sequence; its forward and backward
    final states are concatenated and used as the initial state of the decoder
    LSTM, which reads the replacement input. The decoder outputs pass through a
    time-distributed ReLU dense layer and a time-distributed softmax over
    `embedding.shape[0]` classes.

    Args:
        orig_len: length of the original (encoder) input sequences.
        repl_len: length of the replacement (decoder) input sequences.
        embedding: embedding weights; `shape[0]` is the input dimension and
            `shape[1]` the output dimension of the embedding layer.
        num_units: units per encoder direction; the decoder uses `2 * num_units`.
        dropout_rate: accepted but not used by this function at present.

    Returns:
        A `Model` with inputs `[orig_input, repl_input]` and one softmax
        distribution per decoder position as output.
    """
    K.clear_session()

    orig_input = Input(shape = (orig_len,), dtype = 'int64', name = 'orig_input')
    repl_input = Input(shape = (repl_len,), dtype = 'int64', name = 'repl_input')

    # One embedding layer serves both inputs; the lookup is pinned to the CPU.
    with tf.device('/cpu:0'):
        shared_embedding = Embedding(
            input_dim = embedding.shape[0],
            output_dim = embedding.shape[1],
            weights = [embedding],
            trainable = False,
            mask_zero = True,
            name = 'embedding_layer',
        )
        orig_embed = shared_embedding(orig_input)
        repl_embed = shared_embedding(repl_input)

    # Plain encoder-decoder for now; the author intends to move to a
    # context-based / conditional model with attention later.
    encoder = Bidirectional(LSTM(units = num_units, return_state = True), name = "encoder")
    decoder = LSTM(units = 2 * num_units, return_sequences = True, return_state = True, name = "decoder")

    # The encoder's sequence output is not needed, only its four final states.
    _, fwd_h, fwd_c, bwd_h, bwd_c = encoder(orig_embed)
    init_h = Concatenate()([fwd_h, bwd_h])
    init_c = Concatenate()([fwd_c, bwd_c])
    dec_seq = decoder(repl_embed, initial_state = [init_h, init_c])[0]

    # Open question from the author: add dropout between encoder and decoder?

    hidden_proj = TimeDistributed(Dense(num_units, activation = 'relu'), name = 'dense_layer')
    vocab_proj = TimeDistributed(Dense(embedding.shape[0], activation='softmax'), name = 'dense_output')

    repl_output = vocab_proj(hidden_proj(dec_seq))

    return Model(inputs = [orig_input, repl_input], outputs = repl_output)

In [None]:
def suggest_generator_train(data, batch_size):
    """Yield training batches for the suggestion model, forever.

    The three arrays are shuffled together once, then served in consecutive
    slices of `batch_size`; the final slice of a pass may be shorter. After
    the last sample the generator wraps around to the start.

    Args:
        data: sequence of three arrays in the order
            (encoder input, decoder input, decoder target).
        batch_size: positive number of samples per batch.

    Yields:
        `(inputs, targets)` where `inputs` is `[encoder batch, decoder batch]`
        and `targets` is the decoder target batch one-hot encoded over
        `embedding.shape[0]` classes (module-level `embedding`).

    Raises:
        ValueError: if `batch_size` is not positive or the data is empty.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive')

    X_orig_whole, dec_input_whole, dec_target_whole = shuffle(data[0], data[1], data[2])

    n_samples = len(X_orig_whole)
    if n_samples == 0:
        raise ValueError('cannot generate batches from empty data')

    start = 0
    while True:
        stop = min(start + batch_size, n_samples)

        # Slice the shuffled local arrays, not a global of similar name.
        X_orig_batch = X_orig_whole[start:stop]
        dec_input_batch = dec_input_whole[start:stop]
        dec_target_batch = dec_target_whole[start:stop]

        # Wrap as soon as the end is reached, so that a dataset whose size is
        # a multiple of batch_size never produces an empty batch.
        start = stop if stop < n_samples else 0

        inputs = [X_orig_batch, dec_input_batch]
        targets = np.array([to_categorical(x, num_classes = embedding.shape[0]) for x in dec_target_batch])

        yield inputs, targets

In [None]:
# Build the replacement-suggestion model and list its layers.
# NOTE(review): `X_orig` and `dec_input` are not defined in any cell visible here;
# they presumably come from a data-preparation cell that is missing - confirm
# before relying on Restart & Run All.
suggest_model = build_suggest_model(orig_len = X_orig.shape[1],
                                   repl_len = dec_input.shape[1],
                                   embedding = embedding,
                                   num_units = 128,
                                   dropout_rate = 0.25)

suggest_model.summary()

In [None]:
# The generator yields one-hot targets, hence the categorical loss.
suggest_model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
batch_size = 16

# Train from the endless batch generator; one epoch is as many full batches
# as fit into the data.
suggest_history = suggest_model.fit_generator(
    suggest_generator_train([X_orig, dec_input, dec_target], batch_size),
    steps_per_epoch = len(X_orig) // batch_size,
    epochs = 5,
    verbose = 1)
# Options not enabled yet:
#   validation_data = (x_val, y_val),
#   use_multiprocessing = True,
#   workers = 6

In [None]:
#model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
#model = load_model('my_model.h5')

## inference mode

In [None]:
# Inference-time encoder and decoder. The layers and input tensors are fetched
# from the trained `suggest_model`, because the names used inside
# build_suggest_model are local to that function and do not exist here.
# Re-applying the same layer objects reuses their trained weights.
orig_input, repl_input = suggest_model.inputs
embedding_layer = suggest_model.get_layer('embedding_layer')
encoder = suggest_model.get_layer('encoder')
decoder = suggest_model.get_layer('decoder')
dense = suggest_model.get_layer('dense_layer')
dec_tdd = suggest_model.get_layer('dense_output')

# Size of each decoder state; the decoder was built with 2 * num_units units.
dec_state_size = decoder.units

orig_embed = embedding_layer(orig_input)
repl_embed = embedding_layer(repl_input)

# redefine encoder model: takes original input, outputs the states
_, enc_h_forward, enc_c_forward, enc_h_backward, enc_c_backward = encoder(orig_embed)
enc_h = Concatenate()([enc_h_forward, enc_h_backward])
enc_c = Concatenate()([enc_c_forward, enc_c_backward])
enc_model = Model(inputs = orig_input, outputs = [enc_h, enc_c])

# define the states to input into the decoder (this is what you get from the encoder)
inf_dec_h_input = Input(shape = (dec_state_size,)) #enc_h
inf_dec_c_input = Input(shape = (dec_state_size,)) #enc_c
inf_dec_states_input = [inf_dec_h_input, inf_dec_c_input]

# these are the outputs you get when you run the decoder, set them up matching the original model
# repl_embed is more of a placeholder - of course you won't actually have the answer when you infer
inf_dec_main, inf_dec_h, inf_dec_c = decoder(repl_embed, initial_state = inf_dec_states_input)
inf_dec_states = [inf_dec_h, inf_dec_c]
inf_dec_dense = dense(inf_dec_main)
inf_dec_output = dec_tdd(inf_dec_dense)

# define decoder model
dec_model = Model([repl_input] + inf_dec_states_input, [inf_dec_output] + inf_dec_states)

In [None]:
def decode(input_seq):
    """Greedily decode a replacement for one encoded input sequence.

    Uses the module-level `enc_model`, `dec_model`, `w2idx` and `idx2w`.
    Decoding starts from the start token '\\t' and feeds each sampled token
    back in as the next decoder input, one token per step.

    Args:
        input_seq: batch of one sequence of token indices, as accepted by
            `enc_model.predict`.

    Returns:
        The list of decoded tokens, without start or stop token. Decoding
        ends at the stop token '\\n', at a predicted padding index, or once
        as many tokens were produced as the decoder input is long.
    """
    # Decoder input length, read off the model: the first input of dec_model
    # is the replacement sequence. No such name exists at module level.
    repl_len = dec_model.input_shape[0][1]
    pad_idx = 0  # padding index; it has no entry in idx2w

    states = enc_model.predict(input_seq)

    # begin output sequence, use start character
    target_seq = np.zeros((1, repl_len))
    target_seq[0, 0] = w2idx['\t']

    # using batch_size = 1, sample in a loop; at most repl_len tokens
    decoded = []
    while len(decoded) < repl_len:
        output_tok, h, c = dec_model.predict([target_seq] + states)
        states = [h, c] # update states

        # Only position 0 of target_seq holds a real token, so the prediction
        # for the next token is the output at position 0.
        sample_idx = int(np.argmax(output_tok[0, 0, :]))

        # A predicted padding index cannot be looked up in idx2w; treat it as the end.
        if sample_idx == pad_idx:
            break

        sample_tok = idx2w[sample_idx]
        if sample_tok == '\n':
            break

        # record the token and feed it back as the next decoder input
        decoded.append(sample_tok)
        target_seq = np.zeros((1, repl_len))
        target_seq[0, 0] = sample_idx

    return decoded

In [None]:
test = 'is not'

# Pad with index 0 (the padding value used for every other sequence in this
# notebook) up to the input length the encoder model expects.
decode(pad_sequences([seq_to_idx(test)], value = 0, padding = 'post', maxlen = enc_model.input_shape[1]).astype('int64'))

In [None]:
# Show ten randomly drawn original sequences next to their decoded suggestion.
for idx in np.random.choice(len(X_orig), 10):
    # Rebuild the text from the indices, leaving out the 0 padding.
    sent = ''.join(idx2w[x] + ' ' for x in X_orig[idx] if x != 0)
    print(sent)
    print(decode(np.array([X_orig[idx]])))