In [1]:
import re
import json
import numpy as np

In [2]:
from keras.models import load_model, Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten

Using TensorFlow backend.


In [3]:
model = load_model('model20171227_2.h5')

In [7]:
vocab_size = 13062
emb_dim = 50
input_length = 10

In [9]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emb_dim, input_length=input_length))
model.add(LSTM(1000, activation='relu', return_sequences=True))
model.add(LSTM(1000, activation='relu', return_sequences=True))
model.add(Dense(1000, activation='relu'))
model.add(Flatten())
model.add(Dense(vocab_size, activation='softmax'))

In [10]:
model.load_weights('model20171227.h5')

In [4]:
def load_dict(filename):
    with open(filename, 'r') as f:
        return json.load(f)

def load_index_word_map(word2ind_filename='word2ind', ind2word_filename='ind2word'):
    word2ind = load_dict(word2ind_filename)
    ind2word = load_dict(ind2word_filename)
    return word2ind, ind2word

In [5]:
word2ind, ind2word = load_index_word_map()

In [6]:
apos_end_pattern = r"'( cause| d| em| ll| m| n| re| s| til| till| twas| ve) (?!')"
apos_start_pattern = r" (d |j |l |ol |y )'"
apos_double_pattern = r" ' n ' "

In [7]:
def input_cleaning(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'[\n!"\(\),-.0-9:?\[\]]', lambda x: ' '+x.group(0)+' ', sentence)
    sentence = sentence.replace("''", '"')
    sentence = re.sub(r"\w+in'$|\w+in'\s", lambda m: m.group(0).replace("'", 'g'), sentence)
    sentence = sentence.replace("'", " ' ")
    # recover 'cause, 'd, 'em, 'll, 'm, 'n, 're, 's, 'til, 'till, 'twas
    sentence = re.sub(apos_end_pattern, lambda m: m.group(0)[:1]+m.group(0)[2:], sentence)
    # recover d', j', l', ol', y'
    sentence = re.sub(apos_start_pattern, lambda m: m.group(0)[:-2]+m.group(0)[-1:], sentence)
    # recover 'n'
    sentence = re.sub(apos_double_pattern, lambda m: m.group(0)[:2]+m.group(0)[3]+m.group(0)[-2:], sentence)
    sentence = re.sub(r' {2,}', ' ', sentence)
    sentence = sentence.strip()
    sentence = sentence.split(' ')
    return sentence

In [8]:
def tokenise(words):
    return [word2ind.get(word, 0) for word in words]

In [9]:
def detokenise(tokens):
    return [ind2word[str(ind)] for ind in tokens]

In [10]:
unapos_end_pattern = r" '(d|em|ll|m|n|re|s|til|till|twas|ve) "
unapos_start_pattern = r" (d|j|l|ol|y)' "
unapos_double_pattern = r" 'n' "

In [11]:
def uncleaning(sentence):
    sentence = re.sub(r'[\(\[] ', lambda x: x.group(0)[:-1], sentence)
    sentence = re.sub(r' [\)\].!?]', lambda x: x.group(0)[1:], sentence)
    sentence = re.sub(r'" .+ "', lambda x: x.group(0).replace('" ', '"').replace(' "', '"'), sentence)
    sentence = re.sub(r'^[\[\("]?\w|[?.!]"? \w', lambda x: x.group(0).upper(), sentence)
    sentence = re.sub(r' i ', ' I ', sentence)
    sentence = re.sub(unapos_start_pattern, lambda x: x.group(0)[:-1], sentence)
    sentence = re.sub(unapos_end_pattern, lambda x: x.group(0)[1:], sentence)
    sentence = re.sub(unapos_double_pattern, lambda x: x.group(0)[1:-1], sentence)
    sentence = re.sub(r' {2,}', ' ', sentence)
    sentence = sentence.strip()
    return sentence

In [12]:
def semantic_not_end(tokens):
    unequal_square_quotes = tokens.count(word2ind['[']) != tokens.count(word2ind[']'])
    unequal_brackets = tokens.count(word2ind['(']) != tokens.count(word2ind[')'])
    odd_double_quotes = tokens.count(word2ind['"']) % 2 != 0
    nonfinished_sentence = ind2word[str(tokens[-1])] not in ['.', '!', '?', '"']
    not_end = unequal_square_quotes or unequal_brackets or odd_double_quotes or nonfinished_sentence
    return not_end

In [13]:
def implement(seed_text, n_words, maxlen=10, must_stop=200):
    cleaned = input_cleaning(seed_text)
    res_tokens = padded_input_tokens = tokenise(cleaned)
    while (n_words > 0 or semantic_not_end(padded_input_tokens)) and must_stop > 0:
        padded_input_tokens = pad_sequences([padded_input_tokens], maxlen=maxlen)
        predicted = np.argmax(model.predict(padded_input_tokens))
        padded_input_tokens = padded_input_tokens[0].tolist()
        padded_input_tokens.append(predicted)
        res_tokens.append(predicted)
        n_words -= 1
        must_stop -= 1
    detokenised = detokenise(res_tokens)
    return uncleaning(' '.join(detokenised))

In [14]:
res = implement('standing in the rain, enjoying the moment in the dark', 30)

In [15]:
print(res)

Standing in the rain , enjoying the moment in the dark , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,
