In [1]:
import re
import json
import numpy as np

In [2]:
from keras.models import load_model, Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten

Using TensorFlow backend.


In [3]:
# Load the saved Keras model from an HDF5 file (path is relative to the working directory).
# implement() below calls model.predict() on it to get next-token probabilities.
model = load_model('model20180120_7.h5')

In [4]:
def load_dict(filename):
    """Read a JSON file and return the decoded object.

    The file is opened explicitly as UTF-8 so that non-ASCII entries decode
    the same way on every platform instead of depending on the locale's
    default encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_index_word_map(word2ind_filename='word2ind', ind2word_filename='ind2word'):
    """Load the two vocabulary mappings from their JSON files.

    Args:
        word2ind_filename: Path of the JSON file holding the word -> index map.
        ind2word_filename: Path of the JSON file holding the index -> word map.

    Returns:
        A 2-tuple ``(word2ind, ind2word)``, loaded in that order via ``load_dict``.
    """
    paths = (word2ind_filename, ind2word_filename)
    return tuple(load_dict(path) for path in paths)

In [5]:
# Module-level vocabulary lookups, read from the default 'word2ind' / 'ind2word' files.
# tokenise(), detokenise() and semantic_not_end() read these globals directly.
word2ind, ind2word = load_index_word_map()

In [6]:
# Patterns used by input_cleaning() AFTER every apostrophe has been spaced out to " ' ".
# They locate the pieces that must be glued back to their apostrophe.

# Apostrophe, then one of the listed fragments (each with its leading space), then a
# space that is not followed by another apostrophe: e.g. "' s " or "' cause ".
apos_end_pattern = r"'( cause| d| em| ll| m| n| re| s| til| till| twas| ve) (?!')"
# Space, one of the listed prefixes (each with its trailing space), then an apostrophe:
# e.g. " y '".
apos_start_pattern = r" (d |j |l |ol |y )'"
# The spaced-out form of 'n' (apostrophes on both sides).
apos_double_pattern = r" ' n ' "

In [7]:
def input_cleaning(sentence):
    """Normalise raw seed text and split it into word tokens.

    The text is lower-cased, punctuation/digits/newlines are isolated as their
    own tokens, a trailing ``in'`` is rewritten to ``ing``, and contractions
    are split into pieces such as ``ca`` + ``n't`` or ``she`` + ``'s``.

    The recovery patterns (``apos_end_pattern``, ``apos_start_pattern``,
    ``apos_double_pattern``, and the ``n't`` fix-up) all rely on a space before
    and/or after the fragment.  The text is therefore padded with one space on
    each side before they run; without the padding a contraction at the very
    start or end of the input (e.g. a seed ending in "can't") stayed broken
    into separate ``n``, ``'``, ``t`` tokens.

    Args:
        sentence: Raw input string.

    Returns:
        A list of token strings.  An empty/blank input yields ``['']``.
    """
    sentence = sentence.lower()
    sentence = sentence.replace("''cause", "'cause")
    sentence = sentence.replace("n't", " n't")
    sentence = sentence.replace("''", '"')
    # Surround punctuation, digits and newlines with spaces so each becomes a token.
    # The hyphen is escaped explicitly; the set of characters matched is the same
    # as the original ",-." range (comma, hyphen, full stop).
    sentence = re.sub(r'[\n!"\(\),\-.0-9:?\[\]]', lambda x: ' '+x.group(0)+' ', sentence)
    # doin' => doing (only when the apostrophe ends the word)
    sentence = re.sub(r"\w+in'$|\w+in'\s", lambda m: m.group(0).replace("'", 'g'), sentence)
    sentence = sentence.replace("'", " ' ")
    # Pad so the space-anchored recovery steps below also fire at the string edges.
    sentence = ' ' + sentence + ' '
    # recover 'cause, 'd, 'em, 'll, 'm, 'n, 're, 's, 'til, 'till, 'twas, 've
    sentence = re.sub(apos_end_pattern, lambda m: m.group(0)[:1]+m.group(0)[2:], sentence)
    # recover d', j', l', ol', y'
    sentence = re.sub(apos_start_pattern, lambda m: m.group(0)[:-2]+m.group(0)[-1:], sentence)
    # recover 'n'
    sentence = re.sub(apos_double_pattern, lambda m: m.group(0)[:2]+m.group(0)[3]+m.group(0)[-2:], sentence)
    sentence = sentence.replace(" n ' t ", " n't ")
    # Collapse the runs of spaces introduced above (including the padding).
    sentence = re.sub(r' {2,}', ' ', sentence)
    sentence = sentence.strip()
    sentence = sentence.split(' ')
    return sentence

In [8]:
def tokenise(words):
    """Convert words to their integer indices using the global ``word2ind``.

    Args:
        words: Iterable of word strings.

    Returns:
        A list with one index per word; words missing from ``word2ind`` map to 0.
    """
    unknown_index = 0
    indices = []
    for word in words:
        indices.append(word2ind.get(word, unknown_index))
    return indices

In [9]:
def detokenise(tokens):
    """Convert integer indices back to words using the global ``ind2word``.

    ``ind2word`` is looked up with the string form of each index.

    Args:
        tokens: Iterable of token indices.

    Returns:
        A list with one word per index.

    Raises:
        KeyError: If an index has no entry in ``ind2word``.
    """
    words = []
    for index in tokens:
        key = str(index)
        words.append(ind2word[key])
    return words

In [10]:
# Patterns used by formatting() on space-joined tokens to glue contraction pieces
# back onto their neighbours.

# Suffix fragments that attach to the PRECEDING word ("she 's" -> "she's").
# 'em, 'til, 'till and 'twas are deliberately not listed: like 'cause they are
# standalone words, and gluing them produced output such as "hears'em" or
# "wait'til".  'n is kept as in the original list.
unapos_end_pattern = r" '(d|ll|m|n|re|s|ve) "
# Prefix fragments that attach to the FOLLOWING word ("y' all" -> "y'all").
unapos_start_pattern = r" (d|j|l|ol|y)' "
# 'n' attaches on both sides ("rock 'n' roll" -> "rock'n'roll").
unapos_double_pattern = r" 'n' "

In [11]:
def formatting(sentence):
    """Turn a space-joined token string back into readable text.

    Steps, in order:
      1. While every token is still separated by single spaces, capitalise the
         pronoun "i" and rejoin contraction pieces ("ca n't" -> "can't",
         "she 's" -> "she's", "y' all" -> "y'all", "doin '" -> "doin'").
      2. Remove the space after opening brackets and before closing
         brackets / punctuation, and tighten double-quoted spans.
      3. Capitalise the first word and the word after ``.``, ``!`` or ``?``.
      4. Collapse repeated spaces and capitalise the first word of each line.

    Step 1 runs first, on a space-padded copy, because its patterns require a
    space on both sides of the fragment.  Previously punctuation was tightened
    first, so a contraction directly before punctuation ("ca n't ." ->
    "ca n't."), after "(" or at either end of the string was never rejoined.

    Args:
        sentence: Tokens joined by single spaces (newline is its own token).

    Returns:
        The formatted string, stripped of leading/trailing whitespace.
    """
    # --- 1. space-delimited fix-ups (padding lets them match at the edges) ---
    sentence = ' ' + sentence + ' '
    # Must precede the contraction joins: " i 'm " has to become " I 'm " before "I'm".
    sentence = re.sub(r' i ', ' I ', sentence)
    sentence = re.sub(unapos_start_pattern, lambda x: x.group(0)[:-1], sentence)
    sentence = re.sub(unapos_end_pattern, lambda x: x.group(0)[1:], sentence)
    sentence = re.sub(unapos_double_pattern, lambda x: x.group(0)[1:-1], sentence)
    sentence = sentence.replace(" n't ", "n't ")
    # doin ' => doin'
    sentence = re.sub(r"(\win) (' )", lambda m: m.group(1)+m.group(2), sentence)
    # Drop the padding again so the '^' anchor below sees the first word.
    sentence = sentence.strip(' ')

    # --- 2. punctuation spacing ---
    sentence = re.sub(r'[\(\[] ', lambda x: x.group(0)[:-1], sentence)
    sentence = re.sub(r' [\)\].!?,:]', lambda x: x.group(0)[1:], sentence)
    sentence = re.sub(r'" .+ "', lambda x: x.group(0).replace('" ', '"').replace(' "', '"'), sentence)

    # --- 3. sentence-initial capitals ---
    sentence = re.sub(r'^[\[\("]?\w|[?.!]"? \w', lambda x: x.group(0).upper(), sentence)

    # --- 4. whitespace clean-up ---
    sentence = re.sub(r' {2,}', ' ', sentence)
    # first letter alignment
    sentence = re.sub(r'(\n) (\w)', lambda m: m.group(1)+m.group(2).upper(), sentence)
    sentence = sentence.strip()
    return sentence

In [12]:
def semantic_not_end(tokens):
    """Report whether the token sequence is NOT at a clean stopping point.

    The sequence counts as finished only when square brackets and round
    brackets each occur in equal open/close numbers, the number of double
    quotes is even, and the last token is one of ``.``, ``!``, ``?``, ``"``
    or a newline.

    Args:
        tokens: Non-empty list of token indices.

    Returns:
        True if any of the conditions above fails, False otherwise.
    """
    def occurrences(symbol):
        # How many times the token for `symbol` appears in the sequence.
        return tokens.count(word2ind[symbol])

    square_balanced = occurrences('[') == occurrences(']')
    round_balanced = occurrences('(') == occurrences(')')
    quotes_paired = occurrences('"') % 2 == 0
    last_word = ind2word[str(tokens[-1])]
    sentence_closed = last_word in ['.', '!', '?', '"', '\n']
    return not (square_balanced and round_balanced and quotes_paired and sentence_closed)

In [13]:
def implement(seed_text, n_words, maxlen=10, must_stop=200):
    """Generate text from ``seed_text`` by repeatedly sampling the global ``model``.

    Each step feeds the last ``maxlen`` tokens (left-padded with zeros by
    ``pad_sequences`` when fewer are available) to ``model.predict`` and samples
    the next token index from the returned probabilities.

    Generation continues while fewer than ``n_words`` tokens have been produced
    or ``semantic_not_end`` says the text is not at a clean stopping point, but
    never for more than ``must_stop`` steps.

    Changes relative to the earlier version:
      * The stopping check looks at the whole text generated so far rather than
        at the zero-padded ``maxlen`` window, so a bracket or quote opened more
        than ``maxlen`` tokens ago is still counted.
      * Probabilities are converted to float64 and renormalised before sampling,
        which avoids ``np.random.choice`` rejecting float32 model output whose
        sum is not close enough to 1.

    Args:
        seed_text: Raw text used to start generation.
        n_words: Minimum number of tokens to generate (subject to ``must_stop``).
        maxlen: Number of trailing tokens given to the model at each step.
        must_stop: Hard upper bound on the number of generated tokens; it takes
            precedence over ``n_words``.

    Returns:
        The seed plus generated tokens, detokenised and passed through
        ``formatting``.
    """
    res_tokens = tokenise(input_cleaning(seed_text))
    while (n_words > 0 or semantic_not_end(res_tokens)) and must_stop > 0:
        # pad_sequences pads/truncates at the front by default, so this is the
        # last `maxlen` tokens of the text, zero-padded on the left if needed.
        window = pad_sequences([res_tokens], maxlen=maxlen)
        probs = np.asarray(model.predict(window)[0], dtype=np.float64)
        probs /= probs.sum()
        # Plain int keeps res_tokens a homogeneous list of Python ints.
        predicted = int(np.random.choice(len(probs), p=probs))
        res_tokens.append(predicted)
        n_words -= 1
        must_stop -= 1
    detokenised = detokenise(res_tokens)
    return formatting(' '.join(detokenised))

In [14]:
# Replace seed_text with your own.
# NOTE: implement() stops after `must_stop` generated tokens (default 200), so with the
# default in place n_words=500 yields at most 200 tokens; pass must_stop explicitly to
# generate more. Sampling is random, so each run produces different text.
res = implement(seed_text="I can't sleep as she said she liked me", n_words=500, maxlen=20)

In [15]:
# print() rather than a bare `res` so the newline characters render as line breaks.
print(res)

I can't sleep as she said she liked me 
Good time and you opened out me fifteen miles 
 
You'd probably tell me what to do, let me go steppin' 
But a drag's rising low, oh where long grows? 
 
How I can't love that where I'm blessed from nightmare to bury the sidewalk 
Everyone is a thousand - yeah 
Maybe he's gonna squeeze him, little woman takes your hands low 
Mama'll be listening, maybe she's hurry one more line 
I won't stop nothin' I want, hug a friend 
 
Always failing 
Still with you 
 
No and I'm the last thing I have left there 
I'm moving back to myself for the test of men 
She's brushing your ride 
They say I had that song 
I - I won't give me no magic to come 
You're sleeping by the journey, no one hears'em 
He'll lay the stone and obey us. 
 
The president I just rather know is breaking me by the rain.
