In [1]:
import re
import json
import numpy as np
from collections import deque

In [22]:
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [4]:
# Load the trained Keras model from an HDF5 file; the relative path is resolved
# against the current working directory. implement() calls model.predict() on it
# to obtain next-token probabilities.
model = load_model('model20180126_1.h5')

In [5]:
def load_dict(filename):
    """Read a JSON document from ``filename`` and return the decoded object.

    The file is opened explicitly as UTF-8 (the encoding JSON is exchanged in)
    so that non-ASCII vocabulary entries decode identically on every platform
    instead of depending on the locale's default encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid JSON (``json.JSONDecodeError``).
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_index_word_map(word2ind_filename='word2ind', ind2word_filename='ind2word'):
    """Load the two vocabulary lookup tables from their JSON files.

    Args:
        word2ind_filename: Path of the word -> index mapping file.
        ind2word_filename: Path of the index -> word mapping file.

    Returns:
        A ``(word2ind, ind2word)`` tuple, loaded in that order via ``load_dict``.
    """
    forward_map, reverse_map = (
        load_dict(path) for path in (word2ind_filename, ind2word_filename)
    )
    return forward_map, reverse_map

In [6]:
# Vocabulary lookups read from the default 'word2ind' / 'ind2word' JSON files.
# ind2word is looked up with the index converted to a string (see detokenise).
word2ind, ind2word = load_index_word_map()

In [7]:
# Token ids of the three-token "( end )" marker. implement() stops sampling once
# the last three generated tokens equal this list.
ending = [word2ind[symbol] for symbol in ('(', 'end', ')')]

In [8]:
# Show the ids that the "(", "end" and ")" tokens resolved to in this vocabulary.
ending

[17, 22216, 18]

In [9]:
# Patterns used by input_cleaning() after every apostrophe has been spaced out
# (x's -> "x ' s"); each one locates a fragment that must be glued back together.

# Apostrophe + space + suffix + space ("' s " -> "'s "). The negative lookahead
# keeps "' n '" from matching here so the double-apostrophe rule can handle it.
apos_end_pattern = "'(" + "|".join(
    " " + suffix
    for suffix in ("cause", "d", "em", "ll", "m", "n", "re", "s", "til", "till", "twas", "ve")
) + ") (?!')"

# Space + prefix + space + apostrophe (" y '" -> " y'").
apos_start_pattern = " (" + "|".join(
    prefix + " " for prefix in ("d", "j", "l", "ol", "y")
) + ")'"

# A spaced-out 'n' (" ' n ' " -> " 'n' ").
apos_double_pattern = " ' n ' "

In [10]:
def input_cleaning(sentence):
    """Normalise raw seed text and split it into vocabulary-style word tokens.

    The text is lower-cased, punctuation/digits/newlines become separate
    tokens, a trailing ``-in'`` is expanded to ``-ing``, and apostrophe
    fragments are regrouped into tokens such as ``'s``, ``n't``, ``y'`` and
    ``'n'`` using the module-level ``apos_*`` patterns.

    Args:
        sentence: Raw text typed by the user.

    Returns:
        list of str: The tokens, in order. An empty input yields ``['']``.
    """
    sentence = sentence.lower()
    sentence = sentence.replace("''cause", "'cause")
    sentence = sentence.replace("n't", " n't")
    sentence = sentence.replace("''", '"')
    # Surround punctuation, digits and newlines with spaces so each becomes its
    # own token. The hyphen is escaped explicitly: the previous ",-." spelling
    # was a character range that only happened to cover the same three symbols.
    sentence = re.sub(r'[\n!"(),\-.0-9:?\[\]]', lambda x: ' ' + x.group(0) + ' ', sentence)
    # doin' -> doing (must run before apostrophes are spaced out and before the
    # padding below, because it relies on the end-of-string anchor).
    sentence = re.sub(r"\w+in'$|\w+in'\s", lambda m: m.group(0).replace("'", 'g'), sentence)
    sentence = sentence.replace("'", " ' ")
    # The recovery patterns below need a space on both sides of a fragment.
    # Pad the text so contractions at the very start or end of the seed
    # ("i can't", "y'all ...") are recovered too instead of being left as
    # stray "'", "t", "s" tokens. The padding is stripped again further down.
    sentence = ' ' + sentence + ' '
    # recover 'cause, 'd, 'em, 'll, 'm, 'n, 're, 's, 'til, 'till, 'twas, 've
    sentence = re.sub(apos_end_pattern, lambda m: m.group(0)[:1] + m.group(0)[2:], sentence)
    # recover d', j', l', ol', y'
    sentence = re.sub(apos_start_pattern, lambda m: m.group(0)[:-2] + m.group(0)[-1:], sentence)
    # recover 'n'
    sentence = re.sub(apos_double_pattern, lambda m: m.group(0)[:2] + m.group(0)[3] + m.group(0)[-2:], sentence)
    sentence = sentence.replace(" n ' t ", " n't ")
    sentence = re.sub(r' {2,}', ' ', sentence)
    sentence = sentence.strip()
    return sentence.split(' ')

In [11]:
def tokenise(words):
    """Convert words to vocabulary indices via ``word2ind``.

    Words missing from the vocabulary are mapped to index 0.

    Args:
        words: Iterable of word strings.

    Returns:
        list: One index per input word, in the same order.
    """
    indices = []
    for word in words:
        indices.append(word2ind.get(word, 0))
    return indices

In [12]:
def detokenise(tokens):
    """Convert vocabulary indices back to words via ``ind2word``.

    ``ind2word`` is keyed by the index rendered as a string, so every token is
    passed through ``str`` before the lookup.

    Args:
        tokens: Iterable of vocabulary indices.

    Returns:
        list: One word per input token, in the same order.

    Raises:
        KeyError: If an index has no entry in ``ind2word``.
    """
    words = []
    for index in tokens:
        words.append(ind2word[str(index)])
    return words

In [13]:
# Patterns used by formatting() to glue apostrophe tokens back onto their
# neighbours when turning the space-joined token stream into readable text.

# Space + 'suffix + space; formatting() drops the leading space ("i 'm " -> "i'm ").
# NOTE(review): the list also contains stand-alone elisions ('em, 'n, 'til,
# 'till, 'twas); attaching those to the previous word ("get 'em" -> "get'em")
# may not be intended - confirm.
unapos_end_pattern = " '(" + "|".join(
    ("d", "em", "ll", "m", "n", "re", "s", "til", "till", "twas", "ve")
) + ") "

# Space + prefix' + space; formatting() drops the trailing space (" y' all" -> " y'all").
unapos_start_pattern = " (" + "|".join(("d", "j", "l", "ol", "y")) + ")' "

# A free-standing 'n'; formatting() drops both surrounding spaces.
unapos_double_pattern = " 'n' "

In [14]:
def formatting(sentence):
    """Turn a space-joined token string into readable, capitalised text.

    Apostrophe fragments are re-attached, spaces around brackets, punctuation
    and double quotes are removed, sentence/line starts and the pronoun "I" are
    capitalised, and surrounding whitespace is stripped.

    Args:
        sentence: Tokens joined with single spaces (newlines are tokens too).

    Returns:
        str: The formatted text.
    """
    # Apostrophe joining relies on every token having a space on both sides, so
    # do it first - before punctuation spacing is removed - and pad the string
    # so the first/last token qualify as well. Previously "do n't ," became
    # "do n't," (and a trailing "i 'm" stayed split) because the space the
    # patterns need had already been removed or never existed.
    sentence = ' ' + sentence + ' '
    sentence = re.sub(unapos_start_pattern, lambda x: x.group(0)[:-1], sentence)
    sentence = re.sub(unapos_end_pattern, lambda x: x.group(0)[1:], sentence)
    sentence = re.sub(unapos_double_pattern, lambda x: x.group(0)[1:-1], sentence)
    sentence = sentence.replace(" n't ", "n't ")
    # doin ' => doin'
    sentence = re.sub(r"(\win) (' )", lambda m: m.group(1) + m.group(2), sentence)
    # Drop the padding (spaces only, so leading/trailing newline tokens survive
    # until the line-start capitalisation below has seen them).
    sentence = sentence.strip(' ')

    # No space after an opening bracket or before closing punctuation.
    sentence = re.sub(r'[\(\[] ', lambda x: x.group(0)[:-1], sentence)
    sentence = re.sub(r' [\)\].!?,:]', lambda x: x.group(0)[1:], sentence)
    # Tighten each quoted phrase individually. The match is non-greedy so two
    # quoted phrases on one line no longer have the text between them glued
    # together ('" a " b " c "' used to become '"a"b"c"').
    sentence = re.sub(r'" (.+?) "', lambda m: '"' + m.group(1) + '"', sentence)
    # Capitalise the first word of the text and of each sentence.
    sentence = re.sub(r'^[\[\("]?\w|[?.!]"? \w', lambda x: x.group(0).upper(), sentence)
    # Capitalise the pronoun wherever it stands alone, including next to
    # punctuation or an apostrophe ("i," / "i'm"), not only between two spaces.
    sentence = re.sub(r'\bi\b', 'I', sentence)
    sentence = re.sub(r' {2,}', ' ', sentence)
    # first letter alignment
    sentence = re.sub(r'(\n) (\w)', lambda m: m.group(1) + m.group(2).upper(), sentence)
    return sentence.strip()

In [15]:
def semantic_not_end(tokens):
    """Report whether the token sequence still looks unfinished.

    The sequence counts as unfinished when square brackets or round brackets
    are unbalanced, when the number of double quotes is odd, or when the last
    token is not one of ``.``, ``!``, ``?``, ``"`` or a newline.

    Args:
        tokens: Non-empty list of vocabulary indices.

    Returns:
        bool: True if any of the checks above flags the sequence.
    """
    def occurrences(symbol):
        # How many times the token for `symbol` appears in the sequence.
        return tokens.count(word2ind[symbol])

    # All four checks are evaluated up front, in the same order as before.
    flags = [
        occurrences('[') != occurrences(']'),
        occurrences('(') != occurrences(')'),
        occurrences('"') % 2 != 0,
        ind2word[str(tokens[-1])] not in ['.', '!', '?', '"', '\n'],
    ]
    return any(flags)

In [26]:
def implement(seed_text, n_words, maxlen=10, must_stop=2000):
    """Generate text by repeatedly sampling the next token from ``model``.

    Sampling continues while fewer than ``n_words`` tokens have been produced
    or ``semantic_not_end`` still reports an unfinished sequence. It stops
    early once ``must_stop`` tokens have been sampled or the last three
    sampled tokens equal ``ending``.

    Args:
        seed_text: Raw text used to start the generation.
        n_words: Minimum number of tokens to sample.
        maxlen: Length of the token window passed to the model.
        must_stop: Hard upper bound on the number of sampled tokens.

    Returns:
        str: The seed plus the generated tokens, passed through ``formatting``.
    """
    context_tokens = tokenise(input_cleaning(seed_text))
    res_tokens = list(context_tokens)
    last_three = deque([0, 0, 0], maxlen=3)
    # NOTE(review): the "unfinished" check looks only at the current model
    # window (the last maxlen + 1 tokens), not at the whole of res_tokens, so a
    # bracket/quote opened further back is forgotten - confirm this is intended.
    while ((n_words > 0 or semantic_not_end(context_tokens))
           and must_stop > 0
           and list(last_three) != ending):
        # With Keras' defaults pad_sequences left-pads with 0 and keeps only
        # the last `maxlen` tokens.
        window = pad_sequences([context_tokens], maxlen=maxlen)
        # Normalise in float64: np.random.choice validates that p sums to 1,
        # and low-precision model outputs are the usual cause of that failing.
        probs = np.asarray(model.predict(window)[0], dtype=np.float64)
        probs = probs / probs.sum()
        # An int first argument samples from range(len(probs)) without
        # materialising a vocabulary-sized list on every step; int() keeps the
        # token lists free of NumPy scalar types.
        predicted = int(np.random.choice(len(probs), p=probs))
        context_tokens = window[0].tolist()
        context_tokens.append(predicted)
        res_tokens.append(predicted)
        n_words -= 1
        must_stop -= 1
        last_three.append(predicted)
    return formatting(' '.join(detokenise(res_tokens)))

In [27]:
# Replace seed_text with your own.
# n_words is the minimum number of tokens to sample; implement() then keeps going
# until semantic_not_end() is satisfied, the "( end )" marker is produced, or its
# must_stop budget runs out. maxlen is the token window passed to the model -
# presumably it should match the sequence length the model was trained with
# (TODO confirm). Sampling uses np.random with no fixed seed, so every run
# produces different text.
res = implement(seed_text="I can't sleep as she said she liked me", n_words=500, maxlen=20)

In [28]:
# print() rather than a bare expression, so the newline characters in the
# generated text are rendered as line breaks instead of "\n" escapes.
print(res)

I can't sleep as she said she liked me 
So hard ain't no less 
So why, one more story 
 
I knew I said that I see me 
 
For two forty - seconds of the water 
The walls will stay home 
Lord was no mobile, and no 
 
Since the wheel can be a celebration 
Even let me see my hearts down. 
When you see, they said 
Never see me for yesterday 
A bore is gone and grow eyes 
Somebody could like you who wont hear my time 
No laugh alone now 
Nor I've found one in trouble 
You were in christmas nights 
This makes a friend 
They'll happy farther there 
That is the only soul cause I want (be never waiting, to let you touch your love) 
Time inside this night and the world is standing 
 
I'll be there, new, you'll let you be blind 
You better many of your life, who forever my love 
All your decision is meant to say 
But what can you fake if you worry 
You had some voice, no human love, baby 
God, every christmas's chemicals I need my back alive 
 
 (end)
