In [1]:
import pickle
import os

filename = 'data/queries_lexicon.pkl'


lexicon = pickle.load( open(filename, "rb"))


def get_lexicon_lookup(lexicon):
    lexicon_lookup = { idx: lexicon_item for lexicon_item, idx in lexicon.items()}
    lexicon_lookup[0] = "" #map 0 padding to empty string
    lexicon_lookup[793] = ""
    print("LEXICON LOOKUP SAMPLE:")
    print(list(lexicon_lookup.items())[500:510])
    return lexicon_lookup

lexicon_lookup = get_lexicon_lookup(lexicon)

LEXICON LOOKUP SAMPLE:
[(500, 'intercourse'), (501, 'lament'), (502, 'occupied'), (503, 'common'), (504, 'saturday'), (505, 'guarded'), (506, 'father'), (507, 'security'), (508, 'necessary'), (509, 'started')]


In [2]:
test_input = [['then', 'i', 'moved','with', 'fear', 'as', 'my', 'father']]

def tokens_to_ids(all_tokens, lexicon):
    ids = [[ lexicon[token] if token in lexicon else lexicon['<UNK>'] \
           for token in token_line] \
           for token_line in all_tokens]
    return ids

tester = tokens_to_ids(test_input, lexicon)
tester



[[1374, 1451, 3189, 1275, 231, 2412, 2375, 506]]

In [3]:
lexicon_lookup[1]

'<UNK>'

In [4]:
from keras.models import Model
from keras.layers import Input, Dense, TimeDistributed
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU

def create_model(seq_input_len, n_input_nodes, n_embedding_nodes, n_hidden_nodes, stateful=False, batch_size=None):

    input_layer = Input(batch_shape=(batch_size, seq_input_len), name='input_layer')
    
    embedding_layer = Embedding(input_dim=n_input_nodes,
                               output_dim=n_embedding_nodes,
                               mask_zero=True, name='embedding_layer')(input_layer)
    
    gru_layer1 = GRU(n_hidden_nodes,
                    return_sequences=True,
                    stateful=stateful,
                    name='hidden_layer1')(embedding_layer)
    
    gru_layer2 = GRU(n_hidden_nodes,
                    return_sequences=True,
                    stateful=stateful,
                    name='hidden_layer2')(gru_layer1)
    
    output_layer = TimeDistributed(Dense(n_input_nodes, activation="softmax"),
                                  name='output_layer')(gru_layer2)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    
    return model
    

Using Theano backend.


In [5]:
predictor_model = create_model(seq_input_len=1,
                              n_input_nodes=len(lexicon) + 1,
                              n_embedding_nodes=300,
                              n_hidden_nodes = 500,
                              stateful=True,
                              batch_size=1)

predictor_model.load_weights('corpse_weights5.h5')

In [6]:
import numpy 

def generate_ending(idx_seq):
    end_of_sent_tokens = [".", "!","/",";","?",":"]
    generated_ending = []
    
    if len(idx_seq) == 0:
        return [3]
    
    for word in idx_seq:
        p_next_word = predictor_model.predict(numpy.array(word)[None, None])[0,0]
        
    while not generated_ending or lexicon_lookup[next_word] not in end_of_sent_tokens:
        next_word = numpy.random.choice(a=p_next_word.shape[-1], p=p_next_word)

        if next_word != 1:
            generated_ending.append(next_word)
            p_next_word = predictor_model.predict(numpy.array(next_word)[None, None])[0,0]
        
    predictor_model.reset_states()
    return generated_ending


test_input = [['the','creature','said','i','am','afraid','i','do','not','know']]

tester = tokens_to_ids(test_input, lexicon)

generated_ending = generate_ending(tester[0])
if not generated_ending[0] == 2:
    generated_ending = " ".join([lexicon_lookup[word] if word in lexicon_lookup else "" \
                                 for word in generated_ending])
    print(test_input)
    print("\n-----------------")
    print(generated_ending,"\n")



[['the', 'creature', 'said', 'i', 'am', 'afraid', 'i', 'do', 'not', 'know']]

-----------------
how to say , the well of the side of what it could not be well , had great conduct before with an way so produced into all one exertion at netherfield , and bringing away the rest for him . 



In [9]:
import spacy
encoder = spacy.load('en')
#current_app.encoder = spacy.load('en')

def text_to_tokens(lines):
    tokens = [ [word.lower_ for word in encoder(line)] for line in lines]
    #tokens = [ [word.lower_ for word in current_app.encoder(line)] for line in lines]
    return tokens

def lineSubmit(test_line):
    input_line = tokens_to_ids( text_to_tokens(test_line), lexicon)
    #input_line = tokens_to_ids( text_to_tokens(test_line), current_app.lexicon)

    generated_ending = generate_ending(input_line[0])
    if not generated_ending[0] == 2:
        #generated_ending = " ".join([current_app.lexicon_lookup[word] if word in current_app.lexicon_lookup else "" for word in generated_ending])
        generated_ending = " ".join([lexicon_lookup[word] if word in lexicon_lookup else "" for word in generated_ending])
    
    return generated_ending
    #socketio.emit('line_append', { 'new_line':generated_ending}, namespace='/eq')


In [10]:
print(lineSubmit("I looked out the window and saw my sister. I said"))

was scarcely hid when a young girl came running towards the spot where i was concealed , laughing , as if she ran from someone in sport .


In [11]:
import pandas
pandas.set_option('display.max_colwidth', 300)

train_lines = pandas.read_csv('data/queries.csv', encoding='utf-8')[200:240]
train_lines[0:1]

Unnamed: 0,line
200,"The bride and her mother could neither of them talk fast enough; and Wickham, who happened to sit near Elizabeth, began inquiring after his acquaintance in that neighbourhood, with a good humoured ease which she felt very unable to equal in her replies."


In [25]:
import re

with open('data/frankenstein.txt') as f:
    text = f.read()    
    frank_sentences = re.split(r' *[\.\!][\'"\)\]]* *', text)
    
    for i in range(len(frank_sentences)):
        frank_sentences[i] = frank_sentences[i].replace("\n", " ") + "."
        
print("total frankenstein sentences: ", len(frank_sentences))

total frankenstein sentences:  3339


In [35]:
frank_sample = frank_sentences[2700:3000]
len(frank_sample)

300

In [37]:
import random

start = random.randint(0,300)
print(frank_sample[start])


But to a Genevan magistrate, whose mind was occupied by far other ideas than those of devotion and heroism, this elevation of mind had much the appearance of madness.


In [41]:
import re
import random

with open('data/frank_starters.txt') as f:
    text = f.read()
    frank_sentences = re.split(r' *[\.\!][\'"\)\]]* *', text)
    
    for i in range(len(frank_sentences)):
        frank_sentences[i] = frank_sentences[i].replace("\n", " ") + "."
        


In [42]:
len(frank_sentences)

91

In [67]:


wordCount = 0

while wordCount < 12:
    starter = random.randint(0, len(frank_sentences))
    rawfirst = frank_sentences[starter]
    splitted = rawfirst.split()
    wordCount = len(splitted)

print(rawfirst)


stopWord = splitted[12]

stopIndex = rawfirst.find(stopWord)

print(stopWord, ", ", stopIndex)                      
                          
if stopIndex > 3:
    first = rawfirst[:stopIndex]
    print("first: ", first)
else:
    print("rawfirst :", rawfirst)


Everybody believed that poor girl to be guilty; and if she could have committed the crime for which she suffered, assuredly she would have been the most depraved of human creatures.
have ,  65
first:  Everybody believed that poor girl to be guilty; and if she could 


In [73]:
print(first)
print(lineSubmit(first))

Everybody believed that poor girl to be guilty; and if she could 
is not to be when they know how    collins is many enough to be without turn hi   my plan is more enough to find there to say miss de bourgh , in a manner which have the more man at pemberley .


In [74]:
len(frank_sentences)

91