In [10]:
# Smoke test for the loaded word2vec model: the classic analogy
# "king" - "man" + "woman" should rank "queen" at the top.
model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.549946129322052),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [None]:
# Collect every distinct character that appears in the vocabulary of Google's
# word2vec model.  We will not support most of these characters, as they are
# extremely unlikely to occur on Wikipedia pages.
#
# The characters are the dict keys; the value 1 is only a placeholder.
char_dict = {char: 1 for key in model.wv.vocab for char in key}
        


## List of rules for words

1) We will remove quotation marks and commas.
2) We will support hyphens (`-`), underscores (`_`), and apostrophes (`'`).

In [1]:
# Useful library for embeddings
import gensim
from string import ascii_lowercase, ascii_uppercase

import numpy as np
from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence 

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dropout, Embedding, Dense, LSTM, Bidirectional

# Load Google's pre-trained word2vec vectors from the binary file kept in the
# local ./word2vec_model directory.
# NOTE: DON'T COMMIT THAT FILE TO GITHUB!!!
model = gensim.models.KeyedVectors.load_word2vec_format(
    './word2vec_model/GoogleNews-vectors-negative300.bin',
    binary=True,
)

  from ._conv import register_converters as _register_converters


In [2]:
# Map every supported character to an integer index.
# Note: underscores, hyphens, and apostrophes are included, but all other
# characters found in the word2vec model (Chinese symbols, emojis, etc.) are
# ignored.
#
# Index layout:
#   'a'-'z' -> 2-27
#   'A'-'Z' -> 28-53
#   '_' -> 54, '-' -> 55, "'" -> 56
# Indices 0 and 1 are not assigned to any character.
lower_case_letter_dict = dict(zip(ascii_lowercase, range(2, 28)))
upper_case_letter_dict = dict(zip(ascii_uppercase, range(28, 54)))

chardict = dict(lower_case_letter_dict)
chardict.update(upper_case_letter_dict)
chardict.update({'_': 54, '-': 55, '\'': 56})

# Inverse lookup: index -> character
reverse_chardict = {index: char for char, index in chardict.items()}

def include_word(word, chardict):
    """
    Report whether a word consists solely of supported characters.

    Args:
        word: The candidate word (an iterable of characters).
        chardict: Mapping whose keys are the supported characters.

    Returns:
        True when every character of ``word`` is a key of ``chardict``
        (so an empty word also qualifies), False otherwise.
    """
    for char in word:
        # A single unsupported character disqualifies the whole word.
        if char not in chardict:
            return False

    return True


# Build the word -> embedding mapping used for training/testing our model.
# A word2vec vocabulary entry is kept only when include_word() accepts it,
# i.e. when every one of its characters is present in chardict.
all_words = {
    key: model.wv[key]
    for key in model.wv.vocab
    if include_word(key, chardict)
}

# Split the word -> embedding mapping into two parallel lists (X and Y).
# Both are read from the same dict, so position i of `words` lines up with
# position i of `embeddings`.
words = list(all_words.keys())
embeddings = list(all_words.values())

# Encode every word as the list of its character indices.
words_index = [[chardict[char] for char in word] for word in words]

# Words differ in length, so the encoded sequences are ragged.  Ask for an
# object array explicitly: recent NumPy releases refuse to build an array from
# ragged nested lists unless dtype=object is given (older releases silently
# produced the same object array).
words_index = np.array(words_index, dtype=object)
embeddings = np.array(embeddings)

# Hold out 10% of the words (with their embeddings) as the test set; the fixed
# random_state keeps the split the same from run to run.
(train_words, test_words,
 train_embeddings, test_embeddings) = train_test_split(
    words_index,
    embeddings,
    test_size=0.1,
    random_state=1,
)



In [3]:
def batch_generator():
    """Placeholder for the batch generator; not implemented yet, returns None."""
    return None
# TODO: implement batch_generator along the lines of the reference code below
# (taken from the class utils library).




# def rnnlm_batch_generator(ids, batch_size, max_time):
#     """Convert ids to data-matrix form for RNN language modeling."""
#     # Clip to a multiple of batch_size for convenience
#     clip_len = ((len(ids)-1) // batch_size) * batch_size
#     input_w = ids[:clip_len]     # current word
#     target_y = ids[1:clip_len+1]  # next word
#     # Reshape so we can select columns
#     input_w = input_w.reshape([batch_size,-1])
#     target_y = target_y.reshape([batch_size,-1])

#     # Yield batches
#     for i in range(0, input_w.shape[1], max_time):
#         yield input_w[:,i:i+max_time], target_y[:,i:i+max_time]

        
        
# for i, (w,y) in enumerate(rnnlm_batch_generator([3,4,3],2,1)):
#     print(i)

In [4]:
# Bring every index sequence to a fixed length of 20: sequences shorter than
# that are padded with zeroes.
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=20)
    for split in (train_words, test_words)
)

X_train[0]

Using TensorFlow backend.


array([ 0,  0,  0,  0,  0,  0, 42, 13, 26, 14, 17, 10, 18, 22,  6, 54, 39,
       26, 16, 15], dtype=int32)

In [8]:
# Carve a small subset out of the training split for a quick experiment:
#   rows 0-9,999       -> `test` / `emb_test` (despite the name, this is the
#                         data the model is fit on)
#   rows 10,000-13,999 -> `val_x` / `val_y` (validation data)
test, emb_test = X_train[:10000], train_embeddings[:10000]
val_x, val_y = X_train[10000:14000], train_embeddings[10000:14000]


In [13]:
# Character-level model: maps a padded sequence of character indices to a
# 300-d vector that is trained (MSE loss) to match the word's embedding.
lstm = Sequential()

# Embedding layer: takes a batch of integer character indices and looks up a
# 300-d vector for each.  chardict's largest index is 56, hence 57 rows.
# The input sequences are zero-padded, so mask_zero=True marks index 0 as
# padding and lets the recurrent layer skip those positions instead of
# treating them as a real character.
lstm.add(Embedding(57, 300, mask_zero=True))

# NOTE: it is still unverified whether this stack does what we want; we might
# have to deviate from Keras and define things manually, or define a custom
# loss function.
# return_sequences=False: only the final output of each direction is kept.
lstm.add(Bidirectional(LSTM(300, return_sequences=False)))

# Linear projection to the 300-d target embedding space.
lstm.add(Dense(300, activation='linear'))
lstm.compile(loss="mse", optimizer="rmsprop")
lstm.fit(test, emb_test, validation_data=(val_x, val_y))

Train on 10000 samples, validate on 4000 samples


<tensorflow.python.keras.callbacks.History at 0x7f3b85984748>

In [121]:
# The way to do it in tensorflow for when we go to actually train this thing


# mapping_strings = tf.constant([k for k,v in combined.items()])
# table = tf.contrib.lookup.index_table_from_tensor(
#     mapping=mapping_strings, num_oov_buckets=1, default_value=-1)
# tf.tables_initializer().run()

# features = tf.constant(['a','b','c','_'])
# ids = table.lookup(features)
    

In [6]:
# NOTE: this cell is a fragment of a model class' graph-building method: every
# tensor is stored on `self`, so it only runs once it is placed inside such a
# method.

# Placeholders must be created before anything reads their shape.
with tf.name_scope("training_inputs"):
    # input_w will be batch_size by max_time, where max_time is max letters we allow
    self.input_w_ = tf.placeholder(tf.int32, [None, None], name="w")
    # target_y will be batch_size by embedding size; the targets are
    # real-valued embedding vectors, hence float32 rather than int32
    self.target_y_ = tf.placeholder(tf.float32, [None, None], name="y")

# Get dynamic shape info from inputs
with tf.name_scope("batch_size"):
    self.batch_size_ = tf.shape(self.input_w_)[0]
with tf.name_scope("max_time"):
    self.max_time_ = tf.shape(self.input_w_)[1]

# Construct embedding layer
with tf.name_scope("embedding_layer"):
    # Assume a 300d vector for each character; will replace with dynamic variables.
    # 57 rows: character indices in chardict run up to 56, and 0 is the
    # padding value.
    self.W_in_ = tf.get_variable("W_in_", [57, 300], dtype=tf.float32,
        initializer=tf.random_uniform_initializer(-1, 1), trainable=True)

    self.xs_ = tf.nn.embedding_lookup(self.W_in_, self.input_w_)

with tf.name_scope("model_creation"):
    # Deliberately NOT named `model`: that name holds the loaded word2vec
    # vectors, which other cells of this notebook still use.
    lstm_model = Sequential()
    lstm_model.add(Bidirectional(LSTM(300)))


#     model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5,
#     10)))
#     model.add(Bidirectional(LSTM(10)))
#     model.add(Dense(5))
#     model.add(Activation('softmax'))
#     model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

## Key Decisions to prepare training data

1) How do we handle capitalization, and combined words like "crown_prince" in the above example?
2) How many words do we want to include?  Top x, random x, etc?

## Key Actions to prepare training data

1) Write function to create training vectors which are series of character indices for features, 
   and embedding for target


## Start of LSTM Code

# Possible useful references

https://www.tensorflow.org/guide/keras
https://blog.keras.io/keras-as-a-simplified-interface-to-tensorflow-tutorial.html
https://keras.io/getting-started/sequential-model-guide/
