In [10]:
# Quick sanity check that our model makes sense and is loaded correctly
model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.549946129322052),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [None]:
# See list of all characters google includes in their word2vec model.  We 
# will not support most of these characters as they are extremely unlikely to occur
# on wikipedia pages

char_dict = dict({})
for idx, key in enumerate(model.wv.vocab):
    for char in key:
        char_dict[char] = 1
        


## List of rules for words

1) We will remove quotations and commas
2) We will support -,_,',

In [1]:
# Useful library for embeddings
import gensim
from string import ascii_lowercase, ascii_uppercase

import numpy as np
from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence 

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dropout, Embedding, Dense, LSTM, Bidirectional

# Load Google's pre-trained Word2Vec model from our thing
# Note: DONT COMMIT THAT FILE TO GITHUB!!!
model = gensim.models.KeyedVectors.load_word2vec_format(
    './word2vec_model/GoogleNews-vectors-negative300.bin', binary=True)  

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Create a dictionary with upper and lower case letters and associated index
# Note: We include underscores, hyphens, and apostrophes but ignore other characters
# found in word2vec model, including chinese symbols, emojis, etc
lower_case_letter_dict = {letter: int(index)+1 for index, letter in enumerate(ascii_lowercase, start=1)} 
upper_case_letter_dict = {letter: int(index)+27 for index, letter in enumerate(ascii_uppercase, start=1)} 
chardict = {**lower_case_letter_dict, **upper_case_letter_dict}
chardict['_']=54
chardict['-']=55
chardict['\'']=56
chardict['.']=57

reverse_chardict = {}
for k,v in chardict.items():
    reverse_chardict[v] = k

def include_word(word, chardict):
    """
    Function to determine if word can be included and perform any parsing
    """
    if all(char in chardict.keys() for char in word):
        return True
    
    return False


# Create list of words which will be used in training/testing our model
all_words = dict({})

# For every word in word2vec model establish if it is "allowed"; if it is
# add the word to our all_words dict, with the embedding as the value
for idx, key in enumerate(model.wv.vocab):
    if include_word(key, chardict):
        all_words[key] = model.wv[key]
    else:
        pass

# Create two lists corresponding to X & Y 
words = []
embeddings = []

for k,v in all_words.items():
    words.append(k)
    embeddings.append(v)

# Convert characters to index references and all lists to numpy arrays
words_index = [[chardict[char] for char in word] for word in words]
words_index = np.array(words_index)
embeddings = np.array(embeddings)

# Establish train/test splits 
train_words, test_words, train_embeddings, test_embeddings = train_test_split(words_index, embeddings, 
                                                                              test_size=0.1, random_state=1)



In [3]:
def batch_generator():
    pass
# code from class utils library below; need to do something similar




# def rnnlm_batch_generator(ids, batch_size, max_time):
#     """Convert ids to data-matrix form for RNN language modeling."""
#     # Clip to multiple of max_time for convenience
#     clip_len = ((len(ids)-1) // batch_size) * batch_size
#     input_w = ids[:clip_len]     # current word
#     target_y = ids[1:clip_len+1]  # next word
#     # Reshape so we can select columns
#     input_w = input_w.reshape([batch_size,-1])
#     target_y = target_y.reshape([batch_size,-1])

#     # Yield batches
#     for i in range(0, input_w.shape[1], max_time):
#         yield input_w[:,i:i+max_time], target_y[:,i:i+max_time]

        
        
# for i, (w,y) in enumerate(rnnlm_batch_generator([3,4,3],2,1)):
#     print(i)

In [25]:
train_embeddings.shape

(2572551, 300)

In [4]:
# Pad each sequence with zeroes if it is less than 20.
X_train = sequence.pad_sequences(train_words, maxlen=20) 
X_test = sequence.pad_sequences(test_words, maxlen=20) 

X_train[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 36,
       45, 29, 20], dtype=int32)

In [5]:
test = X_train[:100000]
emb_test = train_embeddings[:100000]

val_x = X_train[100000:140000]
val_y = train_embeddings[100000:140000]


In [42]:
# Notes: This is if we do it straight tensorflow style.  I think I just have to make the loss
# at the end correct, rather than just testing a subtraction

from tensorflow.contrib import rnn
tf.reset_default_graph()
# Define lstm cells with tensorflow
# Forward direction cell
with tf.name_scope("embedding_layer"):
    # Assume a 300d vector for each character; will replace with dynamic variables
    W_in_ = tf.get_variable("W_in_", [58,300], dtype=tf.float32, 
            initializer=tf.random_uniform_initializer(-1,1), trainable=True)

    xs_ = tf.reshape(tf.nn.embedding_lookup(W_in_, test[0]),(1,20,300))
    x = tf.unstack(xs_, 20, 1)

W_out_ = tf.get_variable("W_out_", [600,300], dtype=tf.float32, 
            initializer=tf.random_uniform_initializer(-1,1), trainable=True)
b_out_ = tf.get_variable("b_out_", (1,300), dtype=tf.float32, 
            initializer=tf.random_uniform_initializer(-1,1), trainable=True)

lstm_fw_cell = rnn.BasicLSTMCell(300, forget_bias=1.0)
# Backward direction cell
lstm_bw_cell = rnn.BasicLSTMCell(300, forget_bias=1.0)

# Get lstm cell output
outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x,
                                          dtype=tf.float32)

final_out = tf.add(tf.matmul(outputs[-1], W_out_), b_out_)

loss = final_out-emb_test[0]

sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(loss)

array([[-2.2561681 ,  0.0440715 , -0.01946591,  0.5048307 , -1.9254456 ,
        -0.40805894, -0.01643568, -1.8172401 ,  0.7498343 , -0.3580538 ,
        -0.8683603 , -1.385504  , -2.882206  , -0.38658762,  3.2857318 ,
         0.3188932 , -1.8016087 ,  2.9225051 , -0.4670478 , -0.78816897,
         0.05356523, -0.68285334, -3.4449804 , -0.25026512, -2.0927777 ,
         0.32961357, -0.9435488 ,  2.9287925 , -1.8356545 , -0.8652038 ,
         0.26002634, -0.601628  ,  0.8806363 , -0.32313663,  1.133319  ,
         1.2267165 , -1.112298  ,  0.45222613, -3.3475816 , -1.4514046 ,
         2.2781868 ,  1.2351414 , -1.0076942 , -2.2708905 , -0.18919292,
         1.1437875 , -0.06780577, -0.75346076, -3.1618314 ,  0.7291281 ,
         0.55477077,  2.7325916 ,  2.4650793 , -0.08224475,  1.7880992 ,
        -1.206544  ,  2.094314  , -0.16124809, -0.61718667, -1.6677612 ,
         0.45440984, -0.70312715,  1.4425989 ,  0.7693058 , -0.12254685,
         0.14916909, -0.01980835,  2.7577844 , -0.2

## Stuff I've learned so far

    1) Evertyhing keras must be imported as tensorflow.keras otherwise you're using different versions
    2) Keras is really sweet but I have no idea if its working!

In [38]:
train_embeddings[0]

array([-0.08886719,  0.15625   ,  0.19726562,  0.33984375, -0.09130859,
       -0.2578125 ,  0.11523438, -0.02355957,  0.09863281, -0.01385498,
        0.35351562, -0.34375   , -0.1953125 , -0.02697754, -0.4921875 ,
        0.20605469, -0.06689453,  0.09228516,  0.04223633,  0.25195312,
        0.13574219,  0.03979492, -0.1875    , -0.0234375 ,  0.19726562,
       -0.23925781, -0.19335938,  0.12988281, -0.18847656, -0.16015625,
        0.18652344, -0.11328125, -0.02526855, -0.10351562,  0.02905273,
       -0.125     ,  0.02087402,  0.26757812,  0.22558594, -0.21582031,
        0.00442505,  0.265625  , -0.11279297, -0.11914062, -0.18554688,
       -0.3671875 ,  0.31835938, -0.20898438, -0.15234375, -0.01879883,
        0.06445312, -0.09472656,  0.07324219, -0.3046875 , -0.09814453,
       -0.09765625,  0.2734375 ,  0.12695312, -0.02490234, -0.25390625,
       -0.453125  , -0.08251953, -0.00300598, -0.44140625, -0.12011719,
       -0.1171875 , -0.359375  ,  0.22265625, -0.16601562,  0.05

In [27]:
from tensorflow.keras import optimizers
from tensorflow.keras import initializers

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)


In [47]:
# Notes: This is Keras way of doing thinsg which if we can figure it out 
# is way easier, and also way more likely to help us in professional setting
# This all runs, but I'm not 100% sure its doing what we want as the loss
# really isn't going anywhere.


lstm = Sequential()
lstm.add(Embedding(58, 50, input_length=20, embeddings_initializer=initializers.RandomUniform(-.2,.2)))
#lstm.add(Bidirectional(LSTM(50, return_sequences=True)))
lstm.add(Bidirectional(LSTM(50, return_sequences=False)))
lstm.add(Dense(300, activation='tanh'))
#lstm.add(Dense(300,activation='linear'))
lstm.compile(loss="mean_squared_error", optimizer=adam)
lstm.summary()
lstm.fit(test,emb_test, batch_size=1000, validation_data=(val_x, val_y), epochs=2)


# Loss after 10k .0167
# Loss after 25k .0163
# Loss after 50k .0161
# Loss after 75k .0160
# Loss after 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 20, 50)            2900      
_________________________________________________________________
bidirectional_21 (Bidirectio (None, 100)               40400     
_________________________________________________________________
dense_24 (Dense)             (None, 300)               30300     
Total params: 73,600
Trainable params: 73,600
Non-trainable params: 0
_________________________________________________________________
Train on 100000 samples, validate on 40000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fcf74da5a90>

In [41]:
a = lstm.predict(tf.reshape(X_train[1:3],(2,20)),steps=2)

In [44]:
print(a[0][25])
print(train_embeddings[1][25])

-0.037851535
-0.23046875


In [121]:
# The way to do it in tensorflow for when we go to actually train this thing


# mapping_strings = tf.constant([k for k,v in combined.items()])
# table = tf.contrib.lookup.index_table_from_tensor(
#     mapping=mapping_strings, num_oov_buckets=1, default_value=-1)
# tf.tables_initializer().run()

# features = tf.constant(['a','b','c','_'])
# ids = table.lookup(features)
    

In [6]:
# Get dynamic shape info from inputs
with tf.name_scope("batch_size"):
    self.batch_size_ = tf.shape(self.input_w_)[0]
with tf.name_scope("max_time"):
    self.max_time_ = tf.shape(self.input_w_)[1]

with tf.name_scope("training_inputs")
    # input_w will be batch_size by max_time, where max_time is max letters we allow
    self.input_w_ = tf.placeholder(tf.int32, [None, None], name="w")
    # Target_y will be batch_size by embedding size
    self.target_y_ = tf.placeholder(tf.int32, [None, None], name="y")

# Construct embedding layer
with tf.name_scope("embedding_layer"):
    # Assume a 300d vector for each character; will replace with dynamic variables
    self.W_in_ = tf.get_variable("W_in_", [57,300], dtype=tf.float32, 
        initializer=tf.random_uniform_initializer(-1,1), trainable=True)

    self.xs_ = tf.nn.embedding_lookup(self.W_in_, self.input_w_)

with tf.name_scope("model_creation"):
    model = Sequential()
    model.add(Bidirectional(LSMT(300)))


#     model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5,
#     10)))
#     model.add(Bidirectional(LSTM(10)))
#     model.add(Dense(5))
#     model.add(Activation('softmax'))
#     model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

## Key Decisions to prepare training data

1) How do we handle capitalization, and comined words like "crown_prince" in above example?
2) How many words do we want to include?  Top x, random x, etc?

## Key Actions to prepare training data

1) Write function to create training vectors which are series of character indices for features, 
   and embedding for target


## Start of LSTM Code

# Possible useful references

https://www.tensorflow.org/guide/keras
https://blog.keras.io/keras-as-a-simplified-interface-to-tensorflow-tutorial.html
https://keras.io/getting-started/sequential-model-guide/
