# Embeddings

## Loading a pre-trained word embedding

In practice, we can use a pre-trained word embedding, from a much larger corpus, trained for a much longer period. 

Source of this word embedding (created from a 6-billion-token corpus, with results as 50d vectors): http://nlp.stanford.edu/projects/glove/

NB: If you don't have the required data, and the RedCatLabs server doesn't give you the download, the loader below downloads an 823Mb file via a fairly slow connection to a server at Stanford (this can take HOURS).

In [12]:
import glove
import numpy as np

%precision 3

'%.3f'

In [13]:
import os, requests, shutil
import glove

# Make sure a reduced GloVe file (the most frequent words, 50d vectors) exists locally.
# Strategy: (1) try the small pre-prepared file from RedCatLabs; (2) otherwise fall back
# to the full Stanford zip, unpack the 50d text file and keep only its leading lines.
glove_dir = './data/RNN/'
glove_100k_50d = 'glove.first-100k.6B.50d.txt'
glove_100k_50d_path = os.path.join(glove_dir, glove_100k_50d)

# These are temporary files if we need to download it from the original source (slow)
data_cache = './data/cache'
glove_full_tar = 'glove.6B.zip'
glove_full_50d = 'glove.6B.50d.txt'

# Number of leading lines (one word per line) copied into the reduced file
glove_words_kept = 100000

download_url = 'http://redcatlabs.com/downloads/deep-learning-workshop/notebooks/data/RNN/'+glove_100k_50d
original_url = 'http://nlp.stanford.edu/data/'+glove_full_tar

if not os.path.isfile(glove_100k_50d_path):
    if not os.path.exists(glove_dir):
        os.makedirs(glove_dir)

    # First, try to download a pre-prepared file directly...
    # A connection failure is treated the same as a non-OK status: use the fallback.
    try:
        response = requests.get(download_url, stream=True)
        prepared_file_available = (response.status_code == requests.codes.ok)
    except requests.exceptions.RequestException:
        prepared_file_available = False

    if prepared_file_available:
        print("Downloading 42Mb pre-prepared GloVE file from RedCatLabs")
        # .raw skips requests' content decoding; ask urllib3 to undo any gzip/deflate
        # transfer encoding so the file on disk is plain text.
        response.raw.decode_content = True
        with open(glove_100k_50d_path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    else:
        # But, for some reason, RedCatLabs didn't give us the file directly
        if not os.path.exists(data_cache):
            os.makedirs(data_cache)

        glove_full_50d_path = os.path.join(data_cache, glove_full_50d)
        if not os.path.isfile(glove_full_50d_path):
            zipfilepath = os.path.join(data_cache, glove_full_tar)
            if not os.path.isfile(zipfilepath):
                print("Downloading 860Mb GloVE file from Stanford")
                # Fetch from the ORIGINAL source here (not the RedCatLabs URL that just failed)
                response = requests.get(original_url, stream=True)
                # Fail loudly rather than saving an HTTP error page as the zip file
                response.raise_for_status()
                with open(zipfilepath, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
            if os.path.isfile(zipfilepath):
                print("Unpacking 50d GloVE file from zip")
                import zipfile
                with zipfile.ZipFile(zipfilepath, 'r') as zip_archive:
                    zip_archive.extract(glove_full_50d, data_cache)

        with open(glove_full_50d_path, 'rt') as in_file:
            with open(glove_100k_50d_path, 'wt') as out_file:
                print("Reducing 50d GloVE file to first 100k words")
                # Iterate lazily: the full file is large and only its head is needed
                for i, l in enumerate(in_file):
                    if i >= glove_words_kept: break
                    out_file.write(l)

        # The zip and the full 50d text file are deliberately left in data_cache;
        # delete them by hand once the reduced file has been produced if space matters.

print("GloVE available locally")

GloVE available locally


In [14]:
# Due to size constraints, only use the first 100k vectors (i.e. 100k most frequently used words)
# Load the reduced GloVe text file located at glove_100k_50d_path into a glove.Glove object;
# later cells read its .dictionary, .inverse_dictionary and .word_vectors attributes.
word_embedding = glove.Glove.load_stanford( glove_100k_50d_path )
# Last expression of the cell: shows the shape of the vector matrix as the cell output
word_embedding.word_vectors.shape

(100000, 50)

Having loaded that, play around with the similarity and analogy tests:

In [15]:
# word-analogy test
def get_embedding_vec(word):
    """Return the embedding vector for `word`, or a zero vector if it is unknown.

    The word is lower-cased before being looked up in `word_embedding.dictionary`.

    Parameters
    ----------
    word : str
        The word to look up.

    Returns
    -------
    numpy.ndarray
        The matching row of `word_embedding.word_vectors` for a known word;
        for an unknown word, a float32 all-zero vector with the same length
        as one row of that matrix (acts as the UNK vector).
    """
    idx = word_embedding.dictionary.get(word.lower(), -1)
    if idx < 0:
        # UNK: take the dimension from the loaded matrix instead of relying on a
        # separately defined EMBEDDING_DIM constant being present in the kernel.
        embedding_dim = word_embedding.word_vectors.shape[1]
        return np.zeros((embedding_dim,), dtype='float32')
    return word_embedding.word_vectors[idx]

def get_closest_word(vec, number=5):
    """Rank vocabulary words by cosine similarity to `vec`.

    Parameters
    ----------
    vec : numpy.ndarray
        Query vector, same length as one row of `word_embedding.word_vectors`.
    number : int
        How many of the top-ranked row indices to consider.

    Returns
    -------
    list of (word, similarity)
        Best match first. Indices that have no entry in
        `word_embedding.inverse_dictionary` are skipped, so the list may be
        shorter than `number`.
    """
    vectors = word_embedding.word_vectors
    id_to_word = word_embedding.inverse_dictionary

    # Cosine similarity: dot product scaled by each row norm, then by the query norm
    row_norms = np.linalg.norm(vectors, axis=1)
    similarity = np.dot(vectors, vec) / row_norms / np.linalg.norm(vec)

    # Negate so that argsort's ascending order puts the most similar rows first
    ranked_ids = np.argsort(-similarity)

    closest = []
    for word_id in ranked_ids[:number]:
        if word_id in id_to_word:
            closest.append((id_to_word[word_id], similarity[word_id]))
    return closest

In [16]:
def test_analogy(s='one two three four'):
    (a,b,c,d) = s.split(' ')
    analogy_vec = get_embedding_vec(b) - get_embedding_vec(a) + get_embedding_vec(c)
    words = [ w for (w,p) in get_closest_word(analogy_vec) if w not in (a,b,c)]
    print("'%s' is to '%s' as '%s' is to {%s}" % (a,b,c,', '.join(words)))

In [17]:
# Sanity check: for a handful of probe words, print the neighbours reported by the
# embedding object's own most_similar() method.
test_words = ['one', 'what', 'king', 'winter', 'chicken']
for word in test_words:
    # One line of output per probe word: a list of (neighbour, similarity score) pairs
    print(word_embedding.most_similar(word))

[('another', 0.95155760155240554), ('only', 0.93775685546916843), ('same', 0.93508186215121814), ('.', 0.91384127912218194)]
[('why', 0.9699442918090454), ('how', 0.96343385789841574), ('nothing', 0.95298521021645888), ('something', 0.95179124988244079)]
[('prince', 0.8236179693335699), ('queen', 0.78390430109641174), ('ii', 0.77462300306351073), ('emperor', 0.77362476248729251)]
[('summer', 0.91998354386035297), ('spring', 0.86244882945357526), ('autumn', 0.84603370064260131), ('rainy', 0.77108566972678583)]
[('meat', 0.90898847625037249), ('fried', 0.8994253485084005), ('pork', 0.89203300290774223), ('soup', 0.86930858078715156)]


In [18]:
# Each string is 'a b c d': test_analogy prints candidates for "a is to b as c is to ?".
# The fourth word does not enter the computation — presumably it is the answer we
# hope to find among the printed candidates.
test_analogy('man woman king queen')
test_analogy('paris france rome italy')
test_analogy('kitten cat puppy dog')
test_analogy('understand understood run ran')

'man' is to 'woman' as 'king' is to {queen, daughter, prince, throne}
'paris' is to 'france' as 'rome' is to {italy, spain, portugal}
'kitten' is to 'cat' as 'puppy' is to {dog, rabbit, horse}
'understand' is to 'understood' as 'run' is to {ran, running, runs, twice}


In [19]:
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

# Export the loaded word vectors, plus a one-word-per-line metadata file, into LOG_DIR
# so that TensorBoard's embedding projector can display them.
embedding_var = tf.Variable(word_embedding.word_vectors, dtype='float32', name='word_embedding')

projector_config = projector.ProjectorConfig()

# You can add multiple embeddings. Here we add only one.
embedding = projector_config.embeddings.add()
embedding.tensor_name = embedding_var.name

# Link this tensor to its metadata file (e.g. labels).
LOG_DIR='./tensorflow.logdir/'
# The metadata file is written before any TensorFlow writer touches LOG_DIR,
# so the directory has to be created explicitly on a fresh checkout.
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)

metadata_file = 'glove_full_50d.words.tsv'
# Words in row order (index i -> word), so line i of the file labels row i of the matrix
vocab_list = [ word_embedding.inverse_dictionary[i]
               for i in range(len( word_embedding.inverse_dictionary )) ]
with open(os.path.join(LOG_DIR, metadata_file), 'wt') as metadata:
    metadata.writelines("%s\n" % w for w in vocab_list)

embedding.metadata_path = os.path.join(os.getcwd(), LOG_DIR, metadata_file)

# Use the same LOG_DIR where you stored your checkpoint.
summary_writer = tf.summary.FileWriter(LOG_DIR)

# The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will read this file during startup.
projector.visualize_embeddings(summary_writer, projector_config)

# Only the embedding variable needs to be stored in the checkpoint
saver = tf.train.Saver([embedding_var])

with tf.Session() as sess:
    # Initialize the model (this is what copies the word vectors into the variable)
    sess.run(tf.global_variables_initializer())

    saver.save(sess, os.path.join(LOG_DIR, metadata_file+'.ckpt'))

In [None]:
# Go to your terminal and run the following command: `tensorboard --logdir=./tensorflow.logdir`

TensorBoard 0.1.6 at http://sg-15.local:6006 (Press CTRL+C to quit)
