* Word-level embeddings with three different pre-trained models (GloVe, word2vec, fastText):

In [18]:
import datetime
import numpy as np
import os

import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

# Print the installed gensim version.
print('gensim Version: %s' % (gensim.__version__))

class WordEmbedding:
    """Load pre-trained word embeddings and expose their words and vectors.

    Every method takes a ``source`` name, one of ``'glove'``, ``'word2vec'``
    or ``'fasttext'``. Loaded models are stored in ``self.model`` keyed by
    that name.
    """

    __author__ = "Edward Ma"
    __copyright__ = "Copyright 2018, Edward Ma"
    __credits__ = ["Edward Ma"]
    __license__ = "Apache"
    __version__ = "2.0"
    __maintainer__ = "Edward Ma"
    __email__ = "makcedward@gmail.com"

    # Names accepted for the ``source`` argument, and the shared error text.
    _SOURCES = ('glove', 'word2vec', 'fasttext')
    _SOURCE_ERROR = 'Possible value of source are glove, word2vec, fasttext'

    def __init__(self, verbose=0):
        """Create an empty container; no model is loaded yet.

        Args:
            verbose: stored on the instance. No method of this class reads it.
        """
        self.verbose = verbose

        # Maps source name -> loaded gensim model.
        self.model = {}

    def _check_source(self, source):
        """Raise ValueError unless ``source`` is a supported source name."""
        if source not in self._SOURCES:
            raise ValueError(self._SOURCE_ERROR)

    def _get_keyed_vectors(self, source):
        """Return the object that carries ``vocab`` for a loaded source.

        The fasttext model exposes its vocabulary through ``.wv``; the glove
        and word2vec models expose it directly.
        """
        model = self.get_model(source=source)
        return model.wv if source == 'fasttext' else model

    def convert(self, source, input_file_path, output_file_path):
        """Convert a downloaded embedding file so that ``load`` can read it.

        Only ``'glove'`` needs converting (via ``glove2word2vec``); for
        ``'word2vec'`` and ``'fasttext'`` this is a no-op.

        Args:
            source: one of 'glove', 'word2vec', 'fasttext'.
            input_file_path: path of the downloaded file.
            output_file_path: path of the converted file to write.

        Raises:
            ValueError: if ``source`` is not a supported name.
        """
        self._check_source(source)

        if source == 'glove':
            # NOTE(review): datapath/get_tmpfile come from gensim.test.utils;
            # how they resolve relative paths is not visible here - the
            # caller in this file passes absolute paths. Confirm before
            # passing relative ones.
            input_file = datapath(input_file_path)
            output_file = get_tmpfile(output_file_path)
            glove2word2vec(input_file, output_file)

    def load(self, source, file_path):
        """Load the model for ``source`` from ``file_path`` into ``self.model``.

        Prints a timestamped start and end message around the load.

        Args:
            source: one of 'glove', 'word2vec', 'fasttext'.
            file_path: path of the model file to read.

        Returns:
            self, so that calls can be chained.

        Raises:
            ValueError: if ``source`` is not a supported name (raised after
                the start message has been printed).
        """
        print(datetime.datetime.now(), 'start: loading', source)
        if source == 'glove':
            # Text word2vec format.
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path)
        elif source == 'word2vec':
            # Binary word2vec format.
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)
        elif source == 'fasttext':
            self.model[source] = gensim.models.wrappers.FastText.load_fasttext_format(file_path)
        else:
            raise ValueError(self._SOURCE_ERROR)

        print(datetime.datetime.now(), 'end: loading', source)

        return self

    def get_model(self, source):
        """Return the loaded model for ``source``.

        Raises:
            ValueError: if ``source`` is not a supported name.
            KeyError: if the model for ``source`` has not been loaded.
        """
        self._check_source(source)
        return self.model[source]

    def get_words(self, source, size=None):
        """Return the vocabulary words of a loaded model as a list.

        Args:
            source: one of 'glove', 'word2vec', 'fasttext'.
            size: maximum number of words to return, taken in the
                vocabulary's iteration order; ``None`` returns every word.

        Raises:
            ValueError: if ``source`` is not a supported name.
            KeyError: if the model for ``source`` has not been loaded.
        """
        self._check_source(source)

        vocab = self._get_keyed_vectors(source).vocab
        if size is None:
            return list(vocab)

        # Stop as soon as `size` words are collected so a large vocabulary
        # is never copied in full.
        results = []
        for i, word in enumerate(vocab):
            if i >= size:
                break
            results.append(word)
        return results

    def get_dimension(self, source):
        """Return the length of one embedding vector of a loaded model.

        Raises:
            ValueError: if ``source`` is not a supported name.
            KeyError: if the model for ``source`` has not been loaded.
        """
        self._check_source(source)

        if source == 'fasttext':
            # Measure the vector of the first vocabulary word.
            word = self.get_words(source=source, size=1)[0]
            return self.get_model(source=source).wv[word].shape[0]

        return self.get_model(source=source).vectors[0].shape[0]

    def get_vectors(self, source, words=None):
        """Return the vectors of ``words`` stacked into one float32 array.

        Args:
            source: one of 'glove', 'word2vec', 'fasttext'.
            words: words to look up; ``None`` means the whole vocabulary.

        Returns:
            A ``numpy`` array of shape ``(len(words), dimension)`` whose
            row ``i`` is the vector of ``words[i]``.

        Raises:
            ValueError: if ``source`` is not a supported name.
            KeyError: if the model is not loaded or a word is unknown.
        """
        self._check_source(source)

        if words is None:
            words = self.get_words(source=source)

        embedding = np.empty((len(words), self.get_dimension(source=source)), dtype=np.float32)
        for i, word in enumerate(words):
            embedding[i] = self.get_vector(source=source, word=word)

        return embedding

    def get_vector(self, source, word, oov=None):
        """Return the vector of a single ``word``.

        Args:
            source: one of 'glove', 'word2vec', 'fasttext'.
            word: the word to look up.
            oov: reserved for out-of-vocabulary handling; currently unused.

        Raises:
            ValueError: if ``source`` is not a supported name or its model
                has not been loaded.
            KeyError: propagated from the model lookup when it rejects
                ``word``.
        """
        self._check_source(source)

        if source not in self.model:
            raise ValueError('Did not load %s model yet' % source)

        # TODO: use `oov` to handle out-of-vocabulary words instead of
        # letting the KeyError propagate.
        return self.model[source][word]

    def build_visual_metadata(self, embedding, words, file_dir,
                              metadata_name='metadata.csv', project_model_name='model.ckpt'):
        """Write the files for viewing ``embedding`` in the TensorBoard projector.

        Writes into ``file_dir``: a metadata file with one word per line, the
        projector configuration, a graph summary and a checkpoint that holds
        the embedding in a variable named ``embedding``.

        Args:
            embedding: array of vectors; its ``shape`` sizes the placeholder.
            words: labels written to the metadata file, one per line.
                NOTE(review): presumably ``words[i]`` labels ``embedding[i]``;
                nothing here checks that the lengths match.
            file_dir: output directory, created if it does not exist.
            metadata_name: file name of the metadata file inside ``file_dir``.
            project_model_name: file name of the checkpoint inside ``file_dir``.
        """
        # Create output directory if not exist
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)

        # Build graph
        tf.reset_default_graph()
        sess = tf.InteractiveSession()
        summary_writer = None
        try:
            # The variable starts as a one-element tensor; validate_shape=False
            # lets the assign give it the shape of `embedding`.
            embedding_graph = tf.Variable([0.0], name='embedding')
            place = tf.placeholder(tf.float32, shape=embedding.shape)

            set_embedding_graph = tf.assign(embedding_graph, place, validate_shape=False)
            sess.run(tf.global_variables_initializer())
            sess.run(set_embedding_graph, feed_dict={place: embedding})

            # Build metadata
            with open(os.path.join(file_dir, metadata_name), 'w') as f:
                f.writelines(word + '\n' for word in words)

            # Build projector
            summary_writer = tf.summary.FileWriter(file_dir, sess.graph)
            config = projector.ProjectorConfig()
            embedding_conf = config.embeddings.add()
            # 'embedding:0' is the tensor of the variable created above.
            embedding_conf.tensor_name = 'embedding:0'
            embedding_conf.metadata_path = metadata_name
            projector.visualize_embeddings(summary_writer, config)

            # Save model
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(file_dir, project_model_name))
        finally:
            # Release the writer and the session even when a step above fails.
            if summary_writer is not None:
                summary_writer.close()
            sess.close()

        
# Locations of the pre-trained embedding files, relative to the working directory.
# The GloVe download (.txt) is converted below and written to the .vec path.
downloaded_glove_file_path = './word2vec/glove.840B.300d.txt'
glove_file_path = './word2vec/glove.840B.300d.vec'

word2vec_file_path = './word2vec/GoogleNews-vectors-negative300.bin'
fasttext_file_path = './word2vec/crawl-300d-2M.vec'


# Prefix the relative paths with the working directory so convert() receives absolute paths.
cwd = os.getcwd()+'/'
word_embedding = WordEmbedding()
# Convert the GloVe file; the converted file is not loaded anywhere in this cell.
word_embedding.convert(source='glove', input_file_path=cwd+downloaded_glove_file_path, output_file_path=cwd+glove_file_path)

# Only the word2vec model is loaded here.
word_embedding.load(source='word2vec', file_path=word2vec_file_path)

# Print the vector of one vocabulary entry as a quick check that the load worked.
print(word_embedding.get_vector(source='word2vec', word='/en/nothing'))

[ 2.46582031e-02 -5.21850586e-03  3.82995605e-03 -5.00488281e-03
  4.82177734e-03  2.58789062e-02 -2.70996094e-02 -4.69970703e-03
 -1.84326172e-02 -3.46679688e-02  4.10156250e-02 -5.92041016e-03
  6.78710938e-02 -5.18798828e-03 -1.06201172e-02 -1.84631348e-03
  2.09960938e-02  1.28173828e-03 -6.40869141e-03 -1.11083984e-02
  1.00708008e-02  3.41796875e-02 -1.19628906e-02  3.83300781e-02
  1.30615234e-02  4.60815430e-03  2.25830078e-02 -1.47094727e-02
  5.09643555e-03  6.07910156e-02 -3.39355469e-02 -3.44238281e-02
  1.81884766e-02  7.47680664e-04 -1.40991211e-02 -2.01416016e-02
  2.91748047e-02  6.68945312e-02 -4.24804688e-02  2.70996094e-02
  1.19628906e-02 -2.09960938e-02 -5.31005859e-03 -7.95898438e-02
 -2.80761719e-02 -1.42211914e-02  1.89208984e-02  3.70788574e-03
  1.99890137e-03  8.54492188e-04  3.99780273e-03  1.94091797e-02
 -1.30004883e-02  8.50677490e-04 -2.89306641e-02  3.54003906e-02
 -3.73535156e-02  1.68457031e-02 -9.21630859e-03 -2.85644531e-02
 -5.88378906e-02  2.96630

In [19]:
# Export the first 100000 word2vec words and their vectors for the TensorBoard projector.
source = 'word2vec'
# NOTE(review): this builds the matrix for the whole vocabulary before slicing;
# get_words(size=...) plus get_vectors(words=...) would build only the subset,
# if nothing else needs the full `embedding` / `words`.
embedding = word_embedding.get_vectors(source=source)
words = word_embedding.get_words(source=source)
sub_embedding = embedding[:100000]
sub_words = words[:100000]
word_embedding.build_visual_metadata(embedding=sub_embedding, words=sub_words, file_dir='./word_embedding')

* Read in the CSV file

In [4]:
import pandas as pd
# Load the NYT sample into a DataFrame.
# header=None: the file has no header row, so columns are numbered 0, 1, 2, ...
# index_col=0: the first column becomes the row index.
# quoting=1 is csv.QUOTE_ALL, with '"' as the quote character.
nyt_data = pd.read_csv('../nyt_dataset/nyt_structured_data.csv.1000', 
                       delimiter=',', index_col=0, header=None, quotechar='"', quoting=1, 
                       skipinitialspace=True, engine='c')

In [48]:
# Show column 2 for the first ten rows.
print(nyt_data[2].head(10))

0
original/2007/06/15/1854648.xml     When federal prosecutors found in 1995 that Sh...
original/2007/06/15/1854649.xml     There is the yelp, an electronic yodel that gr...
original/2007/06/15/1854650.xml     Cloaking one's identity while writing -- to hi...
original/2007/06/15/1854651.xml     Mike Keiser, who made a fortune selling greeti...
original/2007/06/15/1854652.xml     The Havens column on June 8 about Milford, Pa....
original/2007/06/15/1854653.xml     An article last Friday about second-home devel...
original/2007/06/15/1854654.xml     Delinquencies and foreclosures among homeowner...
original/2007/06/15/1854655.xml     Globalization is coming to the securities mark...
original/2007/06/15/1854656.xml     Talk about trying to rain on their parade.As t...
original/2007/06/15/1854657.xml     In marathon speeches peppered with quotes from...
Name: 2, dtype: object


In [6]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import seaborn as sns

# Import the Universal Sentence Encoder's TF Hub module
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
# Instantiate the module from module_url; `embed` is called on lists of strings further down.
embed = hub.Module(module_url)

  from ._conv import register_converters as _register_converters


INFO:tensorflow:Using /var/folders/qz/vw4lgmdn10595s9kr2jr1w280000gn/T/tfhub_modules to cache modules.


In [None]:
# Compute a representation for each record's title, lead paragraph and full text.

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Build the encoder op once and feed every record through a placeholder.
# Calling embed(...) inside the loop would add another copy of the ops to the
# default graph on every iteration.
RECORD_input = tf.placeholder(tf.string, shape=[None])
RECORD_encoder = embed(RECORD_input)

with tf.Session() as session:
    # Initialise variables and lookup tables once for the whole run.
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])

    # Visit every row, including the last one.
    for n in range(len(nyt_data)):
        # .iloc selects by position; the row index of nyt_data comes from
        # column 0 of the CSV (index_col=0), so it is not a 0..N-1 range.
        TITLE = nyt_data[1].iloc[n]
        LEAD_PARAGRAPH = nyt_data[2].iloc[n]
        FULL_TEXT = nyt_data[3].iloc[n]
        RECORD = [TITLE, LEAD_PARAGRAPH, FULL_TEXT]

        RECORD_embeddings = session.run(RECORD_encoder, feed_dict={RECORD_input: RECORD})

        for i, RECORD_embedding in enumerate(np.array(RECORD_embeddings).tolist()):
            print("RECORD: {}".format(RECORD[i]))
            print("Embedding size: {}".format(len(RECORD_embedding)))
            # Show only the first three components of each embedding.
            RECORD_embedding_snippet = ", ".join(
                (str(x) for x in RECORD_embedding[:3]))
            print("Embedding: [{}, ...]\n".format(RECORD_embedding_snippet))

In [5]:
# Row count of nyt_data, minus one.
print(nyt_data.shape[0] - 1)

999


In [14]:
# Print one entry of column 3 (the column used as FULL_TEXT in the embedding loop).
# NOTE(review): the second [3] is an integer key on a Series whose index comes from
# column 0 of the CSV; presumably it is meant positionally (4th row) - confirm.
print(nyt_data[3][3])

Mike Keiser, who made a fortune selling greeting cards on recycled paper, turned this remote spot on the southern Oregon coast into a golfing mecca that attracts wealthy people in private jets from around the world.To many in this hard-luck town of 3,000, Mr. Keiser is an economic hero. Work became scarce after the timber and fishing industries collapsed a quarter-century ago, and his Bandon Dunes Golf Resort, a few miles north of town, has created 325 full-time jobs, plus hundreds more part-time jobs. Mr. Keiser earns millions of dollars in profits each year.But beneath this model of enterprise, largely hidden subsidies from airline passengers, state-lottery players, taxpayers and company shareholders support the benefits that the owner, workers and visitors at Bandon Dunes enjoy.Airline passengers and lottery players are paying for a $31 million airport expansion to serve the 5,000 business jets that arrive each year, filled almost entirely with golfers. Many of them are executives o