* Word-level embedding with three different pre-trained models (GloVe, word2vec, fastText):

In [18]:
import datetime
import numpy as np
import os

import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

print('gensim Version: %s' % (gensim.__version__))

class WordEmbedding:
    """Load pre-trained word-embedding models and query them by source name.

    Three sources are supported: ``'glove'``, ``'word2vec'`` and
    ``'fasttext'``.  Each loaded model is stored in ``self.model`` under its
    source name.
    """

    __author__ = "Edward Ma"
    __copyright__ = "Copyright 2018, Edward Ma"
    __credits__ = ["Edward Ma"]
    __license__ = "Apache"
    __version__ = "2.0"
    __maintainer__ = "Edward Ma"
    __email__ = "makcedward@gmail.com"
    __updatedby__ = "Alan Tan, 2018"
    # load() reuses an already converted GloVe file when it exists and only
    # converts the raw GloVe text file when it does not.

    # Source names accepted by the query methods.
    _SOURCES = ('glove', 'word2vec', 'fasttext')

    def __init__(self, downloaded_glove_file_path="", verbose=0):
        """Create an empty container.

        Args:
            downloaded_glove_file_path: path of the raw GloVe text file; it is
                only read by load() when the converted file is missing.
            verbose: stored on the instance; no method of this class reads it.
        """
        self.verbose = verbose
        self.downloaded_glove_file_path = downloaded_glove_file_path
        self.model = {}

    def _check_source(self, source):
        """Raise ValueError unless *source* is one of the supported names."""
        if source not in self._SOURCES:
            raise ValueError('Possible value of source are glove, word2vec, fasttext')

    def _vocab(self, source):
        """Return the vocabulary mapping of the loaded *source* model."""
        model = self.get_model(source=source)
        # The fastText model exposes its vocabulary through the .wv attribute.
        return model.wv.vocab if source == 'fasttext' else model.vocab

    def convert(self, source, input_file_path, output_file_path):
        """Convert a raw GloVe file to word2vec text format.

        Nothing is done for 'word2vec' and 'fasttext'.

        NOTE(review): the paths go through gensim's datapath()/get_tmpfile()
        helpers, which per the gensim docs resolve relative names against
        gensim's test-data and temp directories -- pass absolute paths.

        Raises:
            ValueError: if *source* is not a supported name.
        """
        if source == 'glove':
            input_file = datapath(input_file_path)
            output_file = get_tmpfile(output_file_path)
            glove2word2vec(input_file, output_file)
        elif source == 'word2vec':
            pass
        elif source == 'fasttext':
            pass
        else:
            raise ValueError('Possible value of source are glove, word2vec, fasttext')

    def load(self, source, file_path):
        """Load the model for *source* from *file_path* and return ``self``.

        For 'glove', *file_path* is the converted (word2vec-format) file.  If
        it does not exist yet, it is created once from
        ``self.downloaded_glove_file_path`` before loading.

        Raises:
            ValueError: if *source* is not a supported name.
            FileNotFoundError: for 'glove', if neither the converted file nor
                the raw GloVe file exists.
        """
        print(datetime.datetime.now(), 'start: loading', source)
        if source == 'glove':
            if not os.path.isfile(file_path):
                raw_path = self.downloaded_glove_file_path
                if not raw_path or not os.path.isfile(raw_path):
                    raise FileNotFoundError(
                        'Neither converted GloVe file %r nor raw GloVe file %r exists'
                        % (file_path, raw_path))
                # Use instance state (not notebook globals) and convert only
                # once; absolute paths keep convert() from relocating them.
                self.convert(source='glove',
                             input_file_path=os.path.abspath(raw_path),
                             output_file_path=os.path.abspath(file_path))
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path)
        elif source == 'word2vec':
            self.model[source] = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)
        elif source == 'fasttext':
            self.model[source] = gensim.models.wrappers.FastText.load_fasttext_format(file_path)
        else:
            raise ValueError('Possible value of source are glove, word2vec, fasttext')

        print(datetime.datetime.now(), 'end: loading', source)

        return self

    def get_model(self, source):
        """Return the loaded model for *source*.

        Raises:
            ValueError: if *source* is not a supported name.
            KeyError: if the model for *source* has not been loaded.
        """
        self._check_source(source)
        return self.model[source]

    def get_words(self, source, size=None):
        """Return vocabulary words of *source*, at most *size* when given.

        Words come back in the iteration order of the model's vocabulary.
        """
        self._check_source(source)
        vocab = self._vocab(source)
        if size is None:
            return list(vocab)

        # Stop early instead of materialising the whole vocabulary.
        words = []
        for i, word in enumerate(vocab):
            if i >= size:
                break
            words.append(word)
        return words

    def get_dimension(self, source):
        """Return the length of one embedding vector of *source*."""
        self._check_source(source)
        if source == 'fasttext':
            # Measure the vector of the first vocabulary word.
            word = self.get_words(source=source, size=1)[0]
            return self.get_model(source=source).wv[word].shape[0]
        return self.get_model(source=source).vectors[0].shape[0]

    def get_vectors(self, source, words=None):
        """Return a float32 array with one row per word.

        Args:
            source: supported source name.
            words: words to look up; the whole vocabulary when None.
        """
        self._check_source(source)
        if words is None:
            words = self.get_words(source=source)

        embedding = np.empty((len(words), self.get_dimension(source=source)), dtype=np.float32)
        for i, word in enumerate(words):
            embedding[i] = self.get_vector(source=source, word=word)
        return embedding

    def get_vector(self, source, word, oov=None):
        """Return the embedding of *word* from the *source* model.

        Args:
            oov: accepted but currently ignored.
                TODO: define out-of-vocabulary handling.

        Raises:
            ValueError: if *source* is unsupported or its model is not loaded.
            KeyError: propagated from the model lookup of *word*.
        """
        self._check_source(source)
        if source not in self.model:
            raise ValueError('Did not load %s model yet' % source)
        return self.model[source][word]

    def build_visual_metadata(self, embedding, words, file_dir,
                              metadata_name='metadata.csv', project_model_name='model.ckpt'):
        """Write TensorBoard projector files for *embedding* into *file_dir*.

        Args:
            embedding: array fed into the 'embedding' variable.
            words: labels written one per line to the metadata file.
            file_dir: output directory, created when missing.
            metadata_name: file name of the label file inside *file_dir*.
            project_model_name: checkpoint file name inside *file_dir*.
        """
        os.makedirs(file_dir, exist_ok=True)

        # Build metadata; explicit UTF-8 so non-ASCII words do not depend on
        # the platform's default encoding.
        with open(os.path.join(file_dir, metadata_name), 'w', encoding='utf-8') as f:
            for word in words:
                f.write(word + '\n')

        # Build graph
        tf.reset_default_graph()
        sess = tf.InteractiveSession()
        try:
            embedding_graph = tf.Variable([0.0], name='embedding')
            place = tf.placeholder(tf.float32, shape=embedding.shape)

            # validate_shape=False lets the one-element variable take the
            # shape of the fed embedding.
            set_embedding_graph = tf.assign(embedding_graph, place, validate_shape=False)
            sess.run(tf.global_variables_initializer())
            sess.run(set_embedding_graph, feed_dict={place: embedding})

            # Build projector
            summary_writer = tf.summary.FileWriter(file_dir, sess.graph)
            try:
                config = projector.ProjectorConfig()
                embedding_conf = config.embeddings.add()
                embedding_conf.tensor_name = 'embedding:0'
                embedding_conf.metadata_path = metadata_name
                projector.visualize_embeddings(summary_writer, config)
            finally:
                summary_writer.close()

            # Save model
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(file_dir, project_model_name))
        finally:
            # Release the session even when a step above fails.
            sess.close()


        
# Set up directories and model file locations.

# Current working directory with a trailing slash, so relative paths can be
# appended by plain string concatenation.
cwd = os.getcwd()+'/'

# The pre-trained GloVe model ships as .txt and needs to be converted to the
# word2vec text format (.vec) before it can be loaded.
downloaded_glove_file_path = './word2vec/glove.840B.300d.txt'
glove_file_path = './word2vec/glove.840B.300d.vec'

# Locations of the pre-trained word2vec and fastText files.
word2vec_file_path = './word2vec/GoogleNews-vectors-negative300.bin'
fasttext_file_path = './word2vec/crawl-300d-2M.vec'


# Set up the embedding container; the raw GloVe path is handed over so the
# .txt -> .vec conversion can be done when GloVe is loaded.
word_embedding = WordEmbedding(downloaded_glove_file_path=downloaded_glove_file_path)
# Manual one-off conversion, kept for reference:
#word_embedding.convert(source='glove', input_file_path=cwd+downloaded_glove_file_path, output_file_path=cwd+glove_file_path)

# Only the word2vec model is loaded in this cell.
word_embedding.load(source='word2vec', file_path=word2vec_file_path)


gensim Version: 3.4.0
2018-11-14 21:45:44.302384 start: loading word2vec
2018-11-14 21:46:39.757653 end: loading word2vec


<__main__.WordEmbedding at 0x1f05d79198>

In [21]:
# Look up the word2vec embedding of a single word and display it.
cow_vector = word_embedding.get_vector(source='word2vec', word='cow')
print(cow_vector)

[ 0.18945312 -0.07519531 -0.15625     0.19921875 -0.18457031  0.20703125
 -0.04125977 -0.01428223  0.00363159 -0.09570312  0.10693359 -0.5859375
 -0.08300781 -0.08007812 -0.32421875 -0.03662109 -0.20898438  0.24511719
 -0.25585938 -0.08837891  0.12255859 -0.10742188 -0.00454712  0.06176758
  0.00466919  0.04174805 -0.21582031 -0.04443359  0.28125    -0.2109375
 -0.02441406 -0.01190186  0.08154297 -0.03955078 -0.32226562  0.16992188
 -0.078125    0.00653076  0.28320312  0.33398438 -0.06591797 -0.07910156
 -0.02441406  0.09179688  0.09082031 -0.1640625  -0.04223633  0.26953125
  0.12792969  0.25       -0.23535156 -0.33203125  0.27929688  0.10107422
 -0.05786133 -0.09814453 -0.06884766 -0.2734375   0.57421875  0.04736328
  0.125      -0.07177734 -0.10107422 -0.02355957  0.11132812 -0.39648438
  0.02026367  0.0625     -0.20996094 -0.09521484  0.12255859 -0.11035156
 -0.24902344  0.03222656 -0.09423828  0.11132812  0.01184082  0.01672363
 -0.15722656 -0.02368164  0.25976562 -0.18652344  0.1

In [19]:
# Export the first MAX_PROJECTED_WORDS vocabulary entries for the TensorBoard
# projector.
source = 'word2vec'
MAX_PROJECTED_WORDS = 100000

# Fetch only the words and vectors that are exported; building the matrix for
# the whole vocabulary just to slice off its head wastes time and memory.
sub_words = word_embedding.get_words(source=source, size=MAX_PROJECTED_WORDS)
sub_embedding = word_embedding.get_vectors(source=source, words=sub_words)
word_embedding.build_visual_metadata(embedding=sub_embedding, words=sub_words, file_dir='./word_embedding')

* Read in the CSV file

In [4]:
import csv

import pandas as pd

# Load the NYT sample: no header row, first column used as the index, fields
# wrapped in double quotes.  csv.QUOTE_ALL is the named form of the value 1.
nyt_data = pd.read_csv('nyt_dataset/nyt_structured_data.csv.1000',
                       delimiter=',', index_col=0, header=None, quotechar='"',
                       quoting=csv.QUOTE_ALL, skipinitialspace=True, engine='c')

In [48]:
# Show column 2 for the first ten rows.
print(nyt_data[2].head(10))

0
original/2007/06/15/1854648.xml     When federal prosecutors found in 1995 that Sh...
original/2007/06/15/1854649.xml     There is the yelp, an electronic yodel that gr...
original/2007/06/15/1854650.xml     Cloaking one's identity while writing -- to hi...
original/2007/06/15/1854651.xml     Mike Keiser, who made a fortune selling greeti...
original/2007/06/15/1854652.xml     The Havens column on June 8 about Milford, Pa....
original/2007/06/15/1854653.xml     An article last Friday about second-home devel...
original/2007/06/15/1854654.xml     Delinquencies and foreclosures among homeowner...
original/2007/06/15/1854655.xml     Globalization is coming to the securities mark...
original/2007/06/15/1854656.xml     Talk about trying to rain on their parade.As t...
original/2007/06/15/1854657.xml     In marathon speeches peppered with quotes from...
Name: 2, dtype: object


In [22]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import seaborn as sns

# Import the Universal Sentence Encoder's TF Hub module.
# The trailing "#@param" comment lists the selectable module URLs and must
# stay on the same line as the assignment.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
# Callable module object; applied to a list of strings further below.
embed = hub.Module(module_url)

In [28]:
# Compute a sentence embedding for the title and the full text of each record,
# showing that inputs of very different lengths are supported.

import datetime

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Number of leading records to embed; use len(nyt_data) for the whole dataset.
NUM_RECORDS = 5

# Build the embedding op once and feed the texts through a placeholder.
# Calling embed(...) inside the loop would add another copy of the module's
# ops to the default graph on every iteration.
text_input = tf.placeholder(dtype=tf.string, shape=[None])
embedded_text = embed(text_input)

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])

    for n in range(NUM_RECORDS):
        # .iloc selects by position explicitly; the frame is read with
        # index_col=0, so its index labels are not these integers.
        TITLE = nyt_data[1].iloc[n]
        LEAD_PARAGRAPH = nyt_data[2].iloc[n]  # read but currently not embedded
        FULL_TEXT = nyt_data[3].iloc[n]
        RECORD = [TITLE, FULL_TEXT]

        print(str(datetime.datetime.now()))
        RECORD_embeddings = session.run(embedded_text, feed_dict={text_input: RECORD})

        for i, RECORD_embedding in enumerate(np.array(RECORD_embeddings).tolist()):
            print("TEXT: {}".format(RECORD[i]))
            print("Embedding size: {}".format(len(RECORD_embedding)))
            # Only the first three components are printed.
            RECORD_embedding_snippet = ", ".join(
                (str(x) for x in RECORD_embedding[:3]))
            print("Embedding: [{}, ...]\n".format(RECORD_embedding_snippet))

        print(str(datetime.datetime.now()))



2018-11-15 14:26:45.028106
TEXT: Often Accused, Never Charged, Newark's Ex-Mayor Faces U.S. 
Embedding size: 512
Embedding: [0.026562940329313278, 0.0049184830859303474, -0.045423150062561035, ...]

TEXT: When federal prosecutors found in 1995 that Sharpe James, then the mayor of Newark, could not account for hundreds of thousands of dollars in donations from political patrons, his only penalty was a $44,000 fine from the state election commission.Two years later, Mr. James's chief of staff was imprisoned for accepting bribes from city contractors. The mayor praised him as ''a loyal public servant'' and escaped from that investigation unscathed, soon winning a second public office as state senator.And throughout his five terms at the helm of New Jersey's largest city, Mr. James lived a lifestyle so lavish -- complete with a beach house, 54-foot yacht and Rolls-Royce -- that he was frequently the focus of accusations and insinuations, but he never faced a criminal charge.Now Mr. James, 