In [1]:
import gensim
import tensorflow as tf

# Record the library versions this notebook was executed with
print(
    'gensim version: \t%s' % gensim.__version__,
    'TensorFlow version: \t%s' % tf.__version__,
    sep='\n',
)

gensim version: 	3.8.1
TensorFlow version: 	2.0.0


## Config

In [2]:
import logging

# Location of the raw .txt files (each file is read as one document)
TEXT_DIR = 'data/yelp/train'

# Output directory for the saved model, the checkpoint and the projector metadata
MODEL_DIR = 'emb_yelp/'

# Dimensionality of the word2vec vectors
EMBEDDING_SIZE = 300

# Show log records of level INFO and above (gensim reports its progress there)
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s : %(message)s',
)

## Preprocessing

In [3]:
import os, re, string


def clean_doc(doc):
    """
    Normalize a raw document into a string of space-separated tokens.

    The text is lowercased, every run of ASCII digits is deleted, the result
    is split on whitespace, ASCII punctuation is stripped from each token and
    tokens shorter than two characters are dropped.

    Args:
        doc (str): Raw document text.

    Returns:
        str: The surviving tokens joined by single spaces ('' if none survive).
    """
    # Build the punctuation-deleting table once per call, not once per token
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Lowercase, then remove numbers
    doc = re.sub(r"[0-9]+", "", doc.lower())
    # Split into tokens and remove punctuation from each of them
    tokens = (token.translate(punctuation_table) for token in doc.split())
    # Tokens with fewer than two characters are ignored
    return ' '.join(token for token in tokens if len(token) > 1)


def read_files(path):
    """
    Read every regular file in a directory and tokenize its cleaned text.

    Each file is decoded as UTF-8, passed through ``clean_doc`` and then
    tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        path (str): Directory holding the raw text files.

    Returns:
        list: One list of tokens per file, ordered by file name. The list is
        empty when ``path`` is not a directory or holds no regular files.
    """
    documents = []
    if not os.path.isdir(path):
        return documents

    # os.listdir yields entries in arbitrary order; sorting keeps the order of
    # the corpus stable from one run to the next.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints): open() would raise on them
        if not os.path.isfile(filepath):
            continue
        with open(filepath, encoding='utf-8') as f:
            text = f.read()
        documents.append(gensim.utils.simple_preprocess(clean_doc(text)))
    return documents

docs = read_files(TEXT_DIR)

# read_files returns an empty list when TEXT_DIR is missing or empty; stop here
# with a clear message instead of continuing with an empty corpus.
if not docs:
    raise FileNotFoundError('No documents found in %r' % TEXT_DIR)

print('Number of documents: %i' % len(docs))

Number of documents: 200000


## Training model

In [4]:
# Train word2vec on the tokenized documents.
# min_count=0 keeps every token in the vocabulary, no matter how rare it is.
model = gensim.models.Word2Vec(
    docs,
    size=EMBEDDING_SIZE,
    min_count=0,
)

INFO : collecting all words and their counts


## Saving model

In [5]:
# exist_ok=True creates the directory only when it is missing, without a
# separate (and racy) existence check; re-running the cell is harmless.
os.makedirs(MODEL_DIR, exist_ok=True)
model.save(os.path.join(MODEL_DIR, 'word2vec'))

INFO : saving Word2Vec object under emb_yelp/word2vec, separately None
INFO : not storing attribute vectors_norm
INFO : not storing attribute cum_table
INFO : saved emb_yelp/word2vec


## Creating checkpoint and metadata

In [6]:
from tensorboard.plugins import projector

# Embedding matrix (one row per word) and the vocabulary in the same row order
weights     = model.wv.vectors
index_words = model.wv.index2word

vocab_size    = weights.shape[0]
embedding_dim = weights.shape[1]

print('Shape of weights:', weights.shape)
print('Vocabulary size: %i' % vocab_size)
print('Embedding size: %i'  % embedding_dim)

# Projector metadata: one label per line, in the same order as the rows of
# `weights`. The file is written as UTF-8 explicitly (the corpus is read as
# UTF-8) so that non-ASCII tokens do not depend on the platform's default
# encoding.
with open(os.path.join(MODEL_DIR, 'metadata.tsv'), 'w', encoding='utf-8') as f:
    f.write("\n".join(index_words))

# Projector configuration: links the tensor name to its metadata file.
# The metadata path is given relative to MODEL_DIR.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'embeddings'
embedding.metadata_path = './metadata.tsv'
projector.visualize_embeddings(MODEL_DIR, config)

# Variable holding the trained vectors; its name is the same string as
# `embedding.tensor_name` above.
tensor_embeddings = tf.Variable(weights, name='embeddings')

# NOTE(review): sess=None presumably relies on eager execution (TF 2.x) — confirm
# before running this cell in graph mode.
checkpoint = tf.compat.v1.train.Saver([tensor_embeddings])
checkpoint_path = checkpoint.save(sess=None, global_step=None, save_path=os.path.join(MODEL_DIR, "model.ckpt"))

Shape of weights: (42113, 300)
Vocabulary size: 42113
Embedding size: 300




## Example

In [7]:
# Sanity check: the ten vocabulary words most similar to 'coffee'
model.wv.most_similar(
    positive=['coffee'],
    topn=10,
)

INFO : precomputing L2-norms of word weight vectors


[('espresso', 0.6709840893745422),
 ('latte', 0.6611574292182922),
 ('cappuccino', 0.6460868716239929),
 ('tea', 0.643097996711731),
 ('lattes', 0.613446056842804),
 ('coffees', 0.612466037273407),
 ('teas', 0.5807890295982361),
 ('chai', 0.567467451095581),
 ('mocha', 0.565311074256897),
 ('gelato', 0.5606527328491211)]