In [2]:
# !pip install gensim
import gensim
import pprint
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Very frequent words that are dropped before counting.
stoplist = set('for a of the and to in'.split(' '))

# Tokenise: lowercase each document, split on whitespace, discard stopwords.
texts = []
for document in text_corpus:
    lowered_tokens = document.lower().split()
    texts.append([word for word in lowered_tokens if word not in stoplist])

# Tally how often each remaining token occurs across the whole corpus.
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only the tokens seen more than once, preserving document order.
processed_corpus = []
for text in texts:
    processed_corpus.append([token for token in text if frequency[token] > 1])
pprint.pprint(processed_corpus)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [3]:
from gensim import corpora

# Map every distinct token of the filtered corpus to an integer id.
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [4]:
# Display the token -> integer id mapping stored on the dictionary.
pprint.pprint(dictionary.token2id)

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


In [5]:
# Convert an unseen document into (token_id, count) pairs with the existing
# dictionary. "interaction" has no entry in token2id, and the printed result
# only contains the ids of "computer" and "human".
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(0, 1), (1, 1)]


In [6]:
# Vectorise every processed document with the same dictionary.
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


In [7]:
from gensim import models

# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)

# Weight the query "system minors": indexing the fitted model with a
# bag-of-words vector yields (token_id, weight) pairs.
words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


In [8]:
from gensim import similarities

# Build a similarity index over the TF-IDF-weighted corpus.
# num_features is taken from the dictionary instead of the hard-coded 12, so
# the index stays consistent with the vocabulary if the corpus is changed.
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

In [9]:
# Score every indexed document against the query. "engineering" has no entry
# in the dictionary's token2id, so presumably only "system" contributes to the
# query vector (unknown tokens were dropped by doc2bow in the earlier example).
query_document = 'system engineering'.split()
query_bow = dictionary.doc2bow(query_document)
sims = index[tfidf[query_bow]]
print(list(enumerate(sims)))

[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [10]:
# Rank documents from most to least similar to the query, then print them.
ranked_sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
for document_number, score in ranked_sims:
    print(document_number, score)

3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0


In [11]:
# Corpora and Vector Spaces

import logging
# Emit INFO-level (and above) records as "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [14]:
# Persist the dictionary to a file relative to the current working directory.
dictionary.save('./deerwester.dict')  # store the dictionary, for future reference

2020-06-03 02:02:02,769 : INFO : saving Dictionary object under ./deerwester.dict, separately None
2020-06-03 02:02:02,771 : INFO : saved ./deerwester.dict


In [1]:
# word2vec demo
# Clear the interactive namespace without prompting, then re-import logging
# and configure the "<timestamp> : <level> : <message>" format again.
%reset -f
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus(object):
    """A re-iterable corpus that yields sentences (lists of str).

    Every call to ``__iter__`` re-opens the corpus file, so the same instance
    can be iterated more than once.
    """

    def __iter__(self):
        """Yield one pre-processed token list per line of the corpus file."""
        corpus_path = datapath('lee_background.cor')
        # The context manager closes the file handle once iteration finishes
        # (or the generator is closed early) instead of leaking it.
        with open(corpus_path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)

2020-06-03 03:07:51,295 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-06-03 03:07:51,296 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)


In [3]:
import gensim.models

# Train Word2Vec on the streamed corpus; no hyper-parameters are overridden here.
sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

2020-06-03 03:09:03,818 : INFO : collecting all words and their counts
2020-06-03 03:09:03,820 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-06-03 03:09:03,928 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2020-06-03 03:09:03,930 : INFO : Loading a fresh vocabulary
2020-06-03 03:09:03,939 : INFO : effective_min_count=5 retains 1750 unique words (25% of original 6981, drops 5231)
2020-06-03 03:09:03,940 : INFO : effective_min_count=5 leaves 49335 word corpus (84% of original 58152, drops 8817)
2020-06-03 03:09:03,948 : INFO : deleting the raw counts dictionary of 6981 items
2020-06-03 03:09:03,950 : INFO : sample=0.001 downsamples 51 most-common words
2020-06-03 03:09:03,950 : INFO : downsampling leaves estimated 35935 word corpus (72.8% of prior 49335)
2020-06-03 03:09:03,955 : INFO : estimated required memory for 1750 words and 100 dimensions: 2275000 bytes
2020-06-03 03:09:03,956 : INFO : resetting layer weight

In [5]:
# Look up the learned vector for the word 'king'.
model.wv['king']

array([ 0.00269477, -0.02434526,  0.02238227,  0.02677673, -0.04741764,
        0.01725792,  0.05955269,  0.06336463, -0.03325649, -0.00725   ,
        0.02932971,  0.05322807,  0.10096155,  0.03138018,  0.02115   ,
        0.00411769, -0.06382673, -0.02785617, -0.05525706, -0.0160682 ,
        0.08101115,  0.02231089, -0.00064019, -0.00869474,  0.05844448,
        0.04394283,  0.01022059, -0.0748246 ,  0.00326857, -0.0224652 ,
        0.01952092, -0.02028982, -0.04886932,  0.03620696, -0.0093549 ,
        0.0213713 , -0.03056756,  0.01152956,  0.00536874,  0.03019593,
       -0.05655464,  0.0285409 , -0.01244486,  0.06834869,  0.00950526,
       -0.04446067, -0.00166665,  0.00827059,  0.03045221,  0.03716309,
        0.01514133,  0.0171531 ,  0.00722389, -0.01139333,  0.01037639,
        0.02450302,  0.04309493,  0.01562723,  0.01096504,  0.06577818,
        0.03011777,  0.01612068, -0.0269418 ,  0.00527515, -0.02746065,
       -0.01505195,  0.03651873,  0.0569701 ,  0.01330653,  0.04

In [6]:
# Print the first 10 entries of the model's vocabulary.
for word in list(model.wv.vocab)[:10]:
    print(word)

hundreds
of
people
have
been
forced
to
their
homes
in


In [7]:
# Persist the trained model to a file relative to the working directory.
model.save('./w2v_demo_model')

2020-06-03 03:27:45,766 : INFO : saving Word2Vec object under ./w2v_demo_model, separately None
2020-06-03 03:27:45,767 : INFO : not storing attribute vectors_norm
2020-06-03 03:27:45,769 : INFO : not storing attribute cum_table
2020-06-03 03:27:45,794 : INFO : saved ./w2v_demo_model


In [9]:
# Reload the model saved above and look up the same word again; the displayed
# vector matches the one shown before saving.
model = gensim.models.Word2Vec.load('./w2v_demo_model')
model.wv['king']

2020-06-03 03:29:10,789 : INFO : loading Word2Vec object from ./w2v_demo_model
2020-06-03 03:29:10,806 : INFO : loading wv recursively from ./w2v_demo_model.wv.* with mmap=None
2020-06-03 03:29:10,807 : INFO : setting ignored attribute vectors_norm to None
2020-06-03 03:29:10,807 : INFO : loading vocabulary recursively from ./w2v_demo_model.vocabulary.* with mmap=None
2020-06-03 03:29:10,808 : INFO : loading trainables recursively from ./w2v_demo_model.trainables.* with mmap=None
2020-06-03 03:29:10,809 : INFO : setting ignored attribute cum_table to None
2020-06-03 03:29:10,809 : INFO : loaded ./w2v_demo_model


array([ 0.00269477, -0.02434526,  0.02238227,  0.02677673, -0.04741764,
        0.01725792,  0.05955269,  0.06336463, -0.03325649, -0.00725   ,
        0.02932971,  0.05322807,  0.10096155,  0.03138018,  0.02115   ,
        0.00411769, -0.06382673, -0.02785617, -0.05525706, -0.0160682 ,
        0.08101115,  0.02231089, -0.00064019, -0.00869474,  0.05844448,
        0.04394283,  0.01022059, -0.0748246 ,  0.00326857, -0.0224652 ,
        0.01952092, -0.02028982, -0.04886932,  0.03620696, -0.0093549 ,
        0.0213713 , -0.03056756,  0.01152956,  0.00536874,  0.03019593,
       -0.05655464,  0.0285409 , -0.01244486,  0.06834869,  0.00950526,
       -0.04446067, -0.00166665,  0.00827059,  0.03045221,  0.03716309,
        0.01514133,  0.0171531 ,  0.00722389, -0.01139333,  0.01037639,
        0.02450302,  0.04309493,  0.01562723,  0.01096504,  0.06577818,
        0.03011777,  0.01612068, -0.0269418 ,  0.00527515, -0.02746065,
       -0.01505195,  0.03651873,  0.0569701 ,  0.01330653,  0.04

In [10]:
# Evaluate the embeddings on the WordSim-353 word-pair file shipped with gensim's test data.
# Call the method on the keyed vectors (`model.wv`): the model-level alias is
# deprecated in gensim 3.x and removed in 4.0.
model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))

  if __name__ == '__main__':
2020-06-03 03:45:44,656 : INFO : Pearson correlation coefficient against /opt/conda/envs/tensorflow2_py3/lib/python3.6/site-packages/gensim/test/test_data/wordsim353.tsv: 0.1056
2020-06-03 03:45:44,657 : INFO : Spearman rank-order correlation coefficient against /opt/conda/envs/tensorflow2_py3/lib/python3.6/site-packages/gensim/test/test_data/wordsim353.tsv: 0.0841
2020-06-03 03:45:44,658 : INFO : Pairs with unknown words ratio: 83.0%


((0.10556483612984553, 0.4221209027340719),
 SpearmanrResult(correlation=0.08411955187144943, pvalue=0.5228134629850876),
 83.0028328611898)

In [11]:
more_sentences = [
    ['Advanced', 'users', 'can', 'load', 'a', 'model',
     'and', 'continue', 'training', 'it', 'with', 'more', 'sentences']
]
# Register the extra sentences with the existing vocabulary, then keep training.
model.build_vocab(more_sentences, update=True)
# Use `model.epochs`; `model.iter` is a deprecated alias for the same value in
# gensim 3.x and no longer exists in gensim 4.
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)

2020-06-03 03:50:48,038 : INFO : collecting all words and their counts
2020-06-03 03:50:48,039 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-06-03 03:50:48,040 : INFO : collected 13 word types from a corpus of 13 raw words and 1 sentences
2020-06-03 03:50:48,040 : INFO : Updating model with new vocabulary
2020-06-03 03:50:48,041 : INFO : New added 0 unique words (0% of original 13) and increased the count of 0 pre-existing words (0% of original 13)
2020-06-03 03:50:48,041 : INFO : deleting the raw counts dictionary of 13 items
2020-06-03 03:50:48,042 : INFO : sample=0.001 downsamples 0 most-common words
2020-06-03 03:50:48,042 : INFO : downsampling leaves estimated 0 word corpus (0.0% of prior 0)
2020-06-03 03:50:48,046 : INFO : estimated required memory for 1750 words and 100 dimensions: 2275000 bytes
2020-06-03 03:50:48,048 : INFO : updating layer weights
2020-06-03 03:50:48,051 : INFO : training model with 3 workers on 1750 vocabulary and 100 featur

(26, 65)

In [14]:
# Instantiate and train a Word2Vec model with loss tracking switched on.
model_with_loss = gensim.models.Word2Vec(
    sentences,
    min_count=1,
    compute_loss=True,  # requested so the loss can be queried after training
    workers=12,
    hs=0,
    sg=1,
    seed=42
)

# Read back the training loss recorded by the model and display it.
training_loss = model_with_loss.get_latest_training_loss()
print(training_loss)

2020-06-03 03:54:09,002 : INFO : collecting all words and their counts
2020-06-03 03:54:09,004 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-06-03 03:54:09,112 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2020-06-03 03:54:09,113 : INFO : Loading a fresh vocabulary
2020-06-03 03:54:09,184 : INFO : effective_min_count=1 retains 6981 unique words (100% of original 6981, drops 0)
2020-06-03 03:54:09,185 : INFO : effective_min_count=1 leaves 58152 word corpus (100% of original 58152, drops 0)
2020-06-03 03:54:09,213 : INFO : deleting the raw counts dictionary of 6981 items
2020-06-03 03:54:09,214 : INFO : sample=0.001 downsamples 43 most-common words
2020-06-03 03:54:09,215 : INFO : downsampling leaves estimated 45723 word corpus (78.6% of prior 58152)
2020-06-03 03:54:09,236 : INFO : estimated required memory for 6981 words and 100 dimensions: 9075300 bytes
2020-06-03 03:54:09,237 : INFO : resetting layer weights
20

676889.1875


In [15]:
import io
import os

import gensim.models.word2vec
import gensim.downloader as api
import smart_open


def head(path, size):
    """Return an in-memory text stream with the first `size` units read from `path`.

    The file is opened through ``smart_open.open`` and closed before returning;
    only the leading chunk returned by ``read(size)`` is kept.
    """
    with smart_open.open(path) as source:
        leading_text = source.read(size)
    return io.StringIO(leading_text)


def generate_input_data():
    """Yield `LineSentence` corpora of increasing size, each tagged with a `.name` label.

    The first corpus is the bundled Lee background corpus ('25kB'). The
    remaining ones are prefixes of the text8 dataset, labelled '1MB', '10MB',
    '50MB' and '100MB'. text8 is only loaded once the first corpus has been
    consumed.
    """
    lee_corpus = gensim.models.word2vec.LineSentence(datapath('lee_background.cor'))
    lee_corpus.name = '25kB'
    yield lee_corpus

    text8_path = api.load('text8').fn
    mebibyte = 1024 ** 2
    prefixes = (('1MB', 1), ('10MB', 10), ('50MB', 50), ('100MB', 100))
    for label, multiplier in prefixes:
        prefix_corpus = gensim.models.word2vec.LineSentence(
            head(text8_path, multiplier * mebibyte)
        )
        prefix_corpus.name = label
        yield prefix_corpus


# Materialise the generator into a list so the corpora can be sliced later.
input_data = list(generate_input_data())

2020-06-03 03:57:21,033 : ERROR : caught non-fatal exception while trying to update gensim-data cache from 'https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list.json'; using local cache at '/home/tione/gensim-data/information.json' instead
Traceback (most recent call last):
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/urllib/request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/http/client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/http/client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/http/client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/http/client.py", 



2020-06-03 04:07:26,653 : ERROR : caught non-fatal exception while trying to update gensim-data cache from 'https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list.json'; using local cache at '/home/tione/gensim-data/information.json' instead
Traceback (most recent call last):
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/urllib/request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/http/client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/http/client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/http/client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/opt/conda/envs/tensorflow2_py3/lib/python3.6/http/client.py", 




2020-06-03 04:07:26,656 : INFO : text8 downloaded


In [17]:
# Temporarily reduce logging verbosity.
# Go through setLevel() rather than assigning `.level` directly: it is the
# supported API, validates the level, and on newer Python versions also
# invalidates the loggers' cached "is enabled" decisions.
logging.root.setLevel(logging.ERROR)

import time
import numpy as np
import pandas as pd

train_time_values = []
seed_val = 42
sg_values = [0, 1]
hs_values = [0, 1]

# With `fast` enabled only the first three (smallest) corpora are benchmarked.
fast = True
if fast:
    input_data_subset = input_data[:3]
else:
    input_data_subset = input_data


# Time every (corpus, sg, hs, compute_loss) combination, three runs each.
for data in input_data_subset:
    for sg_val in sg_values:
        for hs_val in hs_values:
            for loss_flag in [True, False]:
                time_taken_list = []
                for _ in range(3):
                    # perf_counter() is monotonic and high-resolution, so the
                    # measured interval cannot be skewed by system clock changes.
                    start_time = time.perf_counter()
                    w2v_model = gensim.models.Word2Vec(
                        data,
                        compute_loss=loss_flag,
                        sg=sg_val,
                        hs=hs_val,
                        seed=seed_val,
                        workers=12,
                    )
                    time_taken_list.append(time.perf_counter() - start_time)

                time_taken_list = np.array(time_taken_list)
                time_mean = np.mean(time_taken_list)
                time_std = np.std(time_taken_list)

                model_result = {
                    'train_data': data.name,
                    'compute_loss': loss_flag,
                    'sg': sg_val,
                    'hs': hs_val,
                    'train_time_mean': time_mean,
                    'train_time_std': time_std,
                }
                print("Word2vec model #%i: %s" % (len(train_time_values), model_result))
                train_time_values.append(model_result)

# Collect the per-configuration timings into one table, sorted for comparison.
train_times_table = pd.DataFrame(train_time_values)
train_times_table = train_times_table.sort_values(
    by=['train_data', 'sg', 'hs', 'compute_loss'],
    ascending=[False, False, True, False],
)
print(train_times_table)

Word2vec model #0: {'train_data': '25kB', 'compute_loss': True, 'sg': 0, 'hs': 0, 'train_time_mean': 0.6586259206136068, 'train_time_std': 0.03622667899798521}
Word2vec model #1: {'train_data': '25kB', 'compute_loss': False, 'sg': 0, 'hs': 0, 'train_time_mean': 0.6543711026509603, 'train_time_std': 0.01124924227606954}
Word2vec model #2: {'train_data': '25kB', 'compute_loss': True, 'sg': 0, 'hs': 1, 'train_time_mean': 0.8261089324951172, 'train_time_std': 0.015550244165232914}
Word2vec model #3: {'train_data': '25kB', 'compute_loss': False, 'sg': 0, 'hs': 1, 'train_time_mean': 0.8217370510101318, 'train_time_std': 0.012713353096740439}
Word2vec model #4: {'train_data': '25kB', 'compute_loss': True, 'sg': 1, 'hs': 0, 'train_time_mean': 0.9065844217936198, 'train_time_std': 0.01439750488154397}
Word2vec model #5: {'train_data': '25kB', 'compute_loss': False, 'sg': 1, 'hs': 0, 'train_time_mean': 0.9083569049835205, 'train_time_std': 0.035316131028825665}
Word2vec model #6: {'train_data': 

In [18]:
# Re-enable INFO logging through the supported setter (setLevel) instead of
# assigning `logging.root.level` directly.
logging.root.setLevel(logging.INFO)

# Pre-compute the most-similar list of every word in the model's index so
# later lookups can be served from this dictionary.
most_similars_precalc = {word : model.wv.most_similar(word) for word in model.wv.index2word}
# Preview the first three cached entries.
for i, (key, value) in enumerate(most_similars_precalc.items()):
    if i == 3:
        break
    print(key, value)

2020-06-03 06:18:22,595 : INFO : precomputing L2-norms of word weight vectors


the [('australia', 0.9999100565910339), ('yesterday', 0.9998998641967773), ('an', 0.9998995065689087), ('on', 0.9998971223831177), ('his', 0.9998931884765625), ('up', 0.9998925924301147), ('world', 0.9998903274536133), ('their', 0.9998873472213745), ('also', 0.999887228012085), ('which', 0.9998868703842163)]
to [('up', 0.9999508857727051), ('would', 0.9999493360519409), ('will', 0.9999490976333618), ('says', 0.9999411106109619), ('for', 0.9999390840530396), ('with', 0.9999386072158813), ('has', 0.9999384880065918), ('company', 0.9999381899833679), ('about', 0.999937891960144), ('could', 0.9999357461929321)]
of [('in', 0.9999499320983887), ('at', 0.9999431371688843), ('by', 0.9999421834945679), ('after', 0.9999419450759888), ('australian', 0.9999417066574097), ('with', 0.9999417066574097), ('on', 0.9999412894248962), ('and', 0.9999366998672485), ('two', 0.9999355673789978), ('also', 0.9999325275421143)]


In [19]:
import time
# Query words used by the two timing cells that follow.
words = ['voted', 'few', 'their', 'around']

In [20]:
# Baseline: time direct most_similar() calls for each query word.
start = time.time()
for word in words:
    result = model.wv.most_similar(word)
    print(result)
end = time.time()
# Elapsed wall-clock seconds for the whole loop (including printing).
print(end - start)

[('national', 0.9988196492195129), ('victoria', 0.9987943172454834), ('old', 0.9987684488296509), ('hill', 0.9987670183181763), ('also', 0.9987571239471436), ('then', 0.9987561106681824), ('all', 0.9987560510635376), ('world', 0.998752236366272), ('israel', 0.9987504482269287), ('second', 0.9987483024597168)]
[('at', 0.999677300453186), ('around', 0.9996689558029175), ('sydney', 0.9996559619903564), ('north', 0.999655544757843), ('not', 0.999650239944458), ('before', 0.9996500611305237), ('an', 0.9996490478515625), ('in', 0.9996485114097595), ('australian', 0.999646008014679), ('which', 0.9996452927589417)]
[('with', 0.9999502897262573), ('and', 0.9999480843544006), ('his', 0.9999470710754395), ('which', 0.999945878982544), ('also', 0.9999457001686096), ('who', 0.9999422430992126), ('about', 0.9999409914016724), ('an', 0.9999402761459351), ('by', 0.9999399781227112), ('if', 0.9999396204948425)]
[('and', 0.9999417066574097), ('for', 0.9999378323554993), ('by', 0.9999364614486694), ('ove

In [21]:
# Time the same lookups served from the pre-computed cache.
start = time.time()
for word in words:
    # Test the *current* word, not the constant 'voted': the old check raised
    # KeyError for any uncached word as soon as 'voted' itself was cached, and
    # needlessly recomputed cached words otherwise.
    if word not in most_similars_precalc:
        most_similars_precalc[word] = model.wv.most_similar(word)
    result = most_similars_precalc[word]
    print(result)

end = time.time()
# Elapsed wall-clock seconds for the whole loop (including printing).
print(end - start)

[('national', 0.9988196492195129), ('victoria', 0.9987943172454834), ('old', 0.9987684488296509), ('hill', 0.9987670183181763), ('also', 0.9987571239471436), ('then', 0.9987561106681824), ('all', 0.9987560510635376), ('world', 0.998752236366272), ('israel', 0.9987504482269287), ('second', 0.9987483024597168)]
[('at', 0.999677300453186), ('around', 0.9996689558029175), ('sydney', 0.9996559619903564), ('north', 0.999655544757843), ('not', 0.999650239944458), ('before', 0.9996500611305237), ('an', 0.9996490478515625), ('in', 0.9996485114097595), ('australian', 0.999646008014679), ('which', 0.9996452927589417)]
[('with', 0.9999502897262573), ('and', 0.9999480843544006), ('his', 0.9999470710754395), ('which', 0.999945878982544), ('also', 0.9999457001686096), ('who', 0.9999422430992126), ('about', 0.9999409914016724), ('an', 0.9999402761459351), ('by', 0.9999399781227112), ('if', 0.9999396204948425)]
[('and', 0.9999417066574097), ('for', 0.9999378323554993), ('by', 0.9999364614486694), ('ove

In [24]:
# Visualising the Word Embeddings
# !pip install plotly
from sklearn.decomposition import IncrementalPCA    # initial reduction (NOTE: not referenced by the code in this cell)
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    """Project the model's word vectors to 2-D with t-SNE.

    Returns a tuple ``(x_vals, y_vals, labels)``: two lists holding the first
    and second t-SNE coordinate of every word, and a numpy array with the
    words themselves in the same order.
    """
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # Snapshot the vocabulary once so vectors and labels share one ordering.
    vocab_words = list(model.wv.vocab)
    vectors = np.asarray([model.wv[word] for word in vocab_words])
    labels = np.asarray(vocab_words)

    # Reduce using t-SNE with a fixed random_state.
    embedded = TSNE(n_components=num_dimensions, random_state=0).fit_transform(vectors)

    x_vals = [point[0] for point in embedded]
    y_vals = [point[1] for point in embedded]
    return x_vals, y_vals, labels


# Compute the 2-D coordinates and labels once; the plotting call below reuses them.
x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the words as a plotly text scatter.

    When `plot_in_notebook` is truthy the figure is rendered inline via
    `iplot`; otherwise it is written through `plot` with the filename
    'word-embedding-plot.html'.
    """
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot all points and annotate a random subsample with their labels.

    Up to 25 points are labelled. The `random` module is seeded with 0 so the
    same points are chosen on every call.
    """
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled data points (at most 25)
    #
    indices = list(range(len(labels)))
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size for vocabularies smaller than 25 words.
    sample_size = min(25, len(indices))
    selected_indices = random.sample(indices, sample_size)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))
    plt.show()

# The automatic backend selection below (plotly when get_ipython() exists,
# matplotlib otherwise) is commented out; matplotlib is used unconditionally.
# try:
#     get_ipython()
# except Exception:
#     plot_function = plot_with_matplotlib
# else:
#     plot_function = plot_with_plotly
plot_function = plot_with_matplotlib

plot_function(x_vals, y_vals, labels)

2020-06-03 06:32:22,126 : INFO : font search path ['/opt/conda/envs/tensorflow2_py3/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/ttf', '/opt/conda/envs/tensorflow2_py3/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/afm', '/opt/conda/envs/tensorflow2_py3/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/pdfcorefonts']
2020-06-03 06:32:23,170 : INFO : generated new fontManager


<Figure size 1200x1200 with 1 Axes>

In [26]:
# pyplot was only imported locally inside plot_with_matplotlib, so import it at
# notebook scope before calling show() again.
import matplotlib.pyplot as plt
plt.show()

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
def get_model():
    """Build and compile a minimal Keras model.

    The model takes inputs of shape (32,), applies a single Dense(1) layer and
    is compiled with the 'adam' optimizer and mean-squared-error loss.
    """
    input_layer = keras.Input(shape=(32,))
    prediction = keras.layers.Dense(1)(input_layer)
    simple_model = keras.Model(input_layer, prediction)
    simple_model.compile(optimizer='adam', loss='mean_squared_error')
    return simple_model

# Instantiate the Keras model (the name `model` was used for the Word2Vec
# model in the earlier part of this notebook).
model = get_model()

In [3]:
# Random inputs (128 x 32) and targets (128 x 1); no seed is set in this cell,
# so the values differ between runs.
test_input = np.random.random((128, 32))
test_target = np.random.random((128, 1))
# Fit with default settings on the random data.
model.fit(test_input, test_target)

Train on 128 samples


<tensorflow.python.keras.callbacks.History at 0x7fb8880dee10>

In [4]:
# Save the model under 'my_model' (the log shows assets written to my_model/assets).
model.save('my_model')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: my_model/assets


In [5]:

# Load the model back from the 'my_model' location written by the save cell.
reconstructed_model = keras.models.load_model('my_model')

In [7]:
# Check that the reloaded model reproduces the original model's predictions.
# Floating-point outputs are compared with a tolerance (assert_allclose)
# rather than bit-for-bit, so the check does not fail spuriously on tiny
# numeric differences after a save/load round trip.
np.testing.assert_allclose(
    model.predict(test_input),
    reconstructed_model.predict(test_input),
)

In [8]:
# Display the original model's predictions for the test inputs.
model.predict(test_input)

array([[-5.9197176e-01],
       [ 5.3354502e-01],
       [ 5.3221565e-01],
       [-2.4643463e-01],
       [ 2.9604721e-01],
       [-1.7099570e-01],
       [ 2.1050741e-01],
       [ 6.4309746e-02],
       [ 6.6867054e-01],
       [ 1.1164248e+00],
       [ 2.5894290e-01],
       [-4.5113632e-01],
       [ 3.9030150e-01],
       [ 1.0314107e+00],
       [-1.0560724e-01],
       [ 2.3955140e-02],
       [ 3.8627669e-01],
       [ 5.2383876e-01],
       [-3.2709050e-01],
       [-4.2951077e-01],
       [ 4.0141118e-01],
       [ 1.5115704e-01],
       [ 1.7143428e-01],
       [-1.9378874e-01],
       [ 2.8906655e-02],
       [ 1.8324424e-01],
       [ 2.2799170e-01],
       [ 7.0667975e-02],
       [-7.9695687e-02],
       [ 6.4121252e-01],
       [ 8.7026050e-03],
       [-8.8156843e-01],
       [ 2.2460826e-01],
       [-2.7920315e-01],
       [ 1.9006982e-01],
       [ 5.0772643e-01],
       [ 8.3085018e-01],
       [ 9.5130503e-01],
       [ 6.3061476e-02],
       [ 3.1734544e-01],


In [9]:
# Display the reloaded model's predictions for the same inputs, for comparison.
reconstructed_model.predict(test_input)

array([[-5.9197176e-01],
       [ 5.3354502e-01],
       [ 5.3221565e-01],
       [-2.4643463e-01],
       [ 2.9604721e-01],
       [-1.7099570e-01],
       [ 2.1050741e-01],
       [ 6.4309746e-02],
       [ 6.6867054e-01],
       [ 1.1164248e+00],
       [ 2.5894290e-01],
       [-4.5113632e-01],
       [ 3.9030150e-01],
       [ 1.0314107e+00],
       [-1.0560724e-01],
       [ 2.3955140e-02],
       [ 3.8627669e-01],
       [ 5.2383876e-01],
       [-3.2709050e-01],
       [-4.2951077e-01],
       [ 4.0141118e-01],
       [ 1.5115704e-01],
       [ 1.7143428e-01],
       [-1.9378874e-01],
       [ 2.8906655e-02],
       [ 1.8324424e-01],
       [ 2.2799170e-01],
       [ 7.0667975e-02],
       [-7.9695687e-02],
       [ 6.4121252e-01],
       [ 8.7026050e-03],
       [-8.8156843e-01],
       [ 2.2460826e-01],
       [-2.7920315e-01],
       [ 1.9006982e-01],
       [ 5.0772643e-01],
       [ 8.3085018e-01],
       [ 9.5130503e-01],
       [ 6.3061476e-02],
       [ 3.1734544e-01],
