# Run Universal Sentence Encoder

## Load required modules

In [15]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from scipy.spatial.distance import cosine

In [13]:
MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/2" 

In [14]:
EMBED = hub.Module(MODULE_URL)

## Get embeddings and cosine similarity

In [17]:
def get_embedding(session, input_placeholder, word, embedding_encoder):
    message_embeddings = session.run(embedding_encoder, feed_dict={input_placeholder: word})
    dist = cosine(np.array(message_embeddings[0]), np.array(message_embeddings[1]))

    return dist

In [6]:
def get_embedding_vector(session, word_placeholder, word, embedding_encoder):
    message_embeddings = session.run(embedding_encoder, feed_dict={word_placeholder: word})
    # dist = cosine(np.array(message_embeddings[0]), np.array(message_embeddings[1]))

    return message_embeddings

In [12]:
# Compute a representation for each message, showing various lengths supported.
# word = "knowledge graph"
# word2 = "ontology"
# word3 = "searchengine"
# word4 = "donald trump"
# word5 = "test staging today"
# sentence = "Entities and relations need to be defined well to build good graph representations."
# sentence2 = "General knowledge is a subject that I learnt in school and I think it is cool."
# paragraph = (
#     "Universal Sentence Encoder embeddings also support short paragraphs. "
#     "There is no hard limit on how long the paragraph is. Roughly, the longer "
#     "the more 'diluted' the embedding will be.")
messages = ["test", "netflix"]

In [10]:
input_placeholder = tf.placeholder(tf.string, shape=(None))
word_encodings = embed(input_placeholder)
# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    start = timer()
    message_embeddings = get_embedding_vector(session, input_placeholder, messages, word_encodings)
    end = timer()
    print("Time taken: {}".format(end-start))
    for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
        print("Message: {}".format(messages[i]))
        print("Embedding size: {}".format(len(message_embedding)))
        message_embedding_snippet = ", ".join(
            (str(x) for x in message_embedding[:3]))
        print("Embedding: [{}, ...]".format(message_embedding_snippet))
        dist = cosine(np.array(message_embeddings[0]), message_embedding)
        print("Cosine similarity: {}\n".format(1-dist))

Time taken: 0.3972410730057163
Message: knowledge graph
Embedding size: 512
Embedding: [0.007142378017306328, 0.012228659354150295, -0.0032539102248847485, ...]
Cosine similarity: 1.000000036336652

Message: Entities and relations need to be defined well to build good graph representations.
Embedding size: 512
Embedding: [0.04317520558834076, -0.03976033255457878, -0.03193525969982147, ...]
Cosine similarity: 0.43334046563711204

Message: General knowledge is a subject that I learnt in school and I think it is cool.
Embedding size: 512
Embedding: [-0.03692379966378212, 0.037080660462379456, 0.03148544952273369, ...]
Cosine similarity: 0.4512204099164938

Message: Universal Sentence Encoder embeddings also support short paragraphs. There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be.
Embedding size: 512
Embedding: [0.018790962174534798, 0.045365139842033386, -0.020010892301797867, ...]
Cosine similarity: 0.21234553781395893

