In [1]:
import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_FastText
from web.evaluate import evaluate_similarity

In [12]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # Imports TF ops for preprocessing.

# Example passages to embed: two about a music album, one about a plant
# name, and two about solar eclipses.
sentences = [
    "Here We Go Then, You And I is a 1999 album by Norwegian pop artist Morten Abel. It was Abel's second CD as a solo artist.",
    "The album went straight to number one on the Norwegian album chart, and sold to double platinum.",
    "Ceylon spinach is a common name for several plants and may refer to: Basella alba Talinum fruticosum",
    "A solar eclipse occurs when the Moon passes between Earth and the Sun, thereby totally or partly obscuring the image of the Sun for a viewer on Earth.",
    "A partial solar eclipse occurs in the polar regions of the Earth when the center of the Moon's shadow misses the Earth.",
]

# Pull both models from TF Hub: the uncased-English BERT preprocessing model
# first, then the encoder published under the "experts/bert/pubmed" handle.
preprocess = hub.load(
    'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
)
bert = hub.load(
    'https://tfhub.dev/google/experts/bert/pubmed/2'
)

In [13]:
# Run the preprocessing model over the raw sentences to get the encoder inputs
bert_inputs = preprocess(sentences)

# Call the encoder in inference mode (training=False) and pick out the two
# entries of its result that are inspected below.
bert_outputs = bert(bert_inputs, training=False)
pooled_output, sequence_output = (
    bert_outputs['pooled_output'],
    bert_outputs['sequence_output'],
)

# Each heading is preceded by a blank line and followed by its value on the
# next line (sep='\n' puts the heading and the value on separate lines).
print('\nSentences:', sentences, sep='\n')
print('\nPooled output:', pooled_output, sep='\n')
print('\nSequence output:', sequence_output, sep='\n')



Sentences:
["Here We Go Then, You And I is a 1999 album by Norwegian pop artist Morten Abel. It was Abel's second CD as a solo artist.", 'The album went straight to number one on the Norwegian album chart, and sold to double platinum.', 'Ceylon spinach is a common name for several plants and may refer to: Basella alba Talinum fruticosum', 'A solar eclipse occurs when the Moon passes between Earth and the Sun, thereby totally or partly obscuring the image of the Sun for a viewer on Earth.', "A partial solar eclipse occurs in the polar regions of the Earth when the center of the Moon's shadow misses the Earth."]

Pooled output:
tf.Tensor(
[[ 0.16779432 -0.3931237   0.53747344 ...  0.58478045 -0.43332016
  -0.60146844]
 [ 0.4183169  -0.11058168  0.3771507  ...  0.42176116 -0.25798595
   0.09233482]
 [-0.55007124  0.36924237 -0.06870123 ... -0.5558867  -0.75570697
  -0.4532632 ]
 [ 0.05911509  0.08546973 -0.5964464  ... -0.55465823 -0.7894636
  -0.79851866]
 [ 0.30475447 -0.02681021 -0.61

In [11]:
# Second encoder, published under the "experts/bert/wiki_books" handle
bert2 = hub.load(
    'https://tfhub.dev/google/experts/bert/wiki_books/2'
)

# Reuse the already-computed `bert_inputs` (not defined in this cell, so it
# must exist in the kernel) and call the encoder in inference mode.
bert_outputs2 = bert2(bert_inputs, training=False)
pooled_output2, sequence_output2 = (
    bert_outputs2['pooled_output'],
    bert_outputs2['sequence_output'],
)

# Each heading is preceded by a blank line and followed by its value on the
# next line (sep='\n' puts the heading and the value on separate lines).
print('\nSentences:', sentences, sep='\n')
print('\nPooled output:', pooled_output2, sep='\n')
print('\nSequence output:', sequence_output2, sep='\n')


Sentences:
["Here We Go Then, You And I is a 1999 album by Norwegian pop artist Morten Abel. It was Abel's second CD as a solo artist.", 'The album went straight to number one on the Norwegian album chart, and sold to double platinum.', 'Ceylon spinach is a common name for several plants and may refer to: Basella alba Talinum fruticosum', 'A solar eclipse occurs when the Moon passes between Earth and the Sun, thereby totally or partly obscuring the image of the Sun for a viewer on Earth.', "A partial solar eclipse occurs in the polar regions of the Earth when the center of the Moon's shadow misses the Earth."]

Pooled output:
tf.Tensor(
[[ 0.7975976  -0.4858056   0.49781555 ... -0.34488398  0.3972774
  -0.20639606]
 [ 0.5712031  -0.41205284  0.7048917  ... -0.3518512   0.19032407
  -0.40418974]
 [-0.3572719   0.7708981   0.15756561 ...  0.44185397 -0.86448216
   0.04505212]
 [ 0.9107705   0.4150143   0.560635   ... -0.49263868  0.39640468
  -0.05036111]
 [ 0.90502894 -0.15505318  0.72

In [5]:
# Set up root logging: DEBUG threshold, "<time> <LEVEL>:<message>" records,
# with the time rendered as hour:minute:second using the 12-hour %I field.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S',
)

In [10]:
# Fetch the FastText embedding (warning: it might take a few minutes).
# NOTE: despite the `w_glove` name, this holds whatever fetch_FastText()
# returns; the name is kept because the evaluation loop refers to it.
# NOTE(review): the recorded output of this cell shows the download failing
# with "HTTP Error 403: Forbidden" — confirm the source URL is still valid.
w_glove = fetch_FastText()

Downloading data from https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec ...
Error while fetching file wiki.en.vec. Dataset fetching aborted.


HTTPError: HTTP Error 403: Forbidden

In [6]:
# Benchmark registry: task name -> dataset returned by the matching
# web.datasets.similarity fetcher. Fetchers run in the order listed.
tasks = dict([
    ("MEN", fetch_MEN()),
    ("WS353", fetch_WS353()),
    ("SIMLEX999", fetch_SimLex999()),
])

In [7]:
# Show the first pair and its score from every task as a quick sanity check
sample_template = "Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}"
for name, data in iteritems(tasks):
    first_pair = data.X[0]
    print(sample_template.format(name, first_pair[0], first_pair[1], data.y[0]))

Sample data from SIMLEX999: pair "old" and "new" is assigned score 1.58
Sample data from MEN: pair "sun" and "sunlight" is assigned score [ 10.]
Sample data from WS353: pair "love" and "sex" is assigned score 6.77


In [8]:
# Calculate results using the evaluate_similarity helper, once per task.
# print is called with parentheses around a single pre-formatted string: the
# old `print "..."` statement form is a SyntaxError on Python 3, while this
# form emits the same text on both Python 2 and Python 3 and matches the
# print(...) calls used in the other cells.
for name, data in iteritems(tasks):
    # `w_glove` is the embedding object loaded earlier in the notebook.
    score = evaluate_similarity(w_glove, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, score))



Spearman correlation of scores on SIMLEX999 0.370500357109
Spearman correlation of scores on MEN 0.737464696981
Spearman correlation of scores on WS353 0.521712569525
