In [6]:
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import pandas as pd

In [7]:
def read_sts(path):
    df = pd.read_csv(path, sep='\n', header=None, names=['row'])
    df = pd.DataFrame(df.row.str.split('\t', 6).tolist(), columns=['', 'genre','filename','year','score','sentence1','sentence2'])
    df.drop(df.columns[[0,1,2,3]], inplace=True, axis=1)
    df["score"] = pd.to_numeric(df["score"])
    return df

sts_train = read_sts('data/sts_benchmark/sts-train.csv')
sts_dev = read_sts('data/sts_benchmark/sts-dev.csv')
sts_test = read_sts('data/sts_benchmark/sts-test.csv')

## Universal Sentence Encoder

In [8]:
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

INFO:tensorflow:Using /var/folders/4d/b1py937d21l9j91lbp5fr4_w0000gn/T/tfhub_modules to cache modules.


In [9]:
train_embs_1 = embed(sts_train['sentence1'].tolist())
train_embs_2 = embed(sts_train['sentence2'].tolist())

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [10]:
%%time

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    embs_1 = sess.run(train_embs_1)
    embs_2 = sess.run(train_embs_2)

CPU times: user 25.8 s, sys: 3.43 s, total: 29.3 s
Wall time: 26.3 s


In [11]:
cos_sims = cosine_similarity(embs_1, embs_2)

cos_sim = cos_sims.diagonal()

pearson = np.corrcoef(cos_sim.ravel(), sts_train['score'].values)

print('Correlation =', pearson[0, 1])

Correlation = 0.6950971155875105


## ELMO

In [12]:
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)

In [None]:
elmo_1 = elmo(sts_train['sentence1'].tolist(), signature="default", as_dict=True)["default"]
elmo_2 = elmo(sts_train['sentence2'].tolist(), signature="default", as_dict=True)["default"]

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
%%time

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    el_embs_1 = sess.run(elmo_1)
    el_embs_2 = sess.run(elmo_2)

In [None]:
cos_sims = cosine_similarity(el_embs_1, el_embs_2)

cos_sim = cos_sims.diagonal()

pearson = np.corrcoef(cos_sim.ravel(), sts_train['score'].values)

print('Correlation =', pearson[0, 1])

## Word2Vec

In [None]:
from gensim.models import word2vec

## Fasttext

In [3]:
from gensim.models import FastText

In [5]:
FastText.load_fasttext_format('data/fasttext/wiki-news-300d-1M.vec')

FileNotFoundError: [Errno 2] No such file or directory: 'data/fasttext/wiki-news-300d-1M.vec.bin'