In [1]:
import nltk

In [2]:
class Sentence:
    def __init__(self, s):
        self.raw = s
        normalized_sentence = s.replace("‘", "'").replace("’", "'")
        self.tokens = [t.lower() for t in nltk.word_tokenize(normalized_sentence)]

In [3]:
import math
from collections import Counter

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def run_avg_benchmark(sentences1, sentences2, model=None):

    sim = 0

    tokens1 = sentences1.tokens
    tokens2 = sentences2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

    if len(tokens1) == 0 or len(tokens2) == 0:
        return sim

    tokfreqs1 = Counter(tokens1)
    tokfreqs2 = Counter(tokens2)

    weights1 = None
    weights2 = None

    embedding1 = np.average(
        [model[token] for token in tokfreqs1], axis=0, weights=weights1
    ).reshape(1, -1)
    embedding2 = np.average(
        [model[token] for token in tokfreqs2], axis=0, weights=weights2
    ).reshape(1, -1)

    sim = cosine_similarity(embedding1, embedding2)[0][0]
    return sim

In [11]:
def run_wmd_benchmark(sentences1, sentences2, model):
    sim = 0

    tokens1 = sentences1.tokens
    tokens2 = sentences2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

    if len(tokens1) == 0 or len(tokens2) == 0:
        tokens1 = [token for token in sent1.tokens if token in model]
        tokens2 = [token for token in sent2.tokens if token in model]

    sim = model.wmdistance(tokens1, tokens2)

    return sim

In [26]:
import os

model_path = os.path.join(os.getcwd(), "data/word2vec/word2vec.bin")
word2vec_path = os.path.join(
    os.getcwd(), "data/word2vec_gz/GoogleNews-vectors-negative300.bin.gz"
)

In [47]:
from gensim.models import KeyedVectors

model = KeyedVectors.load(model_path, mmap="r")
model.syn0norm = model.syn0

  model.syn0norm = model.syn0
  model.syn0norm = model.syn0


In [48]:
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [49]:
s1 = Sentence("A man is smoking.")
s2 = Sentence("A man is liking.")

In [52]:
s1 = Sentence("A plane is taking off.")
s2 = Sentence("An air plane is taking off.")

In [55]:
s1 = Sentence("A man is playing a flute.")
s2 = Sentence("A man is playing a bamboo flute.")

In [56]:
print(run_avg_benchmark(s1, s2, model))
print(run_avg_benchmark(s1, s2, word2vec))

0.9303886
0.8953323


In [57]:
print(run_wmd_benchmark(s1, s2, model))
print(run_wmd_benchmark(s1, s2, word2vec))

0.268616868273288
0.8011494848677397
