In [20]:
import tensorflow as tf
import pandas as pd
import os


In [69]:
def load_2_sentences(s1,s2):
    """Wrap one sentence pair in a single-row DataFrame.

    The frame has the columns ``sent_1``, ``sent_2`` and ``sim``; ``sim`` is
    filled with the placeholder value ``0.0``.
    """
    column_names = ["sent_1", "sent_2", "sim"]
    single_row = (s1, s2, 0.0)
    return pd.DataFrame([single_row], columns=column_names)


In [18]:
def load_STS_dataset(file_path):
    """Read a tab-separated STS file into a DataFrame of scored sentence pairs.

    Every non-blank line is split on tabs: field 4 is parsed as the float
    similarity score and fields 5 and 6 are taken as the two sentences.

    Args:
        file_path: Path handed to ``tf.io.gfile.GFile``.

    Returns:
        A DataFrame with the columns ``sent_1``, ``sent_2`` and ``sim``.

    Raises:
        IndexError: If a non-blank line has fewer than seven tab-separated fields.
        ValueError: If the score field cannot be parsed as a float.
    """
    sent_pairs = []
    with tf.io.gfile.GFile(file_path, "r") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline at the end of the file)
                # carries no record; without this guard it raises IndexError.
                continue
            ts = stripped.split("\t")
            sent_pairs.append((ts[5], ts[6], float(ts[4])))
    return pd.DataFrame(sent_pairs, columns=["sent_1", "sent_2", "sim"])

In [21]:
# Load the dev and test splits; both paths are resolved against the current
# working directory, so the notebook must be started from the directory that
# contains `sentence-similarity/`.
STS_dev = load_STS_dataset(os.path.join(os.getcwd(),'sentence-similarity/data/stsbenchmark/sts-dev.csv'))
STS_test = load_STS_dataset(os.path.join(os.getcwd(),'sentence-similarity/data/stsbenchmark/sts-test.csv'))

In [22]:
# Preview the first five rows of the dev split.
STS_dev[:5]

Unnamed: 0,sent_1,sent_2,sim
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0
1,A young child is riding a horse.,A child is riding a horse.,4.75
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.0
3,A woman is playing the guitar.,A man is playing guitar.,2.4
4,A woman is playing the flute.,A man is playing a flute.,2.75


In [24]:
# Preview the first five rows of the test split.
STS_test[:5]

Unnamed: 0,sent_1,sent_2,sim
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


In [1]:
import gensim
import os
from gensim.models import Word2Vec, KeyedVectors
from threading import Semaphore
import pickle

In [2]:
# Load the saved vectors memory-mapped read-only.
# NOTE(review): this file is the one written by the `word2vec.save(...)` cell
# further down, so that cell has to have been run once before this one works
# on a fresh checkout.
model = KeyedVectors.load('data/word2vec/GoogleNews-vectors-gensim-normed.bin',mmap='r')

In [3]:
# Serialise the loaded vectors object to `model.pickle` using the highest
# pickle protocol available.
# NOTE(review): pickle files execute code on load — only unpickle this file
# from a trusted location.
with open('model.pickle','wb') as f:
    pickle.dump(model,f,pickle.HIGHEST_PROTOCOL)

In [5]:
# Point the normalised-vector attribute at the loaded vectors.
# NOTE(review): presumably valid because the file loaded above was saved after
# `init_sims(replace=True)`; the output under this cell looks like a warning,
# so confirm these attribute names are still supported by the installed gensim.
model.syn0norm = model.syn0

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


In [21]:
# Sanity check: list the nearest neighbours of 'hate' in the loaded vectors.
model.most_similar('hate')

[('despise', 0.6712517142295837),
 ('Hate', 0.6400400400161743),
 ('detest', 0.6179036498069763),
 ('hatred', 0.6156139969825745),
 ('hating', 0.6103581190109253),
 ('hates', 0.6091769933700562),
 ('HATE', 0.6020098328590393),
 ('dislike', 0.6013234853744507),
 ('love', 0.600395679473877),
 ('hated', 0.5922117233276367)]

In [2]:
# Location of the gzipped word2vec binary, relative to the current working directory.
PATH_TO_WORD2VEC = os.path.join(os.getcwd(),'data/GoogleNews-vectors-negative300.bin.gz')

In [3]:
# Parse the word2vec binary-format file into a gensim KeyedVectors object.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH_TO_WORD2VEC, binary=True)

In [4]:
# Precompute normalised vectors; per the gensim docs, replace=True keeps only
# the normalised vectors and drops the originals. This mutates `word2vec` in place.
word2vec.init_sims(replace=True)

In [6]:
# Persist the normalised vectors in gensim's native format; this is the file
# that the `KeyedVectors.load(..., mmap='r')` cell earlier in the notebook reads.
word2vec.save("data/word2vec/GoogleNews-vectors-gensim-normed.bin")

In [165]:
import csv

# Tab-separated frequency tables, resolved against the current working directory.
PATH_TO_FREQUENCIES_FILE = os.path.join(os.getcwd(),'sentence-similarity/data/frequencies.tsv')
PATH_TO_DOC_FREQUENCIES_FILE = os.path.join(os.getcwd(),'sentence-similarity/data/doc_frequencies.tsv')

def read_tsv(f):
    """Read a two-column, tab-separated file into a ``{key: count}`` dict.

    Args:
        f: Path of the file to read.

    Returns:
        A dict mapping the first column (as ``str``) to the second column
        converted with ``int``. If a key occurs more than once, the last
        occurrence wins.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
        ValueError: If the second column is not a valid integer.
    """
    frequencies = {}
    # The csv module expects files opened with newline="" so that it can do
    # its own newline handling.
    with open(f, newline="") as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader:
            if not row:
                # csv.reader yields [] for an empty line (e.g. a trailing
                # newline); skip it instead of failing on row[0].
                continue
            frequencies[row[0]] = int(row[1])

    return frequencies
        
# Term-frequency table (passed as `freqs` to the SIF benchmark) and
# document-frequency table (passed as `doc_freqs` to the averaging benchmark).
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
# run_avg_benchmark reads this entry as N, the total document count in its IDF term.
# NOTE(review): hard-coded magic number — presumably the size of the corpus that
# doc_frequencies.tsv was built from; confirm and record the source.
doc_frequencies["NUM_DOCS"] = 1288431

In [164]:
# Benchmarks: averaged word embeddings (optionally tf-idf weighted) and Smooth Inverse Frequency (SIF)
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter
import math
def _average_embedding(tokens, model, doc_freqs, num_docs):
    """Average the vectors of the distinct ``tokens`` into one ``(1, dim)`` row.

    When ``doc_freqs`` is truthy each distinct token is weighted by
    ``count * log(num_docs / (doc_freq + 1))``; otherwise all distinct tokens
    are weighted equally.
    """
    token_counts = Counter(tokens)
    weights = None
    if doc_freqs:
        weights = [count * math.log(num_docs / (doc_freqs.get(token, 0) + 1))
                   for token, count in token_counts.items()]
    vectors = [model[token] for token in token_counts]
    return np.average(vectors, axis=0, weights=weights).reshape(1, -1)


def run_avg_benchmark(sentences1, sentences2, model=None, use_stoplist=False, doc_freqs=None):
    """Score sentence pairs by the cosine similarity of averaged word vectors.

    Args:
        sentences1: Sentence objects exposing ``tokens`` and ``tokens_without_stop``.
        sentences2: Sentence objects paired position-by-position with ``sentences1``.
        model: Word-vector lookup supporting ``token in model`` and
            ``model[token]``. Required despite the ``None`` default.
        use_stoplist: If true, use ``tokens_without_stop`` instead of ``tokens``.
        doc_freqs: Optional mapping of token -> document frequency that also
            holds the total document count under ``"NUM_DOCS"``. When given
            (and non-empty) the average is tf-idf weighted; otherwise it is a
            plain average over the distinct tokens.

    Returns:
        A list with one similarity per pair. A pair in which either sentence
        has no token known to ``model`` gets the score ``0``.

    Raises:
        TypeError: If ``model`` is ``None``.
        KeyError: If ``doc_freqs`` is non-empty but lacks ``"NUM_DOCS"``.
    """
    if model is None:
        # Fail with a clear message instead of "argument of type 'NoneType' is
        # not iterable" from the membership test below.
        raise TypeError("run_avg_benchmark requires a word-vector `model`")

    # Use one truthiness test everywhere so that an empty doc_freqs behaves
    # like "no doc_freqs" instead of raising KeyError on "NUM_DOCS".
    N = doc_freqs["NUM_DOCS"] if doc_freqs else None

    sims = []
    for (sent1, sent2) in zip(sentences1, sentences2):

        tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
        tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

        # Out-of-vocabulary tokens cannot be embedded, so drop them.
        tokens1 = [token for token in tokens1 if token in model]
        tokens2 = [token for token in tokens2 if token in model]

        if len(tokens1) == 0 or len(tokens2) == 0:
            sims.append(0)
            continue

        embedding1 = _average_embedding(tokens1, model, doc_freqs, N)
        embedding2 = _average_embedding(tokens2, model, doc_freqs, N)

        sim = cosine_similarity(embedding1, embedding2)[0][0]
        sims.append(sim)

    return sims

def remove_first_principal_component(X):
    """Return ``X`` with each row's projection onto its first SVD component removed.

    A one-component ``TruncatedSVD`` (7 iterations, ``random_state=0``) is
    fitted on ``X`` itself; the input is not modified.
    """
    decomposition = TruncatedSVD(n_components=1, n_iter=7, random_state=0).fit(X)
    direction = decomposition.components_
    # Per-row projection coefficients, broadcast against the single component row.
    coefficients = X.dot(direction.transpose())
    return X - coefficients * direction


def run_sif_benchmark(sentences1, sentences2, model, freqs={}, use_stoplist=False, a=0.001):
    """Score sentence pairs with Smooth Inverse Frequency (SIF) embeddings.

    Each sentence is embedded as the weighted average of its in-vocabulary
    token vectors, with weight ``a / (a + freq(token) / total_freq)``. All
    embeddings of this call are then passed through
    ``remove_first_principal_component`` and every pair is scored by cosine
    similarity.

    Note: the removed component is fitted on the embeddings of this call only,
    so the scores depend on the whole batch and are not meaningful for very
    small batches.

    Args:
        sentences1: Sentence objects exposing ``tokens`` and ``tokens_without_stop``.
        sentences2: Sentence objects paired position-by-position with ``sentences1``.
        model: Word-vector lookup supporting ``token in model`` and ``model[token]``.
        freqs: Mapping of token -> corpus frequency. It is only read, never
            mutated. When it is empty every token gets the weight 1.
        use_stoplist: If true, use ``tokens_without_stop`` instead of ``tokens``.
        a: SIF smoothing parameter.

    Returns:
        A list with one similarity per pair. A pair in which either sentence
        has no token known to ``model`` gets the score ``0``, matching
        ``run_avg_benchmark``.
    """
    total_freq = sum(freqs.values())

    def sif_weight(token):
        # With no frequency information the relative frequency is taken as 0,
        # which avoids dividing by total_freq == 0 and yields a weight of 1.
        relative_freq = freqs.get(token, 0) / total_freq if total_freq else 0.0
        return a / (a + relative_freq)

    def embed(tokens):
        return np.average([model[token] for token in tokens], axis=0,
                          weights=[sif_weight(token) for token in tokens])

    embeddings = []
    # One flag per input pair: True when both sentences could be embedded.
    has_embedding = []

    # SIF requires us to first collect all sentence embeddings and then perform
    # common component analysis.
    for (sent1, sent2) in zip(sentences1, sentences2):

        tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
        tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

        tokens1 = [token for token in tokens1 if token in model]
        tokens2 = [token for token in tokens2 if token in model]

        if not tokens1 or not tokens2:
            # np.average cannot average zero vectors; keep the pair's slot and
            # score it 0 below instead of crashing.
            has_embedding.append(False)
            continue

        embeddings.append(embed(tokens1))
        embeddings.append(embed(tokens2))
        has_embedding.append(True)

    if not embeddings:
        return [0] * len(has_embedding)

    embeddings = remove_first_principal_component(np.array(embeddings))
    # Rows 2*idx and 2*idx+1 belong to the idx-th embeddable pair.
    pair_sims = iter([cosine_similarity(embeddings[idx*2].reshape(1, -1),
                                        embeddings[idx*2+1].reshape(1, -1))[0][0]
                      for idx in range(len(embeddings) // 2)])

    return [next(pair_sims) if embedded else 0 for embedded in has_embedding]

In [58]:
import nltk

# English stop words from NLTK, held in a set for fast membership tests in Sentence.
STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    """A sentence together with its lower-cased tokens.

    Instances expose:
      * ``raw`` - the sentence exactly as passed in
      * ``tokens`` - lower-cased ``nltk.word_tokenize`` tokens
      * ``tokens_without_stop`` - ``tokens`` minus the entries found in ``STOP``
    """

    def __init__(self, sentence):
        self.raw = sentence
        # Curly single quotes are mapped to the plain apostrophe before tokenising.
        quote_table = str.maketrans("‘’", "''")
        words = nltk.word_tokenize(sentence.translate(quote_table))
        self.tokens = [word.lower() for word in words]
        self.tokens_without_stop = [word for word in self.tokens if word not in STOP]

In [65]:
import scipy
def run_experiment(df, benchmarks):
    """Run every benchmark on a dataset and correlate its scores with the gold scores.

    Args:
        df: DataFrame with the columns ``sent_1``, ``sent_2`` and ``sim``.
        benchmarks: Iterable of ``(label, method)`` pairs; ``method`` is called
            as ``method(sentences1, sentences2)`` with lists of ``Sentence``
            objects and must return one score per pair.

    Returns:
        A tuple ``(pearson_cors, spearman_cors)`` of lists holding one
        correlation coefficient per benchmark, in the order given.

    Side effects:
        Prints ``label`` and the Pearson correlation for every benchmark.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded as an attribute.
    from scipy import stats

    sentences1 = [Sentence(s) for s in df['sent_1']]
    sentences2 = [Sentence(s) for s in df['sent_2']]
    gold_scores = df['sim']

    pearson_cors, spearman_cors = [], []
    for label, method in benchmarks:
        sims = method(sentences1, sentences2)
        pearson_correlation = stats.pearsonr(sims, gold_scores)[0]
        print(label, pearson_correlation)
        pearson_cors.append(pearson_correlation)
        spearman_correlation = stats.spearmanr(sims, gold_scores)[0]
        spearman_cors.append(spearman_correlation)

    return pearson_cors, spearman_cors

In [60]:
import functools as ft

# Benchmarks to evaluate, as (label, callable) pairs. functools.partial binds
# the frequency table and word vectors so that run_experiment only has to pass
# the two sentence lists.
benchmarks = [("SIF-W2V", ft.partial(run_sif_benchmark, freqs=frequencies, model=word2vec, use_stoplist=False))]

In [157]:
# Correlations per dataset: each value is a list with one entry per benchmark.
# run_experiment also prints the Pearson correlation of every benchmark.
pearson_results, spearman_results = {}, {}
pearson_results["STS-DEV"], spearman_results["STS-DEV"] = run_experiment(STS_dev, benchmarks)
pearson_results["STS-TEST"], spearman_results["STS-TEST"] = run_experiment(STS_test, benchmarks)  

SIF-W2V 0.767571761029641
SIF-W2V 0.68976121890589


In [183]:
# Ad-hoc comparison of one sentence pair across every benchmark variant.
# NOTE(review): run_sif_benchmark fits its common-component removal on only the
# two embeddings passed here, which is presumably why `a` and `b` come out at
# about -1 in the output below; SIF scores for a single pair should not be
# read as meaningful similarities.
# single_sents = load_2_sentences("A man with a hard hat is dancing.","A man wearing a hard hat is dancing.")
s1 = [Sentence("A woman is playing the guitar.")]
s2 = [Sentence("A man is playing the guitar.")]

# a / b: SIF with and without the stop list.
a = run_sif_benchmark(sentences1=s1,sentences2=s2,freqs=frequencies,model=word2vec,use_stoplist=True)

b = run_sif_benchmark(sentences1=s1,sentences2=s2,freqs=frequencies,model=word2vec,use_stoplist=False)

# c / d: tf-idf weighted average with and without the stop list.
c =run_avg_benchmark(sentences1=s1,sentences2=s2,model=word2vec,use_stoplist=True,doc_freqs=doc_frequencies)

d =run_avg_benchmark(sentences1=s1,sentences2=s2,model=word2vec,use_stoplist=False,doc_freqs=doc_frequencies)

# e / f: unweighted average with and without the stop list.
e =run_avg_benchmark(sentences1=s1,sentences2=s2,model=word2vec,use_stoplist=True)

f =run_avg_benchmark(sentences1=s1,sentences2=s2,model=word2vec,use_stoplist=False)

In [184]:
# Show the six single-pair scores side by side, in the order: SIF (stop list
# on/off), tf-idf average (on/off), plain average (on/off).
print(a,b,c,d,e,f)

[-1.0000000000000004] [-0.9999999999999996] [0.9541760896030935] [0.9550532493934184] [0.9501258] [0.9625326]
