In [1]:
import tensorflow as tf
import pandas as pd
import os

In [2]:
def load_2_sentences(s1,s2):
    """Wrap one sentence pair in a single-row DataFrame.

    The frame has the columns "sent_1", "sent_2" and "sim"; the similarity
    is filled with a 0.0 placeholder.
    """
    record = {"sent_1": s1, "sent_2": s2, "sim": float(0)}
    return pd.DataFrame([record], columns=["sent_1", "sent_2", "sim"])


In [3]:
def load_STS_dataset(file_path):
    """Read a tab-separated STS file into a DataFrame.

    Every line is stripped and split on tabs. Field 4 is parsed as the float
    similarity score; fields 5 and 6 are taken as the two sentences.

    Returns a DataFrame with the columns "sent_1", "sent_2" and "sim".
    """
    with tf.io.gfile.GFile(file_path, "r") as handle:
        # Lazily split each line, then pick out the three fields we keep.
        split_lines = (raw.strip().split("\t") for raw in handle)
        records = [(fields[5], fields[6], float(fields[4])) for fields in split_lines]
    return pd.DataFrame(records, columns=["sent_1", "sent_2", "sim"])

In [4]:
# Show the working directory; the data paths in this notebook are built from os.getcwd().
os.getcwd()

'/Users/i342202/projects/sentence-similarity'

In [5]:
# Load the STS benchmark dev and test files from data/stsbenchmark under the
# working directory into DataFrames (columns: sent_1, sent_2, sim).
STS_dev = load_STS_dataset(os.path.join(os.getcwd(),'data/stsbenchmark/sts-dev.csv'))
STS_test = load_STS_dataset(os.path.join(os.getcwd(),'data/stsbenchmark/sts-test.csv'))

In [6]:
# Preview the first six rows of the dev set.
STS_dev[:6]

Unnamed: 0,sent_1,sent_2,sim
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0
1,A young child is riding a horse.,A child is riding a horse.,4.75
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.0
3,A woman is playing the guitar.,A man is playing guitar.,2.4
4,A woman is playing the flute.,A man is playing a flute.,2.75
5,A woman is cutting an onion.,A man is cutting onions.,2.615


In [7]:
# Preview the first five rows of the test set.
STS_test[:5]

Unnamed: 0,sent_1,sent_2,sim
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


In [15]:
import nltk

# English stop words from NLTK's stopwords corpus, kept as a set for membership tests.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be installed locally -- confirm.
STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    """A raw sentence together with its lower-cased token lists."""

    def __init__(self, sentence):
        """Tokenize `sentence` and store the results on the instance.

        Sets `raw` (the unmodified input), `tokens` (lower-cased output of
        nltk.word_tokenize after curly single quotes are replaced by "'")
        and `tokens_without_stop` (those tokens that are not in STOP).
        """
        self.raw = sentence

        # Map typographic single quotes to the ASCII apostrophe before tokenizing.
        straightened = sentence
        for curly_quote in ("‘", "’"):
            straightened = straightened.replace(curly_quote, "'")

        lowered, kept = [], []
        for token in nltk.word_tokenize(straightened):
            token = token.lower()
            lowered.append(token)
            if token not in STOP:
                kept.append(token)

        self.tokens = lowered
        self.tokens_without_stop = kept

In [9]:
import gensim
import os
from gensim.models import Word2Vec, KeyedVectors
from threading import Semaphore
import pickle

In [10]:
# Load the GoogleNews word2vec vectors from the gzipped binary file.
# limit=500000 is passed to gensim to cap how many vectors are read.
path_word2vec =os.path.join(os.getcwd(),'data/word2vec_gz/GoogleNews-vectors-negative300.bin.gz')
word2vec = gensim.models.KeyedVectors.load_word2vec_format(path_word2vec, binary=True,limit=500000)

In [16]:
# Build the paths from separate components so os.path.join inserts the right
# separator on every platform; a literal "data\\frequencies.tsv" component
# only resolves to data/frequencies.tsv on Windows.
PATH_TO_FREQUENCIES_FILE = os.path.join(os.getcwd(), 'data', 'frequencies.tsv')
PATH_TO_DOC_FREQUENCIES_FILE = os.path.join(os.getcwd(), 'data', 'doc_frequencies.tsv')

In [17]:
import csv
def read_tsv(f):
    """Read a two-column, tab-separated file into a dict.

    Args:
        f: path of the TSV file. Column 0 is used as the key and column 1 is
            parsed with int().

    Returns:
        dict mapping each row's first field to the integer in its second
        field. When a key occurs more than once the last row wins. Blank
        lines are ignored.

    Raises:
        IndexError: if a non-empty row has fewer than two fields.
        ValueError: if the second field cannot be parsed by int().
    """
    frequencies = {}
    # newline="" is what the csv module asks for when it is handed a file
    # object; the explicit encoding keeps the result independent of the
    # platform's default locale encoding.
    with open(f, newline="", encoding="utf-8") as tsv:
        for row in csv.reader(tsv, delimiter="\t"):
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            frequencies[row[0]] = int(row[1])
    return frequencies

In [18]:
# Load the two frequency tables from their TSV files.
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
# run_avg_benchmark reads the corpus size from the "NUM_DOCS" key of the
# doc_freqs mapping it is given.
# NOTE(review): 1288431 is presumably the number of documents the document
# frequencies were counted over -- confirm against the data source.
doc_frequencies["NUM_DOCS"] = 1288431

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter
import math
def run_avg_benchmark(sentences1, sentences2, model=None, use_stoplist=False, doc_freqs=None):
    """Score sentence pairs by the cosine similarity of averaged word vectors.

    Args:
        sentences1, sentences2: parallel iterables of objects exposing
            `tokens` and `tokens_without_stop` lists.
        model: word-vector lookup supporting `token in model` and
            `model[token]`.
        use_stoplist: if True, use `tokens_without_stop` instead of `tokens`.
        doc_freqs: optional dict of document frequencies that also holds the
            corpus size under "NUM_DOCS". When given, each distinct token is
            weighted by count * log(N / (df + 1)); otherwise every distinct
            token gets the same weight.

    Returns:
        A list with one similarity per pair. Pairs in which either sentence
        has no token found in `model` get 0.
    """
    use_idf = doc_freqs is not None
    num_docs = doc_freqs["NUM_DOCS"] if use_idf else None

    def embed(tokens):
        """Return the (optionally IDF-weighted) mean vector as a 1 x dim array."""
        counts = Counter(tokens)
        vectors = [model[token] for token in counts]
        weights = None
        if use_idf:
            weights = [counts[token] * math.log(num_docs / (doc_freqs.get(token, 0) + 1))
                       for token in counts]
        try:
            mean_vector = np.average(vectors, axis=0, weights=weights)
        except ZeroDivisionError:
            # np.average refuses weights that sum to zero (e.g. the only
            # token has df + 1 == N, so its weight is log(1) == 0). Use the
            # unweighted mean instead of aborting the whole benchmark.
            mean_vector = np.average(vectors, axis=0)
        return mean_vector.reshape(1, -1)

    sims = []
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
        tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

        # Only tokens the model has a vector for can contribute.
        tokens1 = [token for token in tokens1 if token in model]
        tokens2 = [token for token in tokens2 if token in model]

        if not tokens1 or not tokens2:
            sims.append(0)
            continue

        sims.append(cosine_similarity(embed(tokens1), embed(tokens2))[0][0])

    return sims

In [12]:
import scipy
def run_experiment(df, benchmarks):
    """Evaluate similarity methods against the gold scores in `df`.

    Args:
        df: DataFrame with "sent_1", "sent_2" and "sim" columns.
        benchmarks: iterable of (label, method) pairs. Each method is called
            as method(sentences1, sentences2) with two lists of Sentence
            objects and must return one score per pair.

    Returns:
        (pearson_cors, spearman_cors): two lists holding one correlation per
        benchmark, in benchmark order. Each Pearson value is also printed
        next to its label.
    """
    # A bare `import scipy` does not guarantee that the `stats` subpackage
    # is loaded on every SciPy version, so import it explicitly here.
    from scipy import stats

    sentences1 = [Sentence(s) for s in df['sent_1']]
    sentences2 = [Sentence(s) for s in df['sent_2']]
    gold = df['sim']

    pearson_cors, spearman_cors = [], []
    for label, method in benchmarks:
        sims = method(sentences1, sentences2)
        pearson_correlation = stats.pearsonr(sims, gold)[0]
        print(label, pearson_correlation)
        pearson_cors.append(pearson_correlation)
        spearman_cors.append(stats.spearmanr(sims, gold)[0])

    return pearson_cors, spearman_cors

In [16]:
import functools as ft

# Each benchmark is a (label, callable) pair. functools.partial pre-binds the
# model and stop-list option so run_experiment can call the method with just
# the two sentence lists.
# NOTE(review): no doc_freqs is bound here, so this entry runs the unweighted
# average in run_avg_benchmark even though it is labelled "SIF-W2V" -- confirm
# the label is intended.
benchmarks = [("SIF-W2V", ft.partial(run_avg_benchmark, model=word2vec, use_stoplist=False))]

In [17]:
# Run every benchmark on the dev and test sets and keep the per-benchmark
# Pearson and Spearman correlations, keyed by dataset name.
pearson_results, spearman_results = {}, {}
pearson_results["STS-DEV"], spearman_results["STS-DEV"] = run_experiment(STS_dev, benchmarks)
pearson_results["STS-TEST"], spearman_results["STS-TEST"] = run_experiment(STS_test, benchmarks)  

SIF-W2V 0.7015813358771543
SIF-W2V 0.60572531231956


In [6]:
# NOTE(review): init_sims(replace=True) presumably L2-normalizes the vectors in
# place, which would change what later runs of the benchmark cells see --
# confirm against the installed gensim version.
word2vec.init_sims(replace=True)
# Persist the vectors in gensim's native format (path is relative to the working directory).
word2vec.save("data/word2vec/word2vec.bin")

In [19]:
# Reload the vectors saved to data/word2vec/word2vec.bin, opened with mmap='r'.
model_path = os.path.join(os.getcwd(),'data/word2vec/word2vec.bin')
model = KeyedVectors.load(model_path, mmap='r')
# Point syn0norm at syn0 itself.
# NOTE(review): presumably this tells gensim the stored vectors are already
# normalized so it does not recompute them -- confirm; the captured output
# under this cell echoes this line, which looks like a deprecation warning.
model.syn0norm = model.syn0

  model.syn0norm = model.syn0
  model.syn0norm = model.syn0
