In [74]:
import re
import os
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from scipy.linalg import svd
import nltk
import sys
from functools import reduce

In [194]:
class word2prob(object):
    """Map words to their probabilities.

    Probabilities are relative frequencies computed from a word-count file.
    All lookups are case-insensitive because words are lower-cased on load
    and on access.
    """

    def __init__(self, count_fn):
        """Initialize a word2prob object.

        Args:
            count_fn: word count file name. Every non-blank line must hold
                exactly two whitespace-separated fields: a word and its
                integer count.

        Raises:
            ValueError: if a non-blank line does not have exactly two fields,
                if a count is not an integer, or if the file has no entries.
        """
        counts = {}
        total = 0.0

        # The context manager guarantees the handle is released even when a
        # malformed line raises part-way through the file.
        with open(count_fn) as count_file:
            for line in count_file:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue

                k, v = line.split()
                v = int(v)
                k = k.lower()

                # NOTE: a word that repeats after lower-casing overwrites the
                # earlier count here, while `total` still sums every line.
                counts[k] = v
                total += v

        self.prob = {k: v / total for k, v in counts.items()}
        # Fallback probability handed out for out-of-vocabulary words.
        self.min_prob = min(self.prob.values())
        # Sum of all counts read from the file (kept as a float).
        self.count = total

    def __getitem__(self, w):
        """Return the probability of `w`, or the smallest known probability if unseen."""
        return self.prob.get(w.lower(), self.min_prob)

    def __contains__(self, w):
        """Return True if the lower-cased word is in the vocabulary."""
        return w.lower() in self.prob

    def __len__(self):
        """Return the number of distinct (lower-cased) words."""
        return len(self.prob)

    def vocab(self):
        """Return an iterator over the (lower-cased) vocabulary words."""
        return iter(self.prob.keys())

In [195]:
class word2vec(object):
    """Map words to their embeddings."""

    def __init__(self, vector_fn):
        """Initialize a word2vec object.

        Args:
            vector_fn: embedding file name (UTF-8 text, one word per line:
                the word followed by its whitespace-separated float values).

        Raises:
            ValueError: if an embedding value cannot be parsed as a float.
        """
        self.vectors = {}

        # Use a context manager so the (potentially large) file is closed as
        # soon as loading finishes or fails.
        with open(vector_fn, 'r', encoding="utf-8") as vector_file:
            for line in vector_file:
                fields = line.split()

                # Skip blank lines, and skip any two-field line: that is the
                # shape of the optional header at the top of the file.
                if not fields or len(fields) == 2:
                    continue

                word = fields[0]
                embedding = np.array([float(val) for val in fields[1:]])
                self.vectors[word] = embedding

    def __getitem__(self, w):
        """Return the embedding of `w`; raises KeyError for unknown words."""
        return self.vectors[w]

    def __contains__(self, w):
        """Return True if `w` (case-sensitive) has an embedding."""
        return w in self.vectors

    def _print(self):
        """Debug helper: print the whole word -> embedding table."""
        print(self.vectors)

In [212]:
class uSIF(object):
    """Embed sentences using unsupervised smoothed inverse frequency."""

    # Matches any token containing at least one ASCII letter or digit,
    # i.e. anything that is not pure punctuation.
    _NOT_PUNC = re.compile('.*[A-Za-z0-9].*')

    def __init__(self, vec, prob, n=11, m=5):
        """Initialize a uSIF object.

        Variable names (e.g., alpha, a) all carry over from the paper.

        Args:
            vec: word2vec object (must support `word in vec` and `vec[word]`).
            prob: word2prob object (must support `len`, `vocab()` and `prob[word]`).
            n: expected random walk length. This is the avg sentence length, which
                should be estimated from a large representative sample. For STS
                tasks, n ~ 11. n should be a positive integer.
            m: number of common discourse vectors (in practice, no more than 5 needed)

        Raises:
            TypeError: if n is not a positive integer.
            ZeroDivisionError: if no vocabulary word has a probability above
                the threshold derived from n (alpha would be 0).
        """
        self.vec = vec
        self.m = m

        if not (isinstance(n, int) and n > 0):
            raise TypeError("n should be a positive integer")

        # Embedding dimensionality, used for the vector returned for sentences
        # with no known tokens. It is read from the word table when `vec`
        # exposes one as a dict; otherwise the historical default of 300 is kept.
        table = getattr(vec, 'vectors', None)
        if isinstance(table, dict) and table:
            self.dim = len(next(iter(table.values())))
        else:
            self.dim = 300

        vocab_size = float(len(prob))
        threshold = 1 - (1 - 1/vocab_size) ** n
        # alpha: fraction of the vocabulary whose probability exceeds the threshold.
        alpha = sum(1 for w in prob.vocab() if prob[w] > threshold) / vocab_size
        Z = 0.5 * vocab_size
        self.a = (1 - alpha)/(alpha * Z)

        # Per-word weight; rarer words (smaller prob[word]) get larger weights.
        self.weight = lambda word: (self.a / (0.5 * self.a + prob[word]))

    def _tokenize(self, sentence):
        """Split a sentence into normalised tokens that have an embedding.

        Args:
            sentence: a sentence (string)

        Returns:
            A list of lower-cased tokens, with hyphenated words split into
            their parts and everything missing from `self.vec` removed.
        """
        def preprocess(t):
            t = t.lower().strip("';.:()").strip('"')
            return 'not' if t == "n't" else t

        # Punctuation-only tokens are dropped before normalisation.
        tokens = [preprocess(t) for t in nltk.word_tokenize(sentence)
                  if self._NOT_PUNC.match(t)]
        # Split on hyphens and flatten in one linear pass.
        pieces = [piece for t in tokens for piece in re.split(r'[-]', t)]
        return [piece for piece in pieces if piece in self.vec]

    def _average_tokens(self, tokens):
        """Combine the embeddings of `tokens` into a single sentence vector.

        Args:
            tokens: list of words that are all present in `self.vec`

        Returns:
            A 1-D numpy array. For an empty token list this is a vector of
            length `self.dim` filled with `self.a`.
        """
        # if no parseable tokens, return a vector of a's
        if not tokens:
            return np.zeros(self.dim) + self.a

        v_t = np.array([self.vec[t] for t in tokens])
        # Rescale each embedding dimension (column) by its norm taken across
        # the sentence's tokens (axis=0).
        v_t = v_t * (1.0 / np.linalg.norm(v_t, axis=0))
        weights = np.array([self.weight(t) for t in tokens])
        # Weight every token row, then average over tokens.
        return np.mean(weights[:, np.newaxis] * v_t, axis=0)

    def _to_vec(self, sentence):
        """Vectorize a given sentence.

        Args:
            sentence: a sentence (string)

        Returns:
            A 1-D numpy array: the weighted average of the sentence's token
            embeddings (before common-component removal).
        """
        return self._average_tokens(self._tokenize(sentence))

    def embed(self, sentences):
        """Embed a list of sentences.

        Args:
            sentences: a list of sentences (strings)

        Returns:
            A list with one 1-D numpy array per input sentence. When
            `self.m > 0`, the weighted projections onto the first `m`
            components found by TruncatedSVD are subtracted from each vector.
        """
        vectors = [self._to_vec(sentence) for sentence in sentences]
        if self.m == 0:
            return vectors

        decomposition = TruncatedSVD(n_components=self.m, random_state=0).fit(vectors)
        squared_singular = decomposition.singular_values_ ** 2
        total = squared_singular.sum()

        # remove the weighted projections on the common discourse vectors
        for i in range(self.m):
            lambda_i = squared_singular[i] / total
            pc = decomposition.components_[i]
            vectors = [v_s - lambda_i * (v_s.dot(pc.transpose()) * pc)
                       for v_s in vectors]

        return vectors

In [213]:
def test_STS(model):
    """Test the performance on the STS tasks and print out the results.

    Expected results:
        STS2012: 0.683
        STS2013: 0.661
        STS2014: 0.784
        STS2015: 0.790
        SICK2014: 0.735
        STSBenchmark: 0.795

    Args:
        model: a uSIF object
    """ 
    test_dirs = [
        'STS/STS-data/STS2012-gold/',
        'STS/STS-data/STS2013-gold/',
        'STS/STS-data/STS2014-gold/',
        'STS/STS-data/STS2015-gold/',
        'STS/SICK-data/',
        'STSBenchmark/'
    ]

    for td in test_dirs:
        test_fns = list(filter(lambda fn: '.input.' in fn and fn.endswith('txt'), os.listdir(td)))
        scores = []

        for fn in test_fns:
            sentences = re.split(r'\t|\n', open(td + fn, encoding="utf-8").read().strip())
            vectors = model.embed(sentences)
            y_hat = [1 - cosine(vectors[i], vectors[i+1]) for i in range(0, len(vectors), 2)]
#             y = map(float, open(td + fn.replace('input', 'gs')).read().strip().split('\n'))
            y = [float(x) for x in open(td + fn.replace('input', 'gs')).read().strip().split('\n')]

            score = pearsonr(y, y_hat)[0]
            scores.append(score)

            print(fn, "\t", score)

        print(td, np.mean(scores), "\n")

In [214]:
def get_paranmt_usif():
    """Build a uSIF sentence embedder backed by the pre-trained ParaNMT word vectors.

    Reads the word counts from 'enwiki_vocab_min200.txt' and the embeddings
    from 'vectors/czeng.txt' (both relative to the working directory), in
    that order, and wraps them in a uSIF model with default settings.
    """
    word_probs = word2prob('enwiki_vocab_min200.txt')
    word_vectors = word2vec('vectors/czeng.txt')
    return uSIF(vec=word_vectors, prob=word_probs)

In [215]:
# Build the uSIF model; get_paranmt_usif() reads the word-count and word-vector files from disk.
model = get_paranmt_usif()

In [216]:
# Run the STS evaluation; test_STS prints a Pearson score per input file and the mean per directory.
test_STS(model)

STS.input.MSRpar.txt 	 0.5832344220178628
STS.input.MSRvid.txt 	 0.90059731875375
STS.input.SMTeuroparl.txt 	 0.54678977813508
STS.input.surprise.OnWN.txt 	 0.7424428233207502
STS.input.surprise.SMTnews.txt 	 0.6405004439431515
STS/STS-data/STS2012-gold/ 0.6827129572341188 

STS.input.FNWN.txt 	 0.5362598491434404
STS.input.OnWN.txt 	 0.8751195336553216
STS.input.SMT.txt 	 0.4201164598100904
STS.input.headlines.txt 	 0.8120590733052622
STS/STS-data/STS2013-gold/ 0.6608887289785286 

STS.input.OnWN.txt 	 0.8886050369707091
STS.input.deft-forum.txt 	 0.585211949036511
STS.input.deft-news.txt 	 0.7836139433268715
STS.input.headlines.txt 	 0.7908262011798272
STS.input.images.txt 	 0.8710372652845926
STS.input.tweet-news.txt 	 0.7892451581165936
STS/STS-data/STS2014-gold/ 0.7847565923191842 

STS.input.answers-forums.txt 	 0.7191337358397288
STS.input.answers-students.txt 	 0.7352044560438865
STS.input.belief.txt 	 0.7918937332763348
STS.input.headlines.txt 	 0.8308740059648289
STS.input.im