In [18]:
from gensim import models
import pickle
import os

In [19]:
# method taken from SentEval Examples
def get_dictionary(sentences, threshold=0):
    """
    Build a frequency-ranked vocabulary from tokenized sentences.

    sentences: iterable of sentences, each an iterable of word tokens
    threshold: when > 0, words occurring fewer than `threshold` times are dropped

    Returns (id2word, word2id): the list of words ordered by decreasing
    count, and the inverse mapping word -> position in that list. The
    special tokens '<s>', '</s>' and '<p>' are always present and are given
    counts above 1e9 so that they sort ahead of counted words.
    """
    counts = {}
    for sentence in sentences:
        for token in sentence:
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1

    # Optional frequency cut-off (iteration order of the survivors is kept).
    if threshold > 0:
        counts = {token: n for token, n in counts.items() if n >= threshold}

    # Special tokens overwrite any count collected for the same string.
    counts.update({'<s>': 1e9 + 4, '</s>': 1e9 + 3, '<p>': 1e9 + 2})

    # Descending by count; the sort is stable, so ties keep insertion order.
    ranked = sorted(counts.items(), key=lambda pair: -pair[1])
    id2word = [token for token, _ in ranked]
    word2id = {token: index for index, token in enumerate(id2word)}

    return id2word, word2id


# Skip gram word2vec
def get_wordvec(model, word2id):
    '''
    Collect the embedding of every vocabulary word that the model knows.

    model: gensim trained skipgram model; only `model.wv` is used, through
        membership tests (`word in model.wv`) and lookups (`model.wv[word]`)
    word2id: word2index; only its keys (the words) are used

    Returns a dict mapping word -> vector for the words of `word2id` that are
    present in `model.wv`. Words missing from the model are skipped.
    '''
    keyed_vectors = model.wv

    # Test membership on the KeyedVectors object itself rather than on its
    # `.vocab` attribute: `.vocab` was removed in gensim 4 and raises
    # AttributeError there, while `in` is supported by both 3.x and 4.x.
    return {word: keyed_vectors[word]
            for word in word2id
            if word in keyed_vectors}

In [28]:
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn
# import SentEval.examples.data as data

# Set PATHs (all of them are relative paths)
# path to the SentEval checkout; it is put on sys.path below so that
# `import senteval` resolves
PATH_TO_SENTEVAL = 'SentEval/'
# path to the NLP datasets; used as SentEval's 'task_path' parameter
PATH_TO_DATA = 'SentEval/data'
# path to the saved gensim SkipGram (Word2Vec) model
PATH_TO_VEC = 'model/200_10_20_skip.bin'
# `models` comes from `from gensim import models` in the first cell of this
# notebook. The loaded model is kept as a module-level global and is read
# by `prepare`.
model = models.Word2Vec.load(PATH_TO_VEC)

# import SentEval (its directory must be on sys.path before the import)
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval


def prepare(params, samples):
    """
    SentEval `prepare` hook for the Skip gram model.

    Builds the vocabulary of `samples` and stores on `params` everything
    that `batcher` needs to embed sentences.

    params: SentEval parameter object; the attributes `word2id`, `word_vec`
        and `wvec_dim` are set on it
    samples: iterable of tokenized sentences (each an iterable of words)

    Returns None; the results are attached to `params`.
    """
    _, params.word2id = get_dictionary(samples)
    # keep only the vectors of words that occur in this task's samples
    params.word_vec = get_wordvec(model, params.word2id)
    # Dimensionality of the Skip gram embeddings. Read it from the loaded
    # model instead of hard-coding 200, so that the zero-vector fallback in
    # `batcher` always has the same size as the real word vectors even when
    # PATH_TO_VEC points to a model trained with another vector size.
    params.wvec_dim = model.wv.vector_size
    return

def batcher(params, batch):
    """
    SentEval `batcher` hook for the Skip gram model.

    Each sentence is represented by the average of the embeddings of its
    words that are found in `params.word_vec`.

    params: object exposing `word_vec` (word -> vector mapping) and
        `wvec_dim` (embedding dimensionality)
    batch: list of sentences, each a list of words

    Returns an array of shape [batch size, embedding dimensionality].
    """
    sentence_vectors = []

    for sent in batch:
        # an empty sentence is replaced by a single empty-string token
        tokens = sent if sent != [] else ['']

        # embeddings of the known words: [number of words, embedding dimensionality]
        known = [params.word_vec[token] for token in tokens
                 if token in params.word_vec]

        # no known word at all: fall back to one all-zero vector
        if len(known) == 0:
            known = [np.zeros(params.wvec_dim)]

        # average over the words: [embedding dimensionality]
        sentence_vectors.append(np.mean(known, 0))

    # [batch size, embedding dimensionality]
    return np.vstack(sentence_vectors)


# Set params for SentEval
# we use logistic regression (usepytorch: False) and kfold 10
# In this dictionary you can add extra information that your model needs for initialization,
# for example the path to a dictionary of indices, or hyperparameters.
# This dictionary is passed to the batcher and the prepare functions.
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 10}
# this is the config for the NN classifier, but we are going to use scikit-learn logistic regression with 10 kfold
# (usepytorch = False), so it is left commented out
#params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
#                                 'tenacity': 3, 'epoch_size': 2}

# Set up logger (DEBUG level: every log message is emitted)
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    
    # here you define the NLP tasks that your embedding model is going to be evaluated on
    # in (https://arxiv.org/abs/1802.05883) we use the following :
    # SICKRelatedness (Sick-R) needs torch cuda to work (even when using logistic regression),
    # but STS14 (semantic textual similarity) is a similar type of semantic task
    transfer_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC',
                      'MRPC', 'SICKEntailment', 'STS14']
    # senteval prints the results and returns a dictionary with the scores
    results = se.eval(transfer_tasks)
    print(results)
    # persist the scores as a pickle in the "model" directory, under the
    # same base name as the evaluated model file
    with open(os.path.join("model","200_10_20_skip.pkl"),'wb') as f:
        pickle.dump(results, f)

2018-05-31 13:44:32,383 : loading Word2Vec object from model/200_10_20_skip.bin
2018-05-31 13:44:32,384 : {'kw': {}, 'mode': 'rb', 'uri': 'model/200_10_20_skip.bin'}
2018-05-31 13:44:32,385 : encoding_wrapper: {'mode': 'rb', 'errors': 'strict', 'fileobj': <_io.BufferedReader name='model/200_10_20_skip.bin'>, 'encoding': None}
2018-05-31 13:44:32,839 : loading wv recursively from model/200_10_20_skip.bin.wv.* with mmap=None
2018-05-31 13:44:32,840 : setting ignored attribute vectors_norm to None
2018-05-31 13:44:32,841 : loading vocabulary recursively from model/200_10_20_skip.bin.vocabulary.* with mmap=None
2018-05-31 13:44:32,842 : loading trainables recursively from model/200_10_20_skip.bin.trainables.* with mmap=None
2018-05-31 13:44:32,843 : setting ignored attribute cum_table to None
2018-05-31 13:44:32,843 : loaded model/200_10_20_skip.bin
2018-05-31 13:44:32,960 : ***** Transfer task : MR *****


2018-05-31 13:44:33,160 : Generating sentence embeddings
2018-05-31 13:44:33,637 : 

2018-05-31 13:52:31,051 : Evaluating...
2018-05-31 13:52:31,193 : Dev acc : 70.27 Test acc 71.01; Test F1 80.78 for MRPC.

2018-05-31 13:52:31,194 : ***** Transfer task : SICK-Entailment*****


2018-05-31 13:52:31,288 : Computing embedding for dev
2018-05-31 13:52:31,324 : Computed dev embeddings
2018-05-31 13:52:31,324 : Computing embedding for train
2018-05-31 13:52:31,524 : Computed train embeddings
2018-05-31 13:52:31,525 : Computing embedding for test
2018-05-31 13:52:31,733 : Computed test embeddings
2018-05-31 13:52:31,746 : Training sklearn-LogReg with standard validation..
2018-05-31 13:52:35,545 : [('reg:0.25', 69.0), ('reg:0.5', 68.4), ('reg:1', 68.6), ('reg:2', 67.0), ('reg:4', 65.4), ('reg:8', 65.2)]
2018-05-31 13:52:35,545 : Validation : best param found is reg = 0.25 with score             69.0
2018-05-31 13:52:35,546 : Evaluating...
2018-05-31 13:52:35,970 : 
Dev acc : 69.0 Test acc : 70.06 for                        SICK entailment

2018-05-31 13:52:35,971 : ***** Tran

{'MRPC': {'f1': 80.78, 'ndev': 4076, 'devacc': 70.27, 'ntest': 1725, 'acc': 71.01}, 'SUBJ': {'ndev': 10000, 'devacc': 83.9, 'ntest': 10000, 'acc': 84.13}, 'SST2': {'ndev': 872, 'devacc': 71.22, 'ntest': 1821, 'acc': 72.71}, 'MPQA': {'ndev': 10606, 'devacc': 85.05, 'ntest': 10606, 'acc': 84.99}, 'TREC': {'ndev': 5452, 'devacc': 62.18, 'ntest': 500, 'acc': 62.8}, 'SICKEntailment': {'ndev': 500, 'devacc': 69.0, 'ntest': 4927, 'acc': 70.06}, 'STS14': {'all': {'spearman': {'mean': 0.6264608768182562, 'wmean': 0.6400951189476622}, 'pearson': {'mean': 0.6529429836676489, 'wmean': 0.6661760356081586}}, 'deft-news': {'nsamples': 300, 'spearman': SpearmanrResult(correlation=0.666322124925226, pvalue=7.192141899761077e-40), 'pearson': (0.7061927534573326, 1.3415285600255232e-46)}, 'OnWN': {'nsamples': 750, 'spearman': SpearmanrResult(correlation=0.7867638353783685, pvalue=6.850496592676228e-159), 'pearson': (0.7705091308881843, 1.961780466557028e-148)}, 'headlines': {'nsamples': 750, 'spearman': 