In [6]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from collections import Counter
import pickle
from datetime import datetime

In [None]:
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn

# Load examples/data.py directly from its file path and bind it as `data`.
# NOTE(review): nothing in the visible code uses `data` afterwards - confirm
# whether this load is still needed.
# NOTE(review): the `imp` module is deprecated in Python 3 and absent from
# recent releases; importlib.util is the documented replacement - confirm the
# target interpreter before relying on this line.
import imp
data = imp.load_source('data', 'examples/data.py')

# ---- Locations ----
# Directory added to the import path so that `senteval` can be imported.
PATH_TO_SENTEVAL = '../'
# Directory holding the NLP datasets.
PATH_TO_DATA = 'data/'

# ---- Pooling modes ----
# Names accepted by get_sentence_embed() for turning word vectors into a
# single sentence vector.
operations = [
    'sum',
    'mean',
    'concatenate',
]

def get_sentence_embed(sentvecs, operation):
    """Pool the word vectors of one sentence into a single vector.

    Args:
        sentvecs: sequence of word vectors; it is reduced along axis 0
            (one entry per word).
        operation: pooling mode, one of
            'sum'         -- element-wise sum over the words,
            'mean'        -- element-wise mean over the words,
            'concatenate' -- the mean followed by the sum, joined with
                             np.hstack.

    Returns:
        The pooled numpy array.

    Raises:
        ValueError: if `operation` is not one of the modes above.  The
            function used to fall through and return None in that case,
            which only failed later, far away from the actual mistake.
    """
    word_axis = 0

    if operation == 'sum':
        return np.sum(sentvecs, axis=word_axis)

    if operation == 'mean':
        return np.mean(sentvecs, axis=word_axis)

    if operation == 'concatenate':
        mean = np.mean(sentvecs, axis=word_axis)
        sum_ = np.sum(sentvecs, axis=word_axis)
        return np.hstack((mean, sum_))

    raise ValueError(
        "unknown operation %r; expected 'sum', 'mean' or 'concatenate'"
        % (operation,)
    )
    
    
# Make SentEval importable: prepend both directories to the module search
# path in one step (the data directory ends up first, the SentEval directory
# second), then import the package.
sys.path[:0] = [PATH_TO_DATA, PATH_TO_SENTEVAL]
import senteval

class dotdict(dict):
    """ dict whose entries can also be read, written and removed as attributes """

    def __getattr__(self, name):
        # Same semantics as dict.get: a missing key yields None, not an error.
        return dict.get(self, name)

    def __setattr__(self, name, value):
        # Attribute assignment stores the value as a regular dict entry.
        dict.__setitem__(self, name, value)

    def __delattr__(self, name):
        # Attribute deletion removes the dict entry (KeyError if absent).
        dict.__delitem__(self, name)
    
def prepare(params, samples):
    """
    SentEval `prepare` hook: load the embedding model and attach it to params.

    The word vectors are read in word2vec format from the file named by the
    module-level PATH_TO_MODEL and stored as `params.model`, where batcher()
    looks them up.  `samples` is accepted for the hook signature but is not
    used.  Anything else your model needs can be added to `params` here.
    """
    word_vectors = KeyedVectors.load_word2vec_format(PATH_TO_MODEL)
    params.model = word_vectors


def batcher(params, batch):
    """
    SentEval `batcher` hook: turn a batch of sentences into sentence vectors.

    Each sentence is a list of tokens.  The vectors of all tokens found in
    `params.model` are pooled with get_sentence_embed() using the mode stored
    in `params.operation`.  A sentence that is empty, or whose tokens are all
    missing from the model, is represented by the vector of the '.' token.

    Args:
        params: parameter object handed over by SentEval; must provide
            `model` (word -> vector lookup supporting `in` and indexing) and
            `operation` (pooling mode for get_sentence_embed).
        batch: list of sentences, each a list of tokens.

    Returns:
        2-D numpy array with one row per sentence.

    Note:
        The '.' fallback is looked up unconditionally, so the model is
        expected to contain a '.' entry.
    """
    # Use the params object passed in instead of the module-level
    # params_senteval: the global only carries `.operation` when the file is
    # run as a script, so relying on it breaks any other caller.
    operation = params.operation
    model = params.model

    embeddings = []
    for sent in batch:
        # One vector per in-vocabulary word: [number of words, embedding dim]
        sentvec = [model[word] for word in sent if word in model]
        if not sentvec:
            # Empty sentence or nothing in vocabulary: fall back to '.'.
            # The vector is stored directly (not wrapped in another list) so
            # the pooled result has the same rank as for regular sentences.
            sentvec = [model['.']]

        # Pool the word vectors into a single sentence representation.
        sentvec = get_sentence_embed(sentvec, operation)
        if np.isnan(sentvec.sum()):
            sentvec = np.nan_to_num(sentvec)

        embeddings.append(sentvec)

    # [batch size, sentence embedding dimensionality]
    return np.vstack(embeddings)


# Default SentEval parameters.
# We use logistic regression (usepytorch: False) with 10-fold cross-validation.
# Extra entries your model needs for initialization (for example the path to
# a dictionary of indices, or hyper-parameters) can be added to this
# dictionary; it is passed to the batcher and the prepare functions.
params_senteval = {
    'task_path': PATH_TO_DATA,
    'usepytorch': False,
    'kfold': 10,
}

# Logger setup: timestamped messages, DEBUG level and above.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s : %(message)s',
)

if __name__ == "__main__":
    # Evaluate every trained skip-gram model with every pooling operation on
    # the SentEval transfer and probing tasks, and pickle each result dict.
    #
    # NOTE: this deliberately stays at module scope instead of a main()
    # function: prepare() reads the global PATH_TO_MODEL and the driver
    # publishes the current pooling mode through the global params_senteval.

    # Only the script entry point needs this, so it is imported locally.
    import itertools

    # made dictionary a dotdict
    params_senteval = dotdict({'task_path': '',
                               'usepytorch': False,
                               'kfold': 10,
                               'operation': None})
    params_senteval.task_path = '/home/ece/Desktop/ulllab/ull3/SentEval/data'

    # Here you define the NLP tasks that your embedding model is going to be
    # evaluated on; in (https://arxiv.org/abs/1802.05883) we use the following.
    # SICKRelatedness (Sick-R) needs torch cuda to work (even when using
    # logistic regression), but STS14 (semantic textual similarity) is a
    # similar type of semantic task.
    # Tasks left out because they need cuda:
    # ["SNLI", "STSBenchmark", "SICKRelatedness", "ImageCaptionRetrieval"]
    transfer_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC',
                      'MRPC', 'SICKEntailment', 'STS12', 'STS13', 'STS14',
                      'STS15', 'STS16']

    probing_tasks = ['BigramShift', 'CoordinationInversion', 'ObjNumber',
                     'OddManOut', 'Tense', 'Length', 'SubjNumber',
                     'TopConstituents', 'Depth', 'WordContent']

    # Hyper-parameter grid of the trained models.
    size = [100, 300]
    window = [2, 5]
    neg_samples = [5, 20]
    iterations = [10, 20]

    # product() walks the grid in the same order as four nested loops
    # (size outermost, iterations innermost).
    for s, w, n, i in itertools.product(size, window, neg_samples, iterations):
        if s == 300 and n == 20:
            # skip because we didn't train them
            continue

        ext = 'en_{}_{}_{}_{}'.format(s, w, n, i)

        # Read by prepare() when SentEval asks it to load the model.
        PATH_TO_MODEL = 'model_' + ext + '.bin'

        for op in operations:
            params_senteval.operation = op

            se = senteval.engine.SE(params_senteval, batcher, prepare)

            # senteval prints the results and returns a dictionary with the scores
            results_tr = se.eval(transfer_tasks)
            print(results_tr)

            filename = 'skipgram_tr_' + ext + '_' + op + '.pickle'
            with open(filename, 'wb') as fh:
                # Was pickle.dump(results, ...): `results` is undefined and
                # raised NameError after the whole evaluation had finished.
                pickle.dump(results_tr, fh)

            results_pr = se.eval(probing_tasks)
            print(results_pr)

            filename = 'skipgram_pr_' + ext + '_' + op + '.pickle'
            with open(filename, 'wb') as fh:
                pickle.dump(results_pr, fh)

2018-05-29 11:53:44,546 : ***** Transfer task : MR *****


2018-05-29 11:53:44,576 : loading projection weights from model_en_100_2_5_10.bin
2018-05-29 11:53:44,577 : {'kw': {}, 'mode': 'rb', 'uri': 'model_en_100_2_5_10.bin'}
2018-05-29 11:53:44,577 : encoding_wrapper: {'encoding': None, 'fileobj': <_io.BufferedReader name='model_en_100_2_5_10.bin'>, 'mode': 'rb', 'errors': 'strict'}
2018-05-29 11:53:50,253 : loaded (77338, 100) matrix from model_en_100_2_5_10.bin
2018-05-29 11:53:50,262 : Generating sentence embeddings
2018-05-29 11:53:51,174 : Generated sentence embeddings
2018-05-29 11:53:51,175 : Training sklearn-LogReg with (inner) 10-fold cross-validation
2018-05-29 11:54:01,414 : Best param found at split 1: l2reg = 0.5                 with score 68.95
2018-05-29 11:54:12,509 : Best param found at split 2: l2reg = 2                 with score 68.71
2018-05-29 11:54:23,787 : Best param found at split 3: l2reg = 2                 with score 68.83
2018-05-29 11:54:35,285 : Best par