# Evaluation Methods

In [1]:
from fieldembed.keyedvectors import KeyedVectors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

from nlptext.sentence import Sentence
from nlptext.base import BasicObject
import numpy as np


# Directory handed to BasicObject.INIT_FROM_PICKLE below.
Data_Dir = 'data/fudan/word/'
# Initialise the nlptext corpus from pickle files under Data_Dir (the cell output
# lists the CORPUS / GROUP / TEXT / SENT / TOKEN pickles that are read).
BasicObject.INIT_FROM_PICKLE(Data_Dir)
# Alias passed to convert_document_to_X_and_Y, which reads nlptext.SENT['length'].
nlptext = BasicObject


def getsent2matrix(sent, wv):
    """Return the embedding rows for the in-vocabulary tokens of one sentence.

    Parameters
    ----------
    sent : object exposing ``get_grain_str('token')``; the first element of
        each returned item is taken as the token string.
    wv : object exposing ``derivative_wv``; that derived object provides
        ``set_GU_and_TU()``, ``TU``, ``GU`` and ``vectors``.

    Returns
    -------
    ``derived.vectors`` indexed by the list of row indices of the tokens that
    are present in the vocabulary lookup, in sentence order. Tokens missing
    from the lookup are skipped (there is no dedicated unknown-token row yet).
    """
    tokens = [item[0] for item in sent.get_grain_str('token')]

    derived = wv.derivative_wv
    derived.set_GU_and_TU()

    # Prefer TU; fall back to GU when the derived vectors carry no TU.
    vocab = derived.TU
    if vocab is None:
        vocab = derived.GU

    # Element 1 of the vocabulary pair is used as the token-string -> row-index mapping.
    str2idx = vocab[1]

    # TODO: decide how unknown tokens should be represented; for now they are dropped.
    row_ids = []
    for token in tokens:
        if token in str2idx:
            row_ids.append(str2idx.get(token))

    return derived.vectors[row_ids]

def convert_document_to_X_and_Y(nlptext, wv):
    """Build a mean-embedding feature matrix and label vector for all sentences.

    Parameters
    ----------
    nlptext : object whose ``SENT['length']`` gives the number of sentences.
    wv : word-vector object exposing ``vector_size`` and accepted by
        ``getsent2matrix``.

    Returns
    -------
    (docmatrix, labels)
        ``docmatrix`` is a float array of shape ``(doc_num, wv.vector_size)``
        holding, per sentence, the mean of its token vectors. ``labels`` is an
        int array of length ``doc_num`` filled from ``sent.IdxGroup``
        (presumably the index of the group/class the sentence belongs to --
        confirm against nlptext).

    Notes
    -----
    A sentence with no in-vocabulary token yields an empty matrix from
    ``getsent2matrix``; averaging it would produce NaN (with a RuntimeWarning)
    and break downstream classifiers. Such rows keep their initial zero vector.
    """
    doc_num = nlptext.SENT['length']

    docmatrix = np.zeros((doc_num, wv.vector_size))
    labels = np.zeros(doc_num, dtype = int)

    for sentidx in range(doc_num):
        sent = Sentence(sentidx)
        matrix = getsent2matrix(sent, wv)
        # Only average when at least one token vector was found; otherwise
        # leave the pre-initialised zero row instead of writing NaN.
        if len(matrix) > 0:
            docmatrix[sentidx] = np.mean(matrix, axis = 0)
        labels[sentidx] = sent.IdxGroup
    return docmatrix, labels

CORPUS	read from pickle file : data/fudan/word/Pyramid/CORPUS.p
CORPUS	the length of it is   : 1
GROUP	read from pickle file : data/fudan/word/Pyramid/GROUP.p
GROUP	the length of it is   : 5
TEXT	read from pickle file : data/fudan/word/Pyramid/TEXT.p
TEXT	the length of it is   : 5885
SENT	read from pickle file : data/fudan/word/Pyramid/SENT.p
SENT	the length of it is   : 5885
TOKEN	read from pickle file : data/fudan/word/Pyramid/TOKEN.p
TOKEN	the length of it is   : 17252831
**************************************** 



# Baseline Models

In [2]:
import os
# Root directory that EmbeddingModelsReader walks to find baseline embedding files.
base_dir = 'embeddings/baseline/'

def EmbeddingModelsReader(base_dir):
    """Collect the paths of embedding files found under ``base_dir``.

    Walks ``base_dir`` recursively and keeps every file whose *file name*
    (not its directory path) contains the substring ``'word'``.

    Parameters
    ----------
    base_dir : root directory to search.

    Returns
    -------
    list of str
        ``os.path.join(directory, filename)`` for each matching file, in the
        order ``os.walk`` yields them.
    """
    model_paths = []
    for directory, _subdirs, filenames in os.walk(base_dir):
        for filename in filenames:
            if 'word' in filename:
                model_paths.append(os.path.join(directory, filename))
    return model_paths


    
    
# Paths of all baseline embedding files under base_dir; displayed as the cell output.
modelnames = EmbeddingModelsReader(base_dir)
modelnames

['embeddings/baseline/WikiChinese/word/word2vec/sg-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200',
 'embeddings/baseline/WikiChinese/word/word2vec/cb-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200',
 'embeddings/baseline/WikiChinese/word/cwe/cb-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200',
 'embeddings/baseline/WikiChinese/word/jwe/cb-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200']

In [4]:
# Evaluation results keyed by embedding file path. Kept in its own cell so the
# evaluation loop can be re-run and skip files that are already present in D.
D = {}

In [6]:

# Evaluate every embedding file: lexical benchmarks plus document
# classification. Results are stored in D keyed by the file path.
for wv_file in modelnames:
    # Skip files that were already evaluated in an earlier run of this cell.
    if wv_file in D:
        continue
    print(wv_file)

    # 1. load data
    # Paths containing 'cwe' are read with a tab separator, all others with a space.
    sep = ' ' if 'cwe' not in wv_file else '\t'
    word_vec = KeyedVectors.load_word2vec_format(wv_file, sep = sep)

    # 2. lexical evals
    evals_result = word_vec.lexical_evals()

    # 3. sent clf
    docmatrix, labels = convert_document_to_X_and_Y(nlptext, word_vec)
    X = docmatrix
    Y = labels

    # 70% training and 30% testing
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 109)

    # clf = svm.SVC(kernel = 'linear')
    # Only the settings that matter are spelled out. In particular
    # decision_function_shape=None is no longer passed: current scikit-learn
    # accepts only 'ovo' or 'ovr' there, and the setting does not influence
    # predict(). gamma='auto' is kept explicit because the library default
    # differs between scikit-learn versions.
    clf = svm.SVC(C=10.0, gamma='auto', kernel='rbf')

    clf.fit(X_train, y_train)

    print(clf)
    y_pred = clf.predict(X_test)

    # Hold-out accuracy and confusion matrix on the 30% test split.
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

    # 5-fold cross-validation over the full data set; report mean and 2*std.
    scores = cross_val_score(clf, X, Y, cv = 5)
    cls_score_mean = scores.mean()
    cls_score_2std = scores.std() * 2
    print(cls_score_mean)
    print(cls_score_2std)

    evals_result['cls_score_mean'] = cls_score_mean
    evals_result['cls_score_2std'] = cls_score_2std
    D[wv_file] = evals_result
    print(evals_result)
    


embeddings/baseline/WikiChinese/word/word2vec/sg-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200
390106 200

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.8465458663646659
[[337  13  10  16   1]
 [ 12 281   2  25   7]
 [  6   0 288  19   0]
 [  4   3  28 401  12]
 [  2  25   2  84 188]]
0.8637169386015764
0.011571134764834078
{'sim240_spearman': 0.5101948970523248, 'sim297_spearman': 0.5929414085454954, 'ana_capital-common-countries': 0.6936758893280632, 'ana_city-in-state': 0.7314285714285714, 'ana_family': 0.48161764705882354, 'ana_Total accuracy': 0.640083945435467, 'cls_score_mean': 0.8637169386015764, 'cls_score_2std': 0.011571134764834078}
embeddings/baseline/WikiChinese/word/word2vec/cb-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200
390106 200

SVC(C=10.0, cache_size=200, class_weight=Non

In [12]:
import pandas as pd
# Summary table: D is transposed so each embedding file is one row, then
# restricted to the two similarity scores, the overall analogy accuracy and the
# mean cross-validated classification score.
pd.DataFrame(D).T[['sim240_spearman', 'sim297_spearman', 'ana_Total accuracy',  'cls_score_mean' ]]

Unnamed: 0,sim240_spearman,sim297_spearman,ana_Total accuracy,cls_score_mean
embeddings/baseline/WikiChinese/word/word2vec/sg-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200,0.510195,0.592941,0.640084,0.863717
embeddings/baseline/WikiChinese/word/word2vec/cb-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200,0.459014,0.572294,0.534103,0.927782
embeddings/baseline/WikiChinese/word/cwe/cb-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200,0.504903,0.537895,0.101784,0.945622
embeddings/baseline/WikiChinese/word/jwe/cb-it1-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200,0.477193,0.583152,0.405037,0.897871
