# Evaluating Word Embeddings for Chinese Wiki

## Lexical Evals

In [2]:
from fieldembed.keyedvectors import KeyedVectors
# Load the vectors stored in word2vec format at this path (the same file is evaluated
# again under the name 'base_cb' in the CBOW section below).
wv_neg = KeyedVectors.load_word2vec_format('embeddings/word2vec/word_vec_cb')



390106 200



In [3]:
# Run the lexical benchmarks; the recorded output holds two Spearman scores
# (sim240 / sim297) and several analogy accuracies (ana_*).
wv_neg.lexical_evals()



{'sim240_spearman': 0.5450100615852266,
 'sim297_spearman': 0.5854600845729826,
 'ana_capital-common-countries': 0.8656126482213439,
 'ana_city-in-state': 0.88,
 'ana_family': 0.6397058823529411,
 'ana_Total accuracy': 0.8037775445960126}

## Text Classification on the Fudan Corpus

In [82]:
from nlptext.sentence import Sentence
from nlptext.base import BasicObject
import numpy as np


# Directory holding the pickled Fudan corpus (the log below reads from its Pyramid/ subfolder).
Data_Dir = 'data/fudan/word/'
# Initialise the corpus state from the pickles under Data_Dir; the log printed below
# shows CORPUS, GROUP, TEXT, SENT and TOKEN being read.
BasicObject.INIT_FROM_PICKLE(Data_Dir)
# Alias passed to convert_document_to_X_and_Y, which reads nlptext.SENT['length'].
nlptext = BasicObject


def getsent2matrix(sent, wv):
    """Return the embedding rows for the in-vocabulary tokens of one sentence.

    Parameters
    ----------
    sent : object exposing ``get_grain_str('token')``; the first element of
        each returned item is used as the token string.
    wv : object exposing ``derivative_wv``; the derived vectors provide
        ``set_GU_and_TU()``, ``TU``, ``GU`` and ``vectors``.

    Returns
    -------
    The result of indexing ``vectors`` with the row indices of the known
    tokens, in sentence order. Tokens missing from the vocabulary are skipped
    (there is no dedicated unknown-token row yet), so the result may have zero
    rows.
    """
    tokens = [grain[0] for grain in sent.get_grain_str('token')]

    derived = wv.derivative_wv
    derived.set_GU_and_TU()

    # Prefer TU and fall back to GU when TU is not set. Element 1 of either
    # one is used as the token-string -> row-index mapping.
    lookup = derived.TU
    if lookup is None:
        lookup = derived.GU
    str2idx = lookup[1]

    # TODO: decide how to represent unknown tokens; index 0 is a real token, not <unk>.
    rows = []
    for token in tokens:
        if token in str2idx:
            rows.append(str2idx[token])

    return derived.vectors[rows]

def convert_document_to_X_and_Y(nlptext, wv):
    """Build averaged-embedding features and group labels for every document.

    Parameters
    ----------
    nlptext : object whose ``SENT['length']`` gives the number of documents.
    wv : word vectors exposing ``vector_size``; passed on to ``getsent2matrix``.

    Returns
    -------
    (docmatrix, labels)
        ``docmatrix`` has shape ``(doc_num, wv.vector_size)``; row ``i`` is the
        mean of the embedding rows returned for document ``i``. A document
        with no in-vocabulary token keeps an all-zero row. ``labels`` is an
        int array holding ``Sentence(i).IdxGroup`` for each document.
    """
    doc_num = nlptext.SENT['length']

    docmatrix = np.zeros((doc_num, wv.vector_size))
    labels = np.zeros(doc_num, dtype=int)

    for sentidx in range(doc_num):
        sent = Sentence(sentidx)
        matrix = getsent2matrix(sent, wv)
        # Averaging zero rows would give NaN (with a RuntimeWarning) and make
        # the feature matrix unusable for the classifier, so such documents
        # keep the zero vector they were initialised with.
        if len(matrix) > 0:
            docmatrix[sentidx] = np.mean(matrix, axis=0)
        labels[sentidx] = sent.IdxGroup
    return docmatrix, labels

CORPUS	read from pickle file : data/fudan/word/Pyramid/CORPUS.p
CORPUS	the length of it is   : 1
GROUP	read from pickle file : data/fudan/word/Pyramid/GROUP.p
GROUP	the length of it is   : 5
TEXT	read from pickle file : data/fudan/word/Pyramid/TEXT.p
TEXT	the length of it is   : 5885
SENT	read from pickle file : data/fudan/word/Pyramid/SENT.p
SENT	the length of it is   : 5885
TOKEN	read from pickle file : data/fudan/word/Pyramid/TOKEN.p
TOKEN	the length of it is   : 17252831
**************************************** 



In [28]:
# Averaged-embedding document features and group labels for the whole corpus.
docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv_neg)
X = docmatrix
Y = labels

# Sanity check: one feature row and one label per document.
print(X.shape)
print(Y.shape)

(5885, 200)
(5885,)


In [54]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

# Hold out 30% of the documents for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=109)

# RBF-kernel SVM. Only the settings that matter are spelled out: the previous call also
# passed decision_function_shape=None, which current scikit-learn releases reject, plus
# a list of keyword arguments that merely restated the library defaults.
# gamma stays explicit because the library default changed from 'auto' to 'scale'.
clf = svm.SVC(C=10.0, kernel='rbf', gamma='auto')

clf.fit(X_train, y_train)

print(clf)
y_pred = clf.predict(X_test)

# Hold-out accuracy and confusion matrix.
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

# 5-fold cross-validated score on the full data set.
scores = cross_val_score(clf, X, Y, cv=5)
print(scores.mean())

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.9309173272933182
[[362   4   9   2   0]
 [  5 312   2   2   6]
 [  6   0 291  16   0]
 [  2   2  15 415  14]
 [  0  14   1  22 264]]
0.9361096658614663


## CBOW

In [64]:
# Accumulator for the final comparison table: embedding name -> dict of evaluation scores.
# Re-running this cell discards everything collected so far.
D = {}
D

{}

In [69]:
path = 'embeddings/word2vec/word_vec_cb'
name = 'base_cb'

from fieldembed.keyedvectors import KeyedVectors
wv_neg = KeyedVectors.load_word2vec_format(path)
# Lexical benchmarks first, then document features for the classification task.
evals_result = wv_neg.lexical_evals()
docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv_neg)
X = docmatrix
Y = labels

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

# Hold out 30% of the documents for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=109)

# RBF-kernel SVM. The previous call also passed decision_function_shape=None, which
# current scikit-learn releases reject, plus keyword arguments that only restated the
# library defaults. gamma stays explicit because its default changed to 'scale'.
clf = svm.SVC(C=10.0, kernel='rbf', gamma='auto')

clf.fit(X_train, y_train)

print(clf)
y_pred = clf.predict(X_test)

# Hold-out accuracy and confusion matrix.
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

# 5-fold cross-validated score on the full data set, reported as mean and 2 * std.
scores = cross_val_score(clf, X, Y, cv=5)
print(scores.mean())
print(scores.std() * 2)

# Store the classification scores next to the lexical ones and register this embedding.
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
D[name] = evals_result
evals_result



390106 200





SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.9388448471121178
[[365   2   9   1   0]
 [  3 314   2   3   5]
 [  6   0 292  15   0]
 [  3   1  12 420  12]
 [  0  12   1  21 267]]
0.9369565433138604
0.005501751202785384


{'sim240_spearman': 0.5025686135046619,
 'sim297_spearman': 0.5780691034099906,
 'ana_capital-common-countries': 0.8379446640316206,
 'ana_city-in-state': 0.8285714285714286,
 'ana_family': 0.6139705882352942,
 'ana_Total accuracy': 0.7722980062959076,
 'cls_score_mean': 0.9369565433138604,
 'cls_score_2std': 0.005501751202785384}

## Skip-Gram

In [71]:
path = 'embeddings/word2vec/word_vec_sg'
name = 'base_sg'

from fieldembed.keyedvectors import KeyedVectors
wv_neg = KeyedVectors.load_word2vec_format(path)
# Lexical benchmarks first, then document features for the classification task.
evals_result = wv_neg.lexical_evals()
docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv_neg)
X = docmatrix
Y = labels

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

# Hold out 30% of the documents for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=109)

# RBF-kernel SVM. The previous call also passed decision_function_shape=None, which
# current scikit-learn releases reject, plus keyword arguments that only restated the
# library defaults. gamma stays explicit because its default changed to 'scale'.
clf = svm.SVC(C=10.0, kernel='rbf', gamma='auto')

clf.fit(X_train, y_train)

print(clf)
y_pred = clf.predict(X_test)

# Hold-out accuracy and confusion matrix.
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

# 5-fold cross-validated score on the full data set, reported as mean and 2 * std.
scores = cross_val_score(clf, X, Y, cv=5)
print(scores.mean())
print(scores.std() * 2)

# Store the classification scores next to the lexical ones and register this embedding.
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
D[name] = evals_result
evals_result



390106 200





SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.8725934314835787
[[344   9  11  13   0]
 [  9 291   2  20   5]
 [  4   0 292  17   0]
 [  3   2  28 403  12]
 [  1  24   2  63 211]]
0.8835977350327704
0.009534433012110214


{'sim240_spearman': 0.542354421841409,
 'sim297_spearman': 0.6049056145586297,
 'ana_capital-common-countries': 0.8853754940711462,
 'ana_city-in-state': 0.8971428571428571,
 'ana_family': 0.5735294117647058,
 'ana_Total accuracy': 0.7985309548793285,
 'cls_score_mean': 0.8835977350327704,
 'cls_score_2std': 0.009534433012110214}

## JWE

In [89]:
path = 'embeddings/jwe/word_vec'
name = 'base_jwe'

from fieldembed.keyedvectors import KeyedVectors
wv_neg = KeyedVectors.load_word2vec_format(path)
# Lexical benchmarks first, then document features for the classification task.
evals_result = wv_neg.lexical_evals()
docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv_neg)
X = docmatrix
Y = labels

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

# Hold out 30% of the documents for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=109)

# RBF-kernel SVM. The previous call also passed decision_function_shape=None, which
# current scikit-learn releases reject, plus keyword arguments that only restated the
# library defaults. gamma stays explicit because its default changed to 'scale'.
clf = svm.SVC(C=10.0, kernel='rbf', gamma='auto')

clf.fit(X_train, y_train)

print(clf)
y_pred = clf.predict(X_test)

# Hold-out accuracy and confusion matrix.
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

# 5-fold cross-validated score on the full data set, reported as mean and 2 * std.
scores = cross_val_score(clf, X, Y, cv=5)
print(scores.mean())
print(scores.std() * 2)

# Store the classification scores next to the lexical ones and register this embedding.
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
D[name] = evals_result
evals_result



390106 200





SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.912797281993205
[[353  11   8   5   0]
 [  6 308   2   6   5]
 [  7   0 289  17   0]
 [  4   2  17 409  16]
 [  0  19   1  28 253]]
0.9155475917526074
0.006865086097689862


{'sim240_spearman': 0.5372673026102034,
 'sim297_spearman': 0.6510196788483026,
 'ana_capital-common-countries': 0.7529644268774703,
 'ana_city-in-state': 0.7942857142857143,
 'ana_family': 0.5992647058823529,
 'ana_Total accuracy': 0.7166841552990556,
 'cls_score_mean': 0.9155475917526074,
 'cls_score_2std': 0.006865086097689862}

In [90]:
# Pasted result dictionary: a bare expression that is displayed but not assigned to any name.
# NOTE(review): these numbers differ from the base_jwe output recorded directly above
# (e.g. sim240_spearman 0.5601 vs 0.5373); presumably they come from an earlier run - confirm.
{'sim240_spearman': 0.5601079141506247,
 'sim297_spearman': 0.6526441917534458,
 'ana_capital-common-countries': 0.758893280632411,
 'ana_city-in-state': 0.8971428571428571,
 'ana_family': 0.5735294117647058,
 'ana_Total accuracy': 0.7313746065057712,
 'cls_score_mean': 0.9155470132926368,
 'cls_score_2std': 0.007761698426804199}

{'sim240_spearman': 0.5601079141506247,
 'sim297_spearman': 0.6526441917534458,
 'ana_capital-common-countries': 0.758893280632411,
 'ana_city-in-state': 0.8971428571428571,
 'ana_family': 0.5735294117647058,
 'ana_Total accuracy': 0.7313746065057712,
 'cls_score_mean': 0.9155470132926368,
 'cls_score_2std': 0.007761698426804199}

## Token Subcomp Pinyin

In [77]:
path = 'embeddings/fieldembed/WikiChinese/word/token_subcomp_pinyin/200'
# Key under which this embedding is stored in D by the evaluation cell below.
name = 'WScP'

from fieldembed import FieldEmbedding

# Load the saved FieldEmbedding model and take its wv_neg vectors for evaluation.
fldembed = FieldEmbedding.load(path)
wv_neg = fldembed.wv_neg




In [83]:
# Inspection step: call set_GU_and_TU() directly so the TU attribute can be looked at in
# the next cell (getsent2matrix makes the same call on wv.derivative_wv).
wv_neg.set_GU_and_TU()

In [84]:
# Inspect TU. No output is recorded for this cell, which is what a None value would
# produce; getsent2matrix falls back to GU in that case.
wv_neg.TU

In [85]:
# Uses wv_neg and name ('WScP') set by the loading cell above.
# Lexical benchmarks first, then document features for the classification task.
evals_result = wv_neg.lexical_evals()
docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv_neg)
X = docmatrix
Y = labels

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

# Hold out 30% of the documents for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=109)

# RBF-kernel SVM. The previous call also passed decision_function_shape=None, which
# current scikit-learn releases reject, plus keyword arguments that only restated the
# library defaults. gamma stays explicit because its default changed to 'scale'.
clf = svm.SVC(C=10.0, kernel='rbf', gamma='auto')

clf.fit(X_train, y_train)

print(clf)
y_pred = clf.predict(X_test)

# Hold-out accuracy and confusion matrix.
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

# 5-fold cross-validated score on the full data set, reported as mean and 2 * std.
scores = cross_val_score(clf, X, Y, cv=5)
print(scores.mean())
print(scores.std() * 2)

# Store the classification scores next to the lexical ones and register this embedding.
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
D[name] = evals_result
evals_result



SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.920158550396376
[[361   6   9   1   0]
 [  5 310   1   5   6]
 [  5   0 287  21   0]
 [  2   2  13 416  15]
 [  0  21   0  29 251]]
0.9216661480152399
0.005572252791890109


{'sim240_spearman': 0.49489222354673823,
 'sim297_spearman': 0.5897586893843983,
 'ana_capital-common-countries': 0.7924901185770751,
 'ana_city-in-state': 0.8,
 'ana_family': 0.5698529411764706,
 'ana_Total accuracy': 0.7303252885624344,
 'cls_score_mean': 0.9216661480152399,
 'cls_score_2std': 0.005572252791890109}

In [87]:
path = 'embeddings/fieldembed/WikiChinese/word/token/200'
name = 'W'

from fieldembed import FieldEmbedding

# Load the saved FieldEmbedding model and take its wv_neg vectors for evaluation.
fldembed = FieldEmbedding.load(path)
wv_neg = fldembed.wv_neg

# Lexical benchmarks first, then document features for the classification task.
evals_result = wv_neg.lexical_evals()
docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv_neg)
X = docmatrix
Y = labels

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

# Hold out 30% of the documents for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=109)

# RBF-kernel SVM. The previous call also passed decision_function_shape=None, which
# current scikit-learn releases reject, plus keyword arguments that only restated the
# library defaults. gamma stays explicit because its default changed to 'scale'.
clf = svm.SVC(C=10.0, kernel='rbf', gamma='auto')

clf.fit(X_train, y_train)

print(clf)
y_pred = clf.predict(X_test)

# Hold-out accuracy and confusion matrix.
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

# 5-fold cross-validated score on the full data set, reported as mean and 2 * std.
scores = cross_val_score(clf, X, Y, cv=5)
print(scores.mean())
print(scores.std() * 2)

# Store the classification scores next to the lexical ones and register this embedding.
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
D[name] = evals_result
evals_result



SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.9020385050962627
[[350   7   9  10   1]
 [  7 308   2   5   5]
 [  8   0 287  18   0]
 [  2   1  17 410  18]
 [  0  26   0  37 238]]
0.910107720140554
0.006274966193246377


{'sim240_spearman': 0.5105737718386821,
 'sim297_spearman': 0.5998599825194435,
 'ana_capital-common-countries': 0.8122529644268774,
 'ana_city-in-state': 0.7885714285714286,
 'ana_family': 0.5441176470588235,
 'ana_Total accuracy': 0.7313746065057712,
 'cls_score_mean': 0.910107720140554,
 'cls_score_2std': 0.006274966193246377}

In [94]:
path = 'embeddings/fieldembed/WikiChinese/word/token_pinyin/200'
name = 'WP'

from fieldembed import FieldEmbedding

# Load the saved FieldEmbedding model and take its wv_neg vectors for evaluation.
fldembed = FieldEmbedding.load(path)
wv_neg = fldembed.wv_neg

# Lexical benchmarks first, then document features for the classification task.
evals_result = wv_neg.lexical_evals()
docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv_neg)
X = docmatrix
Y = labels

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

# Hold out 30% of the documents for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=109)

# RBF-kernel SVM. The previous call also passed decision_function_shape=None, which
# current scikit-learn releases reject, plus keyword arguments that only restated the
# library defaults. gamma stays explicit because its default changed to 'scale'.
clf = svm.SVC(C=10.0, kernel='rbf', gamma='auto')

clf.fit(X_train, y_train)

print(clf)
y_pred = clf.predict(X_test)

# Hold-out accuracy and confusion matrix.
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

# 5-fold cross-validated score on the full data set, reported as mean and 2 * std.
scores = cross_val_score(clf, X, Y, cv=5)
print(scores.mean())
print(scores.std() * 2)

# Store the classification scores next to the lexical ones and register this embedding.
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
D[name] = evals_result
evals_result



SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.9116647791619479
[[358   7   9   3   0]
 [  5 310   2   3   7]
 [  9   0 284  20   0]
 [  4   1  17 409  17]
 [  0  22   0  30 249]]
0.9186040558276707
0.006157353876534347


{'sim240_spearman': 0.4869464017511846,
 'sim297_spearman': 0.591229623148572,
 'ana_capital-common-countries': 0.7944664031620553,
 'ana_city-in-state': 0.8171428571428572,
 'ana_family': 0.5845588235294118,
 'ana_Total accuracy': 0.7387198321091291,
 'cls_score_mean': 0.9186040558276707,
 'cls_score_2std': 0.006157353876534347}

In [None]:
# NOTE(review): path and name are identical to the preceding token_pinyin cell and this
# cell has no execution count, so running it as-is only recomputes and overwrites
# D['WP']. Presumably it was meant to point at a different embedding - confirm or delete.
path = 'embeddings/fieldembed/WikiChinese/word/token_pinyin/200'
name = 'WP'

from fieldembed import FieldEmbedding

# Load the saved FieldEmbedding model and take its wv_neg vectors for evaluation.
fldembed = FieldEmbedding.load(path)
wv_neg = fldembed.wv_neg

# Lexical benchmarks first, then document features for the classification task.
evals_result = wv_neg.lexical_evals()
docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv_neg)
X = docmatrix
Y = labels

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

# Hold out 30% of the documents for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=109)

# RBF-kernel SVM. The previous call also passed decision_function_shape=None, which
# current scikit-learn releases reject, plus keyword arguments that only restated the
# library defaults. gamma stays explicit because its default changed to 'scale'.
clf = svm.SVC(C=10.0, kernel='rbf', gamma='auto')

clf.fit(X_train, y_train)

print(clf)
y_pred = clf.predict(X_test)

# Hold-out accuracy and confusion matrix.
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

# 5-fold cross-validated score on the full data set, reported as mean and 2 * std.
scores = cross_val_score(clf, X, Y, cv=5)
print(scores.mean())
print(scores.std() * 2)

# Store the classification scores next to the lexical ones and register this embedding.
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
D[name] = evals_result
evals_result

## All

In [None]:
# Pasted result dictionaries: bare expressions, not assigned to any name. The values
# match the outputs recorded above for 'WScP' (first dict) and 'W' (second dict).
{'sim240_spearman': 0.49489222354673823,
 'sim297_spearman': 0.5897586893843983,
 'ana_capital-common-countries': 0.7924901185770751,
 'ana_city-in-state': 0.8,
 'ana_family': 0.5698529411764706,
 'ana_Total accuracy': 0.7303252885624344,
 'cls_score_mean': 0.9216661480152399,
 'cls_score_2std': 0.005572252791890109}

{'sim240_spearman': 0.5105737718386821,
 'sim297_spearman': 0.5998599825194435,
 'ana_capital-common-countries': 0.8122529644268774,
 'ana_city-in-state': 0.7885714285714286,
 'ana_family': 0.5441176470588235,
 'ana_Total accuracy': 0.7313746065057712,
 'cls_score_mean': 0.910107720140554,
 'cls_score_2std': 0.006274966193246377}

In [93]:
import pandas as pd

# One row per embedding registered in D, restricted to the four headline metrics.
# NOTE(review): the saved output below lists a 'WC' row, but no cell visible here assigns
# that name (the token_pinyin cell uses 'WP'); presumably the output predates a rename -
# re-run after a fresh kernel restart to refresh it.
pd.DataFrame(D).T[['sim240_spearman', 'sim297_spearman', 'ana_Total accuracy', 'cls_score_mean']]

Unnamed: 0,sim240_spearman,sim297_spearman,ana_Total accuracy,cls_score_mean
base_cb,0.502569,0.578069,0.772298,0.936957
base_sg,0.542354,0.604906,0.798531,0.883598
base_jwe,0.537267,0.65102,0.716684,0.915548
WScP,0.494892,0.589759,0.730325,0.921666
W,0.510574,0.59986,0.731375,0.910108
WC,0.486946,0.59123,0.73872,0.918604


# Eval Grain Embedding