# Word Similarity and Word Analogy

In [1]:
from fieldembed.keyedvectors import KeyedVectors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics


# Benchmark files consumed by `lexical_evals`, grouped by task key:
#   'sim' -> files passed to `wv.evaluate_word_pairs`
#   'ana' -> files passed to `wv.evaluate_word_analogies`
# Paths are relative to the notebook's working directory.
lexical_files = {
    'sim': ['fieldembed/sources/eval_en/sim/ws353.txt',
           'fieldembed/sources/eval_en/sim/ws353_relatedness.txt',
           'fieldembed/sources/eval_en/sim/ws353_similarity.txt'],
    'ana': ['fieldembed/sources/eval_en/ana/google_ana.txt',
            'fieldembed/sources/eval_en/ana/msr_ana.txt',
           ], 
}

def lexical_evals(wv, lexical_files, restrict_vocab=500000, case_insensitive=False):
    """Run the word-similarity and word-analogy benchmarks for one embedding.

    Parameters
    ----------
    wv : object
        Embedding exposing ``evaluate_word_pairs`` and
        ``evaluate_word_analogies``. It must not carry a non-None ``LKP``
        attribute (such embeddings are rejected as not token-level).
    lexical_files : dict
        Maps a task key to a list of benchmark file paths. ``'sim'`` files
        are scored with ``evaluate_word_pairs``; ``'ana'`` files with
        ``evaluate_word_analogies``. Any other key is ignored.
    restrict_vocab : int, optional
        Forwarded unchanged to both evaluation methods.
    case_insensitive : bool, optional
        Forwarded unchanged to both evaluation methods.

    Returns
    -------
    dict
        ``{<file name>: spearman correlation}`` for similarity files and
        ``{<file name>_<section name>: accuracy}`` for every analogy section.
        A section with no evaluated questions gets ``nan`` instead of
        raising ``ZeroDivisionError``.

    Raises
    ------
    ValueError
        If ``wv.LKP`` exists and is not None.
    """
    if getattr(wv, 'LKP', None) is not None:
        # The original `raise('...')` raised a plain string, which Python 3
        # turns into an unrelated TypeError; raise a real exception instead.
        raise ValueError('This is for token level embeddings')

    d = {}
    for task, files in lexical_files.items():
        if task == 'sim':
            for file in files:
                pearson, spearman, oov_ratio = wv.evaluate_word_pairs(
                    file, restrict_vocab=restrict_vocab, case_insensitive=case_insensitive)
                d[file.split('/')[-1]] = spearman.correlation

        elif task == 'ana':
            for file in files:
                analogies_score, sections = wv.evaluate_word_analogies(
                    file, restrict_vocab=restrict_vocab, case_insensitive=case_insensitive)
                for section in sections:
                    correct = len(section['correct'])
                    total = correct + len(section['incorrect'])
                    # An empty section has no defined accuracy.
                    accuracy = correct / total if total else float('nan')
                    d[file.split('/')[-1] + '_' + section['section']] = accuracy
    return d



## Baseline

In [2]:
import os
base_dir = 'embeddings/baseline/WikiEnglish/'

def EmbeddingModelsReader(base_dir):
    """Collect baseline embedding files found anywhere below ``base_dir``.

    Walks the directory tree and returns, in ``os.walk`` order, the joined
    path of every file whose name contains the substring ``'word'``.
    Returns an empty list when nothing matches.
    """
    model_paths = []
    for dirpath, _subdirs, filenames in os.walk(base_dir):
        model_paths.extend(
            os.path.join(dirpath, filename)
            for filename in filenames
            if 'word' in filename
        )
    return model_paths
    
modelnames = EmbeddingModelsReader(base_dir)
modelnames

['embeddings/baseline/WikiEnglish/word/word2vec/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200',
 'embeddings/baseline/WikiEnglish/word/cwe/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200']

In [3]:
D_baseline = {}

In [4]:
for wv_file in modelnames:
    # Evaluate every baseline model and store its scores in D_baseline,
    # keyed by the model path. There is no skip-if-done guard here, so
    # re-running the cell re-evaluates every model.

    # 1. load data
    print(wv_file)
    # Paths containing 'cwe' are loaded with a tab separator, all others
    # with a single space.
    sep = ' ' if 'cwe' not in wv_file else '\t'
    word_vec = KeyedVectors.load_word2vec_format(wv_file, sep = sep)
    # 2. run the similarity / analogy benchmarks
    d = lexical_evals(word_vec, lexical_files)
    D_baseline[wv_file] = d

embeddings/baseline/WikiEnglish/word/word2vec/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200
649070 200



ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/floydluo/Environments/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-711a982bb0b7>", line 10, in <module>
    d = lexical_evals(word_vec, lexical_files)
  File "<ipython-input-1-c79f77b9e017>", line 28, in lexical_evals
    analogies_score, sections = wv.evaluate_word_analogies(file, restrict_vocab=500000, case_insensitive=False)
  File "/home/floydluo/Desktop/fieldembed/fieldembed/keyedvectors.py", line 918, in evaluate_word_analogies
    sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab)
  File "/home/floydluo/Desktop/fieldembed/fieldembed/keyedvectors.py", line 435, in most_similar
    dists = dot(limited, mean)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/floydluo/Environmen

KeyboardInterrupt: 

In [None]:
import pandas as pd

pd.DataFrame(D_baseline).T#.columns

## Field Embed

In [56]:
import os
base_dir = 'embeddings/fieldembed/WikiEnglish/'

def EmbeddingModelsReader(base_dir):
    """Locate FieldEmbed model files below ``base_dir``.

    NOTE: this redefines the baseline reader of the same name declared
    earlier in the notebook and returns a different shape.

    Files whose name contains ``'.npy'`` are ignored. Of the remaining
    files, names containing ``'_right_'`` are collected into one flat list;
    names containing ``'_left_'`` are grouped per directory.

    Returns
    -------
    (Left, Right)
        ``Left`` holds one list per directory that contains at least one
        file (the list may be empty); ``Right`` is the flat list of
        right-side model paths. Both follow ``os.walk`` order.
    """
    left_groups, right_models = [], []
    for dirpath, _subdirs, filenames in os.walk(base_dir):
        if not filenames:
            continue
        group = []
        for filename in filenames:
            if '.npy' in filename:
                continue
            full_path = os.path.join(dirpath, filename)
            if '_right_' in filename:
                right_models.append(full_path)
            elif '_left_' in filename:
                group.append(full_path)
        left_groups.append(group)
    return left_groups, right_models
    
Left, Right = EmbeddingModelsReader(base_dir)
Right


['embeddings/fieldembed/WikiEnglish/word/token/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF1-SmpGrT/200_right_word',
 'embeddings/fieldembed/WikiEnglish/word/token/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF1-SmpGrT/200_right_word',
 'embeddings/fieldembed/WikiEnglish/word/token_char/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF3-SmpGrT/200_right_word',
 'embeddings/fieldembed/WikiEnglish/word/token_char/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/200_right_word',
 'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme_pos_en/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF3-SmpGrT/200_right_word',
 'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme_pos_en/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/200_right_word',
 'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF3-SmpGrT/200_right_word',
 'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme/cb-it5-w5-ng10-lr0.025-smp0.00

In [9]:
# Reuse the results already accumulated in this kernel session when they
# exist; otherwise start from an empty cache. Previously `D_all` was only
# defined in a commented-out line, so this cell raised NameError on a
# fresh kernel (Restart & Run All).
D_all = globals().get('D_all', {})
D_pack = D_all

In [57]:

for wv_file in Right:
    # Evaluate each right-side FieldEmbed model once; results are cached in
    # D_pack keyed by model path, so already-evaluated models are skipped
    # when the cell is re-run.
    if wv_file in D_pack: continue 
    print(wv_file)
    # 1. load data
    word_vec = KeyedVectors.load(wv_file)
    # 2. run the similarity / analogy benchmarks
    d = lexical_evals(word_vec, lexical_files)
    D_pack[wv_file] = d
    print(d)
    

embeddings/fieldembed/WikiEnglish/word/token_char_phoneme_pos_en/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/200_right_word
{'ws353.txt': 0.6851494654789579, 'ws353_relatedness.txt': 0.6130753807016097, 'ws353_similarity.txt': 0.7668048154561964, 'google_ana.txt_semantics': 0.7324957167332953, 'google_ana.txt_syntax': 0.6641686182669789, 'google_ana.txt_Total accuracy': 0.6949562532166752, 'msr_ana.txt_syntactial-analogy': 0.5900383141762452, 'msr_ana.txt_Total accuracy': 0.5900383141762452}


In [58]:
# renames = dict(zip(['embeddings/fieldembed/WikiEnglish/word/token/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF1-SmpGrT/200_right_word',
#        'embeddings/fieldembed/WikiEnglish/word/token_char/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF3-SmpGrT/200_right_word',
#        'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF3-SmpGrT/200_right_word',
#        'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme_pos_en/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF3-SmpGrT/200_right_word'],
#                'T TC TCPh TCPhPos'.split()))

In [59]:
# D_fld = {}
# for name, value in D_all.items():
#     new_name = renames[name]
#     D_fld[new_name] = value
    
import pandas as pd
# One row per evaluated FieldEmbed model, columns restricted to the headline
# similarity / analogy metrics produced by `lexical_evals`.
pd.DataFrame(D_pack).T[['ws353.txt', 'ws353_relatedness.txt',
       'ws353_similarity.txt', 'google_ana.txt_Total accuracy', 'google_ana.txt_semantics',
       'google_ana.txt_syntax', 'msr_ana.txt_Total accuracy' ]]

Unnamed: 0,ws353.txt,ws353_relatedness.txt,ws353_similarity.txt,google_ana.txt_Total accuracy,google_ana.txt_semantics,google_ana.txt_syntax,msr_ana.txt_Total accuracy
embeddings/fieldembed/WikiEnglish/word/token/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF1-SmpGrT/200_right_word,0.713574,0.625117,0.812294,0.769686,0.805939,0.739953,0.636899
embeddings/fieldembed/WikiEnglish/word/token_char/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF3-SmpGrT/200_right_word,0.7281,0.652469,0.804864,0.765878,0.81462,0.725902,0.611111
embeddings/fieldembed/WikiEnglish/word/token_char_phoneme_pos_en/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF3-SmpGrT/200_right_word,0.726368,0.652042,0.806802,0.746423,0.80514,0.698267,0.585028
embeddings/fieldembed/WikiEnglish/word/token_char_phoneme/cb-it5-w5-ng10-lr0.025-smp1e-05-nsexp0.75-th4/LF3-SmpGrT/200_right_word,0.727665,0.650995,0.811416,0.751004,0.803541,0.707916,0.598291
embeddings/fieldembed/WikiEnglish/word/token/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF1-SmpGrT/200_right_word,0.681024,0.588098,0.772408,0.709727,0.718218,0.702763,0.636457
embeddings/fieldembed/WikiEnglish/word/token_char/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/200_right_word,0.683803,0.600199,0.76915,0.700926,0.720388,0.684965,0.621574
embeddings/fieldembed/WikiEnglish/word/token_char_phoneme/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/200_right_word,0.682279,0.606243,0.767073,0.690221,0.710908,0.673255,0.603448
embeddings/fieldembed/WikiEnglish/word/token_char_phoneme_pos_en/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/200_right_word,0.685149,0.613075,0.766805,0.694956,0.732496,0.664169,0.590038


In [50]:
# Same metric columns as the FieldEmbed table, for the baseline models.
# NOTE(review): this cell's execution count (50) precedes the cells above
# it, so the table was produced out of order — re-run top to bottom to confirm.
pd.DataFrame(D_baseline).T[['ws353.txt', 'ws353_relatedness.txt',
       'ws353_similarity.txt', 'google_ana.txt_Total accuracy', 'google_ana.txt_semantics',
       'google_ana.txt_syntax', 'msr_ana.txt_Total accuracy' ]]

Unnamed: 0,ws353.txt,ws353_relatedness.txt,ws353_similarity.txt,google_ana.txt_Total accuracy,google_ana.txt_semantics,google_ana.txt_syntax,msr_ana.txt_Total accuracy
embeddings/baseline/WikiEnglish/word/word2vec/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200,0.651389,0.560435,0.753469,0.651158,0.6506,0.651616,0.574565
embeddings/baseline/WikiEnglish/word/cwe/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200,0.654654,0.565359,0.760095,0.648121,0.648201,0.648056,0.573534


# Text Classification (Cls)

In [5]:
from fieldembed.keyedvectors import KeyedVectors
import os

class FldEmbed_Container(object):
    """Load the right-side and left-side embeddings of one FieldEmbed model.

    Attributes
    ----------
    path : str
        The directory passed to the constructor.
    Right : str
        Path of the first file (in ``os.walk`` order) whose name contains
        ``'_right_'`` and does not contain ``'.npy'``.
    Left : list of str
        Paths of the ``'_left_'`` files (again excluding ``'.npy'`` names)
        located in the same directory as ``Right``.
    wv_neg : KeyedVectors
        Embedding loaded from ``Right``.
    weights : dict
        Maps a field name to its embedding. The field name is ``'pos_en'``
        when the file name contains it, otherwise the text after the last
        ``'_'`` of the file name. The ``'token'`` field reuses ``wv_neg``
        instead of loading its left-side file.
    """

    def __init__(self, path):
        """Scan ``path`` for model files and load them.

        Raises
        ------
        FileNotFoundError
            If no ``'_right_'`` model file exists below ``path`` (the
            original code failed with a bare IndexError here).
        """
        self.path = path

        right_model = None
        left_models = []
        # Distinct loop names: the original reused `path` as the loop variable
        # and so shadowed the constructor argument.
        for dirpath, _subdirs, filenames in os.walk(path):
            candidates = [name for name in filenames if '.npy' not in name]
            right_names = [name for name in candidates if '_right_' in name]
            if not right_names:
                continue
            right_model = os.path.join(dirpath, right_names[0])
            # Take the left files from the directory that holds the right
            # model, so both sides always belong to the same trained model.
            left_models = [
                os.path.join(dirpath, name)
                for name in candidates
                if '_left_' in name and '_right_' not in name
            ]
            break

        if right_model is None:
            raise FileNotFoundError(
                "no '_right_' embedding file found under %r" % (path,))

        self.Left = left_models
        self.Right = right_model

        self.wv_neg = KeyedVectors.load(self.Right)
        self.weights = {}
        for pth in self.Left:
            end = os.path.basename(pth)
            # 'pos_en' itself contains '_', so it cannot be recovered by
            # splitting on '_' like the other field names.
            fld = 'pos_en' if 'pos_en' in end else end.split('_')[-1]
            if fld == 'token':
                self.weights[fld] = self.wv_neg
            else:
                self.weights[fld] = KeyedVectors.load(pth)

path = 'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme_pos_en/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/'
fldembed_container = FldEmbed_Container(path)
print(fldembed_container.wv_neg.vectors.shape)

(649068, 200)


In [6]:
from fieldembed.keyedvectors import KeyedVectors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn import metrics

from nlptext.sentence import Sentence
from nlptext.base import BasicObject
import numpy as np


# Initialise the corpus index from the pickles under Data_Dir (the recorded
# output lists CORPUS / GROUP / TEXT / SENT / TOKEN being read).
Data_Dir = 'data/newsgroup/word/'
BasicObject.INIT_FROM_PICKLE(Data_Dir)
# `nlptext` is an alias for the BasicObject class itself, not an instance;
# convert_document_to_X_and_Y reads nlptext.SENT['length'] from it.
nlptext = BasicObject


def getsent2matrix(sent, wv):
    """Return the embedding rows for the known tokens of ``sent``.

    The lookup runs on ``wv.derivative_wv`` after calling its
    ``set_GU_and_TU()``. The vocabulary is ``TU`` when it is not None and
    ``GU`` otherwise; element ``[1]`` of that vocabulary is used as the
    string -> row-index mapping.

    Tokens missing from the mapping are silently dropped.
    TODO: decide how unknown tokens should be handled (index 0 is not an
    UNK row).
    """
    token_strs = [grain[0] for grain in sent.get_grain_str('token')]

    derived = wv.derivative_wv
    derived.set_GU_and_TU()
    vocab = derived.TU  # LGU in derivative wv is LTU
    if vocab is None:
        vocab = derived.GU
    str2idx = vocab[1]

    row_ids = []
    for token_str in token_strs:
        if token_str in str2idx:
            row_ids.append(str2idx.get(token_str))
    return derived.vectors[row_ids]


def getsent2matrixfromhyper(sent, wv, fld, tagScheme = 'BIOES'):
    """Return the embedding rows for the known ``fld`` grains of ``sent``.

    Grain strings come from ``sent.get_grain_str(fld, tagScheme=...)`` and
    are looked up directly in ``wv.GU[1]`` (string -> row index), with no
    derivative embedding involved.

    Grains missing from the mapping are silently dropped (index 0 is not
    an UNK row; still to be fixed).
    """
    grains = sent.get_grain_str(fld, tagScheme = tagScheme)
    str2idx = wv.GU[1]

    row_ids = []
    for grain in grains:
        grain_str = grain[0]
        if grain_str in str2idx:
            row_ids.append(str2idx.get(grain_str))
    return wv.vectors[row_ids]


def convert_document_to_X_and_Y(nlptext, wv, fld = 'token', tagScheme = 'BIOES'):
    """Build the averaged-embedding feature matrix and the label vector.

    For every sentence index in ``range(nlptext.SENT['length'])`` the grain
    embeddings of that sentence are averaged into one row of the feature
    matrix, and ``sent.IdxGroup`` is stored as its label (presumably the
    newsgroup index — confirm against nlptext).

    Parameters
    ----------
    nlptext : object
        Must expose ``SENT['length']``.
    wv : object
        Embedding with ``vector_size``; passed to ``getsent2matrix`` or
        ``getsent2matrixfromhyper``.
    fld : str, optional
        ``'pos'`` and ``'pos_en'`` use ``getsent2matrixfromhyper``; any
        other value uses ``getsent2matrix``.
    tagScheme : str, optional
        Only forwarded on the ``'pos'`` / ``'pos_en'`` path.

    Returns
    -------
    (docmatrix, labels)
        ``docmatrix`` has shape ``(doc_num, wv.vector_size)``; ``labels``
        is an int array of length ``doc_num``.
    """
    doc_num = nlptext.SENT['length']

    docmatrix = np.zeros((doc_num, wv.vector_size))
    labels = np.zeros(doc_num, dtype = int)

    is_hyper = fld in ['pos', 'pos_en']
    if is_hyper:
        print('hyper fields')

    for sentidx in range(doc_num):
        sent = Sentence(sentidx)
        if is_hyper:
            matrix = getsent2matrixfromhyper(sent, wv, fld, tagScheme)
        else:
            matrix = getsent2matrix(sent, wv)
        # A sentence with no in-vocabulary grains gives an empty matrix;
        # np.mean over it would put NaN into the row and break the
        # classifier downstream. Keep the zero row in that case.
        if len(matrix) > 0:
            docmatrix[sentidx] = np.mean(matrix, axis = 0)
        labels[sentidx] = sent.IdxGroup
    return docmatrix, labels

CORPUS	read from pickle file : data/newsgroup/word/Pyramid/CORPUS.p
CORPUS	the length of it is   : 1
GROUP	read from pickle file : data/newsgroup/word/Pyramid/GROUP.p
GROUP	the length of it is   : 20
TEXT	read from pickle file : data/newsgroup/word/Pyramid/TEXT.p
TEXT	the length of it is   : 18756
SENT	read from pickle file : data/newsgroup/word/Pyramid/SENT.p
SENT	the length of it is   : 18756
TOKEN	read from pickle file : data/newsgroup/word/Pyramid/TOKEN.p
TOKEN	the length of it is   : 6857474
**************************************** 



In [8]:
from nlptext.folder import Folder
# Print each group's folder name followed by the width of its sentence
# index range (end - start), i.e. how many sentences belong to it.
for i in range(BasicObject.GROUP['length']):
    f = Folder(i)
    print(f.name)
    s, e = f.IdxSentStartEnd
    print(e - s)


corpus/newsgroup/talk.politics.guns
910
corpus/newsgroup/comp.os.ms-windows.misc
985
corpus/newsgroup/rec.motorcycles
994
corpus/newsgroup/talk.politics.misc
775
corpus/newsgroup/comp.sys.ibm.pc.hardware
977
corpus/newsgroup/soc.religion.christian
997
corpus/newsgroup/sci.med
987
corpus/newsgroup/talk.religion.misc
625
corpus/newsgroup/comp.sys.mac.hardware
946
corpus/newsgroup/misc.forsale
969
corpus/newsgroup/alt.atheism
792
corpus/newsgroup/sci.crypt
990
corpus/newsgroup/rec.sport.hockey
993
corpus/newsgroup/talk.politics.mideast
940
corpus/newsgroup/rec.autos
986
corpus/newsgroup/sci.electronics
975
corpus/newsgroup/comp.windows.x
976
corpus/newsgroup/sci.space
984
corpus/newsgroup/rec.sport.baseball
987
corpus/newsgroup/comp.graphics
968


In [7]:
from nlptext.sentence import Sentence

# Smoke test of the 'pos_en' path on one sentence and then on the whole
# corpus. Relies on `fldembed_container` already existing in the kernel.
sent = Sentence(100)
sent  # bare expression mid-cell: not displayed, has no effect
wv = fldembed_container.weights['pos_en']
# wv.GU[0]

fld = 'pos_en'
getsent2matrixfromhyper(sent, wv, fld, tagScheme = 'BIOES')

# Last expression of the cell: its (docmatrix, labels) tuple is the output.
convert_document_to_X_and_Y(nlptext, wv, fld = fld, tagScheme = 'BIOES')

hyper fields


(array([[-0.1672108 , -0.01414133,  0.11257391, ...,  0.04140232,
          0.00370974,  0.0330977 ],
        [-0.22841461, -0.01087983,  0.11954738, ...,  0.0191622 ,
         -0.03443069,  0.09741449],
        [-0.1842076 , -0.00941621,  0.10770255, ...,  0.02106173,
         -0.00480307,  0.06171349],
        ...,
        [-0.20721146, -0.00891671,  0.10875595, ...,  0.01490752,
         -0.00537327,  0.05342511],
        [-0.18790171,  0.00268668,  0.11410276, ...,  0.02615875,
          0.0103227 ,  0.05485161],
        [-0.20053528,  0.00200385,  0.10773578, ...,  0.02444534,
         -0.02212308,  0.0611475 ]]), array([ 0,  0,  0, ..., 19, 19, 19]))

## Word

### Word Embedding

In [3]:
# Classification with the baseline word2vec embedding: average the token
# vectors of every document and cross-validate an SVM on the result.
path = 'embeddings/baseline/WikiEnglish/word/word2vec/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/word200'

wv = KeyedVectors.load_word2vec_format(path)
docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv)
X = docmatrix
Y = labels



##########################################################
# NOTE(review): the recorded output below this cell shows both an 'rbf' and
# a 'linear' run; only the 'linear' configuration is reproducible from this code.
clf = svm.SVC(C=10.0, 
              kernel='linear', 
              degree=3, # only used by the 'poly' kernel
              gamma='auto', # kernel coefficient for 'rbf', 'poly', and 'sigmoid'
              coef0=0.0,
              shrinking=True,
              probability=False,
              tol=0.001,
              class_weight=None,
              cache_size=200,  
              verbose=False,
              max_iter=-1, 
              decision_function_shape='ovo', 
              random_state=10, )
# 5-fold cross-validation; `scores` holds one score per fold.
scores = cross_val_score(clf, X, Y, cv = 5)


# Summary: mean fold score and two standard deviations of the fold scores.
evals_result = {}
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
# Disabled: would store the summary in a results dict `D`.
# D[fldembed_container.path + 'WORD'] = evals_result


print("==="*30)
print(clf)
print(scores.mean())
print(scores.std() * 2)
print(evals_result)
print("==="*30, '\n\n')





649070 200

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.7127835043893007
0.003427405158593468
{'cls_score_mean': 0.7127835043893007, 'cls_score_2std': 0.003427405158593468}


SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.6947639826112375
0.016212765649845213
{'cls_score_mean': 0.6947639826112375, 'cls_score_2std': 0.016212765649845213}




NameError: name 'LogisticRegressionCV' is not defined

## Word.Char

In [6]:
import numpy as np
import os 

path = 'embeddings/fieldembed/WikiEnglish/word/token_char/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/'
fldembed_container = FldEmbed_Container(path)
print(fldembed_container.wv_neg.vectors.shape)

(649068, 200)


### Word Embedding

In [7]:
# Classification with the right-side word embedding only (`wv_neg`) of the
# container built in the previous cell.
word_embedding = fldembed_container.wv_neg
word_vec = word_embedding
docmatrix, labels = convert_document_to_X_and_Y(nlptext, word_vec)
X = docmatrix
Y = labels



##########################################################
# Same SVM configuration as the other classification cells.
clf = svm.SVC(C=10.0, 
              kernel='linear', 
              degree=3, # only used by the 'poly' kernel
              gamma='auto', # kernel coefficient for 'rbf', 'poly', and 'sigmoid'
              coef0=0.0,
              shrinking=True,
              probability=False,
              tol=0.001,
              class_weight=None,
              cache_size=200,  
              verbose=False,
              max_iter=-1, 
              decision_function_shape='ovo', 
              random_state=10, )
# 5-fold cross-validation; `scores` holds one score per fold.
scores = cross_val_score(clf, X, Y, cv = 5)


# Summary: mean fold score and two standard deviations of the fold scores.
evals_result = {}
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
# Disabled: would store the summary in a results dict `D`.
# D[fldembed_container.path + 'WORD'] = evals_result


print("==="*30)
print(clf)
print(scores.mean())
print(scores.std() * 2)
print(evals_result)
print("==="*30)



SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.664584539997819
0.012377715489200474
{'cls_score_mean': 0.664584539997819, 'cls_score_2std': 0.012377715489200474}


SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.7298418940688691
0.018615424533721486
{'cls_score_mean': 0.7298418940688691, 'cls_score_2std': 0.018615424533721486}
0.7142349386884663


### Grain Embedding

In [8]:
# Classification with all field embeddings of the container: build one
# document matrix per field and concatenate them column-wise.
X_all = []
for fld, wv in fldembed_container.weights.items():
    # `fld` is not forwarded here, so every field goes through the default
    # fld='token' path of convert_document_to_X_and_Y.
    docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv)
    X_all.append(docmatrix)
    # Overwritten on every iteration; labels come from sent.IdxGroup and do
    # not depend on the field.
    Y = labels
X = np.concatenate(X_all, 1)
print(X.shape, Y.shape)


##########################################################
# Same SVM configuration as the other classification cells.
clf = svm.SVC(C=10.0, 
              kernel='linear', 
              degree=3, # only used by the 'poly' kernel
              gamma='auto', # kernel coefficient for 'rbf', 'poly', and 'sigmoid'
              coef0=0.0,
              shrinking=True,
              probability=False,
              tol=0.001,
              class_weight=None,
              cache_size=200,  
              verbose=False,
              max_iter=-1, 
              decision_function_shape='ovo', 
              random_state=10, )
# 5-fold cross-validation; `scores` holds one score per fold.
scores = cross_val_score(clf, X, Y, cv = 5)


# Summary: mean fold score and two standard deviations of the fold scores.
evals_result = {}
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
# Disabled: would store the summary in a results dict `D`.
# D[fldembed_container.path + 'WORD'] = evals_result


print("==="*30)
print(clf)
print(scores.mean())
print(scores.std() * 2)
print(evals_result)
print("==="*30)


(18756, 400) (18756,)
SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.6418179989320805
0.013392143451656735
{'cls_score_mean': 0.6418179989320805, 'cls_score_2std': 0.013392143451656735}


SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.7535698301124573
0.01681611458370847
{'cls_score_mean': 0.7535698301124573, 'cls_score_2std': 0.01681611458370847}
0.7497778567620401


## Word.Char.Phoneme

In [9]:
import numpy as np

path = 'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/'
fldembed_container = FldEmbed_Container(path)
print(fldembed_container.wv_neg.vectors.shape)

(649068, 200)


### Word Embedding

In [10]:
# Classification with the right-side word embedding only (`wv_neg`) of the
# container built in the previous cell.
word_embedding = fldembed_container.wv_neg
word_vec = word_embedding
docmatrix, labels = convert_document_to_X_and_Y(nlptext, word_vec)
X = docmatrix
Y = labels


##########################################################
# Same SVM configuration as the other classification cells.
clf = svm.SVC(C=10.0, 
              kernel='linear', 
              degree=3, # only used by the 'poly' kernel
              gamma='auto', # kernel coefficient for 'rbf', 'poly', and 'sigmoid'
              coef0=0.0,
              shrinking=True,
              probability=False,
              tol=0.001,
              class_weight=None,
              cache_size=200,  
              verbose=False,
              max_iter=-1, 
              decision_function_shape='ovo', 
              random_state=10, )
# 5-fold cross-validation; `scores` holds one score per fold.
scores = cross_val_score(clf, X, Y, cv = 5)


# Summary: mean fold score and two standard deviations of the fold scores.
evals_result = {}
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
# Disabled: would store the summary in a results dict `D`.
# D[fldembed_container.path + 'WORD'] = evals_result


print("==="*30)
print(clf)
print(scores.mean())
print(scores.std() * 2)
print(evals_result)
print("==="*30)



SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.676952590332353
0.010823236931206069
{'cls_score_mean': 0.676952590332353, 'cls_score_2std': 0.010823236931206069}


SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.730272738014059
0.01465113545646962
{'cls_score_mean': 0.730272738014059, 'cls_score_2std': 0.01465113545646962}
0.716012084592145


### Grain Embedding

In [11]:
# Classification with all field embeddings of the container: build one
# document matrix per field and concatenate them column-wise.
X_all = []
for fld, wv in fldembed_container.weights.items():
    # `fld` is not forwarded here, so every field goes through the default
    # fld='token' path of convert_document_to_X_and_Y.
    docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv)
    X_all.append(docmatrix)
    # Overwritten on every iteration; labels come from sent.IdxGroup and do
    # not depend on the field.
    Y = labels
X = np.concatenate(X_all, 1)
print(X.shape, Y.shape)


##########################################################
# Same SVM configuration as the other classification cells.
clf = svm.SVC(C=10.0, 
              kernel='linear', 
              degree=3, # only used by the 'poly' kernel
              gamma='auto', # kernel coefficient for 'rbf', 'poly', and 'sigmoid'
              coef0=0.0,
              shrinking=True,
              probability=False,
              tol=0.001,
              class_weight=None,
              cache_size=200,  
              verbose=False,
              max_iter=-1, 
              decision_function_shape='ovo', 
              random_state=10, )
# 5-fold cross-validation; `scores` holds one score per fold.
scores = cross_val_score(clf, X, Y, cv = 5)


# Summary: mean fold score and two standard deviations of the fold scores.
evals_result = {}
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
# Disabled: would store the summary in a results dict `D`.
# D[fldembed_container.path + 'WORD'] = evals_result


print("==="*30)
print(clf)
print(scores.mean())
print(scores.std() * 2)
print(evals_result)
print("==="*30)




(18756, 600) (18756,)
SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.6375000956620048
0.01085022399949969
{'cls_score_mean': 0.6375000956620048, 'cls_score_2std': 0.01085022399949969}


SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.7682375148506465
0.013969245847357202
{'cls_score_mean': 0.7682375148506465, 'cls_score_2std': 0.013969245847357202}




KeyboardInterrupt: 

## Word.Char.Phoneme.Pos

In [7]:
import numpy as np

path = 'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme_pos_en/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/'
fldembed_container = FldEmbed_Container(path)
print(fldembed_container.wv_neg.vectors.shape)


(649068, 200)


In [8]:
fldembed_container.weights

{'char': <fieldembed.keyedvectors.Word2VecKeyedVectors at 0x7ffb77c164a8>,
 'phoneme': <fieldembed.keyedvectors.Word2VecKeyedVectors at 0x7ffb0b25f128>,
 'pos_en': <fieldembed.keyedvectors.Word2VecKeyedVectors at 0x7ffb11d7eeb8>,
 'token': <fieldembed.keyedvectors.Word2VecKeyedVectors at 0x7ffb77c16828>}

### Word Embedding

In [13]:
# Classification with the right-side word embedding only (`wv_neg`) of the
# container built earlier in this section.
word_embedding = fldembed_container.wv_neg
word_vec = word_embedding
docmatrix, labels = convert_document_to_X_and_Y(nlptext, word_vec)
X = docmatrix
Y = labels


##########################################################
# Same SVM configuration as the other classification cells.
clf = svm.SVC(C=10.0, 
              kernel='linear', 
              degree=3, # only used by the 'poly' kernel
              gamma='auto', # kernel coefficient for 'rbf', 'poly', and 'sigmoid'
              coef0=0.0,
              shrinking=True,
              probability=False,
              tol=0.001,
              class_weight=None,
              cache_size=200,  
              verbose=False,
              max_iter=-1, 
              decision_function_shape='ovo', 
              random_state=10, )
# 5-fold cross-validation; `scores` holds one score per fold.
scores = cross_val_score(clf, X, Y, cv = 5)


# Summary: mean fold score and two standard deviations of the fold scores.
evals_result = {}
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
# Disabled: would store the summary in a results dict `D`.
# D[fldembed_container.path + 'WORD'] = evals_result


print("==="*30)
print(clf)
print(scores.mean())
print(scores.std() * 2)
print(evals_result)
print("==="*30)



SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.6800999013760063
0.012479174058799584
{'cls_score_mean': 0.6800999013760063, 'cls_score_2std': 0.012479174058799584}


SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.7703661898916893
0.007165291896210835
{'cls_score_mean': 0.7703661898916893, 'cls_score_2std': 0.007165291896210835}


KeyboardInterrupt: 

### Grain Embedding

In [9]:
fldembed_container.weights

{'char': <fieldembed.keyedvectors.Word2VecKeyedVectors at 0x7ffb77c164a8>,
 'phoneme': <fieldembed.keyedvectors.Word2VecKeyedVectors at 0x7ffb0b25f128>,
 'pos_en': <fieldembed.keyedvectors.Word2VecKeyedVectors at 0x7ffb11d7eeb8>,
 'token': <fieldembed.keyedvectors.Word2VecKeyedVectors at 0x7ffb77c16828>}

In [10]:
# Classification with all field embeddings of the container: build one
# document matrix per field and concatenate them column-wise.
X_all = []
for fld, wv in fldembed_container.weights.items():
    # Unlike the earlier grain cells, `fld` is forwarded here, so 'pos_en'
    # takes the getsent2matrixfromhyper path with the BIOES tag scheme.
    docmatrix, labels = convert_document_to_X_and_Y(nlptext, wv, fld = fld, tagScheme = 'BIOES')
    X_all.append(docmatrix)
    # Overwritten on every iteration; labels come from sent.IdxGroup and do
    # not depend on the field.
    Y = labels
X = np.concatenate(X_all, 1)
print(X.shape, Y.shape)


##########################################################
# Same SVM configuration as the other classification cells.
clf = svm.SVC(C=10.0, 
              kernel='linear', 
              degree=3, # only used by the 'poly' kernel
              gamma='auto', # kernel coefficient for 'rbf', 'poly', and 'sigmoid'
              coef0=0.0,
              shrinking=True,
              probability=False,
              tol=0.001,
              class_weight=None,
              cache_size=200,  
              verbose=False,
              max_iter=-1, 
              decision_function_shape='ovo', 
              random_state=10, )
# 5-fold cross-validation; `scores` holds one score per fold.
scores = cross_val_score(clf, X, Y, cv = 5)


# Summary: mean fold score and two standard deviations of the fold scores.
evals_result = {}
evals_result['cls_score_mean'] = scores.mean()
evals_result['cls_score_2std'] = scores.std() * 2
# Disabled: would store the summary in a results dict `D`.
# D[fldembed_container.path + 'WORD'] = evals_result


print("==="*30)
print(clf)
print(scores.mean())
print(scores.std() * 2)
print(evals_result)
print("==="*30)



hyper fields
(18756, 800) (18756,)
SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.6167086333710072
0.01005100089158855
{'cls_score_mean': 0.6167086333710072, 'cls_score_2std': 0.01005100089158855}


SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)
0.8012352790845343
0.006625365255477827
{'cls_score_mean': 0.8012352790845343, 'cls_score_2std': 0.006625365255477827}
