In [1]:
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Convert the GloVe vectors file to word2vec text format, then load the
# converted file; `model` is the embedding evaluated first below.
glove2word2vec('../embeddings/vectors.txt', '../embeddings/w2v_vectors.txt')
model = KeyedVectors.load_word2vec_format('../embeddings/w2v_vectors.txt')

# Second embedding, read straight from a binary word2vec-format file.
# NOTE(review): the "200" in the file name presumably is the vector size — confirm.
vk_file = '../SO_word2vec/SO_vectors_200.bin'
vk_model = KeyedVectors.load_word2vec_format(vk_file, binary=True)

In [2]:
import pandas as pd
# Read the tag-synonym table and report its row count.
# NOTE(review): read_csv's default NA parsing converts literal values such as
# 'null', 'nan' or 'NA' to NaN — confirm no tag names in this file are affected.
df = pd.read_csv('../so_tag_synonyms/TagSynonyms_20170613.txt')
print('Total SO Synonyms = {}'.format(len(df)))

Total SO Synonyms = 3650


In [3]:
def _get_model_pred_topk(model, df, k):
    """Flag, per synonym pair, whether the target is among the source's first k neighbours.

    Args:
        model: Word-vector model exposing ``most_similar(positive=[...])`` and a
            vocabulary container (``key_to_index`` on gensim >= 4, ``vocab`` on
            older releases).
        df: DataFrame with ``SourceTagName`` and ``TargetTagName`` columns.
        k: Number of leading ``most_similar`` entries to inspect.

    Returns:
        A list with one int per row of ``df``, in row order:
        ``-1`` if the source tag is not in the model vocabulary,
        ``1`` if the target tag is one of the first ``k`` neighbours,
        ``0`` otherwise (including when fewer than ``k`` neighbours come back).
    """
    # `vocab` was removed in gensim 4 in favour of `key_to_index`; support both.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab

    pred = []
    # Iterate by position rather than by index label, so the result lines up
    # with the rows even when `df` does not carry a default 0..n-1 index.
    for source, target in zip(df['SourceTagName'], df['TargetTagName']):
        if source not in vocab:
            pred.append(-1)
            continue
        neighbours = model.most_similar(positive=[source])
        # Slicing tolerates result lists shorter than k instead of raising.
        hit = any(entry[0] == target for entry in neighbours[:k])
        pred.append(1 if hit else 0)
    return pred


def get_model_pred_top1(model, df):
    """Return per-row flags: 1 if the nearest neighbour of the source tag is the target.

    Rows whose source tag is missing from the model vocabulary are flagged -1;
    all other rows are flagged 0.
    """
    return _get_model_pred_topk(model, df, 1)


def get_model_pred_top3(model, df):
    """Return per-row flags: 1 if the target is among the source tag's three nearest neighbours.

    Rows whose source tag is missing from the model vocabulary are flagged -1;
    all other rows are flagged 0.
    """
    return _get_model_pred_topk(model, df, 3)

In [4]:
# Score the GloVe-derived embedding. In the returned list, -1 marks a source
# tag absent from the model vocabulary and 1 marks a match.
pred = get_model_pred_top1(model, df)
print('Not in vocab = {}'.format(pred.count(-1)))
print('Top1 matches = {}'.format(pred.count(1)))

# Same synonym pairs, accepting a match anywhere in the first three neighbours.
pred = get_model_pred_top3(model, df)
print('Top3 matches = {}'.format(pred.count(1)))

Not in vocab = 1729
Top1 matches = 560
Top3 matches = 790


In [5]:
# Score the embedding loaded from the binary file. In the returned list, -1
# marks a source tag absent from the model vocabulary and 1 marks a match.
pred = get_model_pred_top1(vk_model, df)
print('Not in vocab = {}'.format(pred.count(-1)))
print('Top1 matches = {}'.format(pred.count(1)))

# Same synonym pairs, accepting a match anywhere in the first three neighbours.
pred = get_model_pred_top3(vk_model, df)
print('Top3 matches = {}'.format(pred.count(1)))

Not in vocab = 983
Top1 matches = 485
Top3 matches = 742
