In [6]:
import codecs
import numpy as np
from numpy import linalg as LA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

def load_embeddings_from_np(filename):
    """Load a vocabulary file and its companion NumPy array.

    Two files are read, both derived from ``filename`` by appending a suffix:

    * ``filename + '.vocab'`` -- UTF-8 text, one entry per line; each line is
      stripped of surrounding whitespace.
    * ``filename + '.wv.npy'`` -- an array read with ``np.load``.

    Parameters
    ----------
    filename : str
        Common path prefix of the two files.

    Returns
    -------
    tuple
        ``(vocab, wv, w2i)``: the list of entries in file order, the loaded
        array, and a dict mapping each entry to its position in ``vocab``.
        Presumably ``wv`` has one row per vocabulary entry; that is not
        checked here.
    """
    print('loading ...')

    vocab = []
    with codecs.open(filename + '.vocab', 'r', 'utf-8') as vocab_file:
        for raw_line in vocab_file:
            vocab.append(raw_line.strip())

    # If an entry is repeated, the later position wins.
    w2i = dict(zip(vocab, range(len(vocab))))
    wv = np.load(filename + '.wv.npy')
    return vocab, wv, w2i

def load_dhdglove(path):
    """Load embeddings stored as a pickled ``word -> vector`` mapping.

    Parameters
    ----------
    path : str
        Path of the pickle file. The unpickled object is iterated for its
        keys and indexed by them, so it is expected to behave like a dict.

    Returns
    -------
    tuple
        ``(vocab, wv, w2i)``: the keys converted with ``str`` in iteration
        order, a float array built from the corresponding values, and a dict
        mapping each word to its position in ``vocab``.
    """
    # Imported locally so the function works even when no earlier cell has
    # imported pickle (the import cell next to this function does not).
    import pickle

    print('loading ...')

    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only use this on embedding files from a trusted source.
    with open(path, 'rb') as f_embed:
        debiased_embeds = pickle.load(f_embed)

    vocab = [str(w) for w in debiased_embeds]
    wv = np.array([np.array(debiased_embeds[w]) for w in debiased_embeds]).astype(float)
    w2i = {w: i for i, w in enumerate(vocab)}
    print(len(vocab), wv.shape, len(w2i))

    return vocab, wv, w2i

def load_wo_normalize(space, filename, vocab, wv, w2i):
    """Load one embedding set and register it under the key ``space``.

    The loader is picked from the last three characters of ``filename``:
    ``'txt'`` selects ``load_embeddings_from_np``, anything else selects
    ``load_dhdglove``. The vectors are stored exactly as the loader returns
    them.

    Parameters
    ----------
    space : hashable
        Key under which the results are stored in the three dicts.
    filename : str
        Path handed unchanged to the selected loader.
    vocab, wv, w2i : dict
        Containers updated in place with the loader's vocabulary list,
        vector array and word-to-index mapping respectively.
    """
    loader = load_embeddings_from_np if filename[-3:] == 'txt' else load_dhdglove
    words, vectors, index = loader(filename)

    vocab[space], wv[space], w2i[space] = words, vectors, index
    print('done')

In [7]:
# Containers keyed by embedding-space label; load_wo_normalize fills them in place.
vocab, wv, w2i = {}, {}, {}

# 'bef' and 'aft' label the two embedding files compared in the cells below.
for space, path in (
    ('bef', 'Gender-Biased Word Relation Task/data/embeddings/glove_wiki_vectors.txt'),
    ('aft', 'Gender-Biased Word Relation Task/data/embeddings/vectors_hd.txt'),
):
    load_wo_normalize(space, path, vocab, wv, w2i)

loading ...
done
loading ...
done


In [8]:
# Word -> vector lookups for each embedding set. similarity_eval is called
# with these variable names as strings, so the names must stay as they are.
orig_glove = {word: vector for word, vector in zip(vocab['bef'], wv['bef'])}
post_glove = {word: vector for word, vector in zip(vocab['aft'], wv['aft'])}

In [9]:
# File names of the word-similarity data sets to evaluate; the evaluation
# cell turns each one into a path before passing it to similarity_eval.
dataSets = ['EN-RG-65.txt', 'EN-WS-353-ALL.txt', 'EN-RW-STANFORD.txt', 'EN-MEN-TR-3k.txt', 'EN-MTurk-287.txt', 'EN-MTurk-771.txt', 'EN-SIMLEX-999.txt', 'EN-SimVerb-3500.txt']


def similarity_eval(dataSetAddress, wordVecModel_str):
    """Score an embedding model against a word-similarity data set.

    Every non-blank line of the data set is read as ``word_i word_j score``
    (whitespace separated). Pairs with both words in the model are ranked
    twice: by the data set score, highest first, and by the model's cosine
    distance, smallest first. The Spearman correlation between the two rank
    positions is returned.

    Parameters
    ----------
    dataSetAddress : str
        Path of the data set text file.
    wordVecModel_str : str or mapping
        Either the name of a variable holding a ``word -> vector`` mapping
        (the original calling convention, resolved with ``eval``), or the
        mapping itself. Vectors must support ``.reshape(1, -1)``.

    Returns
    -------
    float
        First element of the ``scipy.stats.spearmanr`` result.

    Raises
    ------
    IndexError, ValueError
        If a non-blank line has fewer than three fields or a score that is
        not a number.

    Notes
    -----
    Rank positions come from stable sorts, so tied scores keep their file
    order rather than sharing an averaged rank. This matches the original
    implementation and keeps previously reported numbers unchanged.
    """
    if isinstance(wordVecModel_str, str):
        # NOTE(review): eval runs arbitrary expressions; only pass trusted
        # variable names. Prefer handing in the mapping object directly.
        wordVecModel = eval(wordVecModel_str)
    else:
        wordVecModel = wordVecModel_str

    pair_list = []
    with open(dataSetAddress, "r") as fread_simlex:
        for line in fread_simlex:
            tokens = line.split()
            if not tokens:
                # Blank lines (e.g. a trailing newline) carry no pair.
                continue
            word_i = tokens[0]
            word_j = tokens[1]
            score = float(tokens[2])
            if word_i in wordVecModel and word_j in wordVecModel:
                pair_list.append(((word_i, word_j), score))

    # Order the pairs from highest score (most similar) to lowest score.
    pair_list.sort(key=lambda x: -x[1])

    extracted_list = []
    for (word_i, word_j), _score in pair_list:
        similarity = cosine_similarity(wordVecModel[word_i].reshape(1, -1),
                                       wordVecModel[word_j].reshape(1, -1))
        # cosine_similarity returns a 1x1 array; keep the scalar distance.
        extracted_list.append(((word_i, word_j), 1 - similarity[0, 0]))

    # Order the pairs from smallest distance (most similar) to largest.
    extracted_list.sort(key=lambda x: x[1])

    # Position of each pair in the model ranking. setdefault keeps the first
    # occurrence, which is what list.index returned for repeated pairs.
    model_position = {}
    for position, (word_pair, _distance) in enumerate(extracted_list):
        model_position.setdefault(word_pair, position)

    spearman_original_list = list(range(len(pair_list)))
    spearman_target_list = [model_position[word_pair] for word_pair, _score in pair_list]

    spearman_rho = spearmanr(spearman_original_list, spearman_target_list)

    return spearman_rho[0]

In [10]:
# Root folder of the evaluation resources; data sets live in its wordSimData/ subfolder.
resourceFile = 'data/'

for dataset in dataSets:
    dataSetAddress = resourceFile + 'wordSimData/' + dataset
    print('evaluating the data set', dataset)

    # Evaluate and report one model at a time so partial output survives a failure.
    rho_orig = similarity_eval(dataSetAddress, 'orig_glove')
    print('Glove + Orig : %.4f' % rho_orig)

    rho_hd = similarity_eval(dataSetAddress, 'post_glove')
    print('Glove + HD : %.4f' % rho_hd, '\n')

evaluating the data set EN-RG-65.txt
Glove + Orig : 0.7540
Glove + HD : 0.7648 

evaluating the data set EN-WS-353-ALL.txt
Glove + Orig : 0.6199
Glove + HD : 0.6207 

evaluating the data set EN-RW-STANFORD.txt
Glove + Orig : 0.3722
Glove + HD : 0.3720 

evaluating the data set EN-MEN-TR-3k.txt
Glove + Orig : 0.7216
Glove + HD : 0.7212 

evaluating the data set EN-MTurk-287.txt
Glove + Orig : 0.6480
Glove + HD : 0.6468 

evaluating the data set EN-MTurk-771.txt
Glove + Orig : 0.6486
Glove + HD : 0.6504 

evaluating the data set EN-SIMLEX-999.txt
Glove + Orig : 0.3474
Glove + HD : 0.3501 

evaluating the data set EN-SimVerb-3500.txt
Glove + Orig : 0.2038
Glove + HD : 0.2034 

