# Analogy Evaluation

## Load in the embeddings

In [None]:

import pandas as pd
import numpy as np
import pickle
from itertools import islice
from sklearn import preprocessing  # Two samples, with 3 dimensions.
import analyze
import evaluate
import imp
import time
from sklearn.metrics import pairwise_distances

path = './data/'

embedding_names = ['bow2','bow5', 'deps']
embedding_names = ['bow2', 'deps']

simlexf = 'Simlex/SimLex-999.txt'

#
# embeddings: list of dicts holding the data for each embedding
# dict: {name - embedding name,
#       filename - embedding data filename
#       words - dict holding the words  and their vectors
#    
embeddings = []


for name in embedding_names:
    # words: dict, holds all words and their vectors for each embedding
    words = {}
    words_normalized = {}
    filename = name + '.words.bz2'
    embedding_df = pd.read_table(path + filename , sep=' ', header=None)
    for index, row in embedding_df.iterrows():
        words[row[0]] = row[1:]
    
    
    embedding_vectors = np.stack(list(words.values()))
    embedding_vocabulary = np.stack(list(words.keys()))
    embeddings.append({'name':name, 'filename': filename, 
                            'words': words, ' embedding_vectors':  embedding_vectors,
                       'embedding_vocabulary': embedding_vocabulary})
        



In [None]:
## Function to calculate MRR
def MRR(X, Y, indices, exclude):
    # 
    # X: query vectors matrix (size: A x M)
    # Y: embedding vectors matrix (size: Nwords x Mfeatures)
    # indices: indices of the real (size: A)
    # exclude = list of indices to exclude for each analogy word
    # Compute distances between each codeword and each other codeword
    distance_matrix = scipy.spatial.distance.cdist(X, Y, metric='cosine')
    exclude = np.array(exclude)
    m = np.amax(distance_matrix)
    
    for i in [0,1,2]:
        #print(exclude[:,0].flatten())
        distance_matrix[np.arange(X.shape[0]), exclude[:,i].flatten()] =  m
    # Rank is the number of distances smaller than the correct distance, a
    # specified by the indices arg
    n_le = distance_matrix.T <= distance_matrix[np.arange(X.shape[0]), indices]
    n_lt = distance_matrix.T < distance_matrix[np.arange(X.shape[0]), indices]
    return (np.mean(1./n_le.sum(axis=0)),
           np.mean(1./(n_lt.sum(axis=0) + 1)))




### Load in the analogy dataset

In [5]:
# Analogy Task

analogy_file = 'questions-words.txt'
analogy_words = pd.read_table(path + analogy_file , sep=' ', header=1)
analogy_words.head()



Unnamed: 0,Athens,Greece,Baghdad,Iraq
0,Athens,Greece,Bangkok,Thailand
1,Athens,Greece,Beijing,China
2,Athens,Greece,Berlin,Germany
3,Athens,Greece,Bern,Switzerland
4,Athens,Greece,Cairo,Egypt


## Compute analogy word and compare to answer

In [55]:
start =  time.time()

all_bstars = []

for embedding in embeddings:
    words = embedding['words']
    embedding_vectors = np.stack(list(words.values()))
    embedding_vocabulary = np.stack(list(words.keys()))

    # warning here I only used the 9000 first words!
    # also breaks on first one
    indexes_found = []
    answers = []
    
    answer_embedding_indexes = []
    to_exclude = []
    all_bstars = []
    for index, row in analogy_words.iterrows():
            if row[0] in words.keys() and row[1] in words.keys() and row[2] in words.keys() and row[3] in words.keys():   
                #only do for a subset:
                indexes_found.append(index)
                a = words[row[0]]
                astar = words[row[1]]
                b = words[row[2]]
                vocab_keys = list(embedding_vocabulary)

                actual_bstar_index = vocab_keys.index(row[3])

                # Compute offset a* - a, add to b
                bstar = b.T + (astar.T - a.T)            
                cosine_sims = []
                cosine_sims_keys = []
                # Now that we computed  bstar find the closest vector to it
                # compute cosine similarity with all vectors
                # obtain the closest 

                #exclude = [row[0], row[1], row[2]]

              
                to_exclude.append([vocab_keys.index(row[0]),
                                  vocab_keys.index(row[1]),
                                  vocab_keys.index(row[2])])
                all_bstars.append(bstar)
                answer_embedding_indexes.append(actual_bstar_index)
    
    all_bstars_np = np.array(all_bstars)



    embeddingMRR = MRR(all_bstars_np, embedding_vectors, 
                       answer_embedding_indexes, to_exclude)
    
    print(embeddingMRR,len(indexes_found ))
    embedding.update({'MRR':embeddingMRR, 
                      'analogy_words_found': len(indexes_found)})
                      
end = time.time()
print(end-start, "seconds")
            

(0.70005419519385836, 0.70005419519385836) 9516
(0.69025550542683178, 0.69025550542683178) 9516
(0.67642604656615557, 0.67642604656615557) 9516
6475.316911935806 seconds


In [None]:
# Qualitative comparison look at the first 4 words

# Compute accuracy
