# Analogy Evaluation

## Load in the embeddings

In [1]:

import pandas as pd
import numpy as np
import pickle
from itertools import islice
from sklearn import preprocessing  # Two samples, with 3 dimensions.
import analyze
import evaluate
import imp
import time
from sklearn.metrics import pairwise_distances
import scipy 

path = './data/'

embedding_names = ['bow2','bow5', 'deps']
embedding_names = ['bow2', 'deps']

simlexf = 'Simlex/SimLex-999.txt'

#
# embeddings: list of dicts holding the data for each embedding
# dict: {name - embedding name,
#       filename - embedding data filename
#       words - dict holding the words  and their vectors
#    
embeddings = []


for name in embedding_names:
    # words: dict, holds all words and their vectors for each embedding
    words = {}
    words_normalized = {}
    filename = name + '.words.bz2'
    embedding_df = pd.read_table(path + filename , sep=' ', header=None)
    for index, row in embedding_df.iterrows():
        words[row[0]] = row[1:]
    
    
    embedding_vectors = np.stack(list(words.values()))
    embedding_vocabulary = np.stack(list(words.keys()))
    embeddings.append({'name':name, 'filename': filename, 
                            'words': words, ' embedding_vectors':  embedding_vectors,
                       'embedding_vocabulary': embedding_vocabulary})
        



In [95]:
## Function to calculate MRR
def accuracy_and_MRR(X, Y, indices, exclude):
    # 
    # X: query vectors matrix (size: A x M)
    # Y: embedding vectors matrix (size: Nwords x Mfeatures)
    # indices: indices of the real (size: A)
    # exclude = list of indices to exclude for each analogy word
    # Compute distances between each codeword and each other codeword
    distance_matrix = scipy.spatial.distance.cdist(X, Y, metric='cosine')
    exclude = np.array(exclude)
    m = np.amax(distance_matrix)
    
    for i in [0,1,2]:
        #print(exclude[:,0].flatten())
        distance_matrix[np.arange(X.shape[0]), exclude[:,i].flatten()] =  m
    # Rank is the number of distances smaller than the correct distance, a
    # specified by the indices arg
    
    n_le = distance_matrix.T <= distance_matrix[np.arange(X.shape[0]), indices]
    answer_indexes = np.argsort(distance_matrix)[:,0]
    possible_answer_indexes = np.argsort(distance_matrix)[:,1:4]
    matches = np.array(indices - answer_indexes)
    acc = sum(matches == 0)/len(matches)
    n_lt = distance_matrix.T < distance_matrix[np.arange(X.shape[0]), indices]
    
    return (np.mean(1./n_le.sum(axis=0)),acc , answer_indexes, possible_answer_indexes)




In [34]:
from sklearn.metrics import accuracy_score

b=np.array([1,0,10,12,3])
sum(b==0)/len(b)

0.20000000000000001

### Load in the analogy dataset

In [71]:
# Analogy Task

analogy_file = 'questions-words.txt'
analogy_words = pd.read_table(path + analogy_file , sep=' ', header=1)
analogy_words.head()



Unnamed: 0,Athens,Greece,Baghdad,Iraq
0,Athens,Greece,Bangkok,Thailand
1,Athens,Greece,Beijing,China
2,Athens,Greece,Berlin,Germany
3,Athens,Greece,Bern,Switzerland
4,Athens,Greece,Cairo,Egypt


In [7]:
import scipy

## Compute analogy word and compare to answer

In [94]:
start =  time.time()

all_bstars = []

for embedding in embeddings:
    words = embedding['words']
    embedding_vectors = np.stack(list(words.values()))
    embedding_vocabulary = np.stack(list(words.keys()))

    # warning here I only used the 9000 first words!
    # also breaks on first one
    indexes_found = []
    answers = []
    
    actual_answer_indices = []
    to_exclude = []
    all_bstars = []
    for index, row in analogy_words.head(8400).iterrows():
            if row[0] in words.keys() and row[1] in words.keys() and row[2] in words.keys() and row[3] in words.keys():   
                #only do for a subset:
                indexes_found.append(index)
                a = words[row[0]]
                astar = words[row[1]]
                b = words[row[2]]
                vocab_keys = list(embedding_vocabulary)

                actual_bstar_index = vocab_keys.index(row[3])

                # Compute offset a* - a, add to b
                bstar = b.T + (astar.T - a.T)            
                cosine_sims = []
                cosine_sims_keys = []
                # Now that we computed  bstar find the closest vector to it
                # compute cosine similarity with all vectors
                # obtain the closest 

                #exclude = [row[0], row[1], row[2]]

              
                to_exclude.append([vocab_keys.index(row[0]),
                                  vocab_keys.index(row[1]),
                                  vocab_keys.index(row[2])])
                all_bstars.append(bstar)
                actual_answer_indices.append(actual_bstar_index)
    
    all_bstars_np = np.array(all_bstars)

    embeddingMRR, acc, answer_indexes, possible_answer_indexes = accuracy_and_MRR(all_bstars_np, 
                                            embedding_vectors, 
                                           actual_answer_indices, to_exclude)
    
    print(embeddingMRR,len(indexes_found ))
    embedding.update({'MRR':embeddingMRR, 
                      'analogy_indexes_found': indexes_found,
                      'answer_indexes': answer_indexes,
                      'actual_answer_indices': actual_answer_indices,
                      'possible_answer_indices': possible_answer_indexes,
                     'acc':acc})
                      
end = time.time()
print(end-start, "seconds")
            

0.869362745098 34
0.856582633053 34
61.18748211860657 seconds


### Look at the results qualitatively

In [116]:
for embedding in embeddings:
    words = embedding['words']
    print(embedding['name'])
    embedding_vocabulary = embedding['embedding_vocabulary']
    #answer_indexes = embedding['answer_indexes']
    analogy_indexes_found= embedding['analogy_indexes_found']
    actual_answer_indices= embedding['possible_answer_indices']
    
    #for i in range(len(analogy_indexes_found)): 
    for i in [20,33]:
        print('QUERY:', analogy_words.values[analogy_indexes_found[i]])
        print('ANSWERS:',embedding_vocabulary[actual_answer_indices[i]] ,' ')
        #print(analogy_words.ix[row.analogy_indexes_found,1:3])
            

bow2
QUERY: ['boy' 'girl' 'stepson' 'stepdaughter']
ANSWERS: ['step-daughter' 'niece' 'daughter-in-law']  
QUERY: ['brother' 'sister' 'man' 'woman']
ANSWERS: ['girl' 'person' 'tomboy']  
deps
QUERY: ['boy' 'girl' 'stepson' 'stepdaughter']
ANSWERS: ['sister-in-law' 'daughter-in-law' 'stepbrother']  
QUERY: ['brother' 'sister' 'man' 'woman']
ANSWERS: ['girl' 'waif' 'schoolgirl']  


In [105]:
print(analogy_indexes_found)

([8366, 8367, 8368, 8369, 8370, 8371, 8372, 8373, 8374, 8375, 8376, 8377, 8378, 8379, 8380, 8381, 8382, 8383, 8384, 8385, 8386, 8387, 8388, 8389, 8390, 8391, 8392, 8393, 8394, 8395, 8396, 8397, 8398, 8399],)


In [None]:
# Qualitative comparison look at the first 4 words

# Compute accuracy
