# Analogy Evaluation

## Load in the embeddings

In [1]:

import pandas as pd
import numpy as np
import pickle
from itertools import islice
from sklearn import preprocessing  # Two samples, with 3 dimensions.
import analyze
import evaluate
import imp
import time
from sklearn.metrics import pairwise_distances
import scipy 

path = './data/'

embedding_names = ['bow2','bow5', 'deps']
#embedding_names = ['bow2', 'bow5']

#
# embeddings: list of dicts holding the data for each embedding
# dict: {name - embedding name,
#       filename - embedding data filename
#       words - dict holding the words  and their vectors
#    
embeddings = []


for name in embedding_names:
    # words: dict, holds all words and their vectors for each embedding
    words = {}
    words_normalized = {}
    filename = name + '.words.bz2'
    embedding_df = pd.read_table(path + filename , sep=' ', header=None)
    for index, row in embedding_df.iterrows():
        words[row[0]] = row[1:]
    
    
    embedding_vectors = np.stack(list(words.values()))
    embedding_vocabulary = np.stack(list(words.keys()))
    embeddings.append({'name':name, 'filename': filename, 
                            'words': words, ' embedding_vectors':  embedding_vectors,
                       'embedding_vocabulary': embedding_vocabulary})
        



In [2]:
for embedding in embeddings:
    print(len(embedding['embedding_vocabulary']))

183870
183870


In [3]:
## Function to calculate MRR
def accuracy_and_MRR(X, Y, indices, exclude):
    # 
    # X: query vectors matrix (size: A x M)
    # Y: embedding vectors matrix (size: Nwords x Mfeatures)
    # indices: indices of the real (size: A)
    # exclude = list of indices to exclude for each analogy word
    # Compute distances between each codeword and each other codeword
    distance_matrix = scipy.spatial.distance.cdist(X, Y, metric='cosine')
    exclude = np.array(exclude)
    m = np.amax(distance_matrix)
    
    for i in [0,1,2]:
        #print(exclude[:,0].flatten())
        distance_matrix[np.arange(X.shape[0]), exclude[:,i].flatten()] =  m
    # Rank is the number of distances smaller than the correct distance, a
    # specified by the indices arg
    
    n_le = distance_matrix.T <= distance_matrix[np.arange(X.shape[0]), indices]
    answer_indexes = np.argsort(distance_matrix)[:,0]
    possible_answer_indexes = np.argsort(distance_matrix)[:,1:4]
    matches = np.array(indices - answer_indexes)
    acc = sum(matches == 0)/len(matches)
    n_lt = distance_matrix.T < distance_matrix[np.arange(X.shape[0]), indices]
    
    return (np.mean(1./n_le.sum(axis=0)),acc , answer_indexes, possible_answer_indexes)




### Load in the analogy dataset

In [6]:
# Analogy Task

analogy_file = 'questions-words.txt'
analogy_words_all = pd.read_table(path + analogy_file , sep=' ', header=1)
analogy_words_all.head()



Unnamed: 0,Athens,Greece,Baghdad,Iraq
0,Athens,Greece,Bangkok,Thailand
1,Athens,Greece,Beijing,China
2,Athens,Greece,Berlin,Germany
3,Athens,Greece,Bern,Switzerland
4,Athens,Greece,Cairo,Egypt


In [5]:
import scipy

## Compute analogy word and compare to answer

In [7]:
start =  time.time()

all_bstars = []

analogy_1 = analogy_words_all.ix[:10000,]
analogy_2 = analogy_words_all.ix[10000+1:16000,]
analogy_3 = analogy_words_all.ix[16000+1:20000,]
analogy_4 = analogy_words_all.ix[20000+1:,]


analogies = [analogy_4,analogy_2, analogy_3, analogy_1]
for embedding in embeddings:
    print(embedding['name'])
    words = embedding['words'] 
    embedding_vectors = np.stack(list(words.values()))
    embedding_vocabulary = np.stack(list(words.keys()))

    batch_accs = []
    batch_MRRs = []
    batch_lengths = []
    
    #batch because of memory constraints
    for analogy_words in analogies:
        start =  time.time()

        print('Calculating batch, size:', len(analogy_words))

        indexes_found = [] # indexes found in the analogy batch
        actual_answer_indices = [] # embedding index for the answers of 
                                    # the analogy questions
        to_exclude = [] # exclude words in query for each question          
        all_bstars = [] # holds all predicted vectors
        #Do the first 20000 words, and then the rest
        
        
        for index, row in analogy_words.iterrows():
                if row[0] in words.keys() and row[1] in words.keys() and row[2] in words.keys() and row[3] in words.keys():   
                    #only do for a subset:
                    indexes_found.append(index)
                    a = words[row[0]]
                    astar = words[row[1]]
                    b = words[row[2]]
                    vocab_keys = list(embedding_vocabulary)

                    actual_bstar_index = vocab_keys.index(row[3])

                    # Compute offset a* - a, add to b
                    bstar = b.T + (astar.T - a.T)            
                    cosine_sims = []
                    cosine_sims_keys = []

                    # Now that we computed  bstar find the closest vector to it
                    # compute cosine similarity with all vectors
                    # obtain the closest               
                    to_exclude.append([vocab_keys.index(row[0]),
                                      vocab_keys.index(row[1]),
                                      vocab_keys.index(row[2])])
                    all_bstars.append(bstar)
                    actual_answer_indices.append(actual_bstar_index)

        # check if it found something
        if len(indexes_found)!=0:
            all_bstars_np = np.array(all_bstars)

            embeddingMRR, acc, answer_indexes, possible_answer_indexes = accuracy_and_MRR(all_bstars_np, 
                                                    embedding_vectors, 
                                                   actual_answer_indices, to_exclude)

            print(embeddingMRR, acc, len(indexes_found ))
            batch_accs.append(acc)
            batch_MRRs.append(embeddingMRR)
            batch_lengths.append(len(indexes_found))
        
        
        end = time.time()
        print(end-start, "seconds")

        
    embedding.update({'MRR':batch_MRRs, 
                          'analogy_indexes_found': indexes_found,
                          'answer_indexes': answer_indexes,
                          'actual_answer_indices': actual_answer_indices,
                          'possible_answer_indices': possible_answer_indexes,
                          'batch_lengths':batch_lengths,
                         'acc':batch_accs})

end = time.time()
print(end-start, "seconds")
            

bow2
Calculating batch, size: 0
0.0003077983856201172 seconds
Calculating batch, size: 10001
0.444614031333 0.371708511941 1633
205.90572595596313 seconds
Calculating batch, size: 6000
0.744238190823 0.663048498845 4330
576.236487865448 seconds
Calculating batch, size: 3555
0.763610932179 0.686462144666 3553
446.7125039100647 seconds
bow5
Calculating batch, size: 0
0.08013510704040527 seconds
Calculating batch, size: 10001
0.466912463141 0.380281690141 1633
196.434720993042 seconds
Calculating batch, size: 6000
0.734695955207 0.641801385681 4330
589.0359320640564 seconds
Calculating batch, size: 3555
0.738747495437 0.640866873065 3553
521.11399102211 seconds
521.1686689853668 seconds


In [12]:
# Calculates wighted average for acc and mrr for each of the batches
for embedding in embeddings:
    
    amount = np.array(embedding['batch_lengths'])
    rate = embedding['acc']
    weights = amount/sum(amount)
    acc = np.average(rate, weights=weights)
    print(embedding['name'],'acc',acc)
    
    
for embedding in embeddings:
    
    amount = np.array(embedding['batch_lengths'])
    rate = embedding['MRR']
    weights = amount/sum(amount)
    mrr = np.average(rate, weights=weights)
    print(embedding['name'],'mrr',mrr)
   
    

bow2 acc 0.621794871795
bow5 acc 0.596574190836
bow2 mrr 0.700054195194
bow5 mrr 0.690255505427
