# Similarity Evaluation

## Load in the embeddings

In [2]:

import pandas as pd
import numpy as np
import pickle
from itertools import islice
from sklearn import preprocessing  # Two samples, with 3 dimensions.



path = './data/' # Ensure path exists with relevant data 

embedding_names = ['bow2','bow5']

embedding_names = ['bow2','deps'] # Due to memory issues the datasets are 
                                    # loaded 2 at a time

#
# embeddings: list of dicts holding the data for each embedding
# dict: {name - embedding name,
#       filename - embedding data filename
#       words - dict holding the words  and their vectors
#    
embeddings = []


for name in embedding_names:
    # words: dict, holds all words and their vectors for each embedding
    words = {}
    words_normalized = {}
    filename = name + '.words.bz2'
    embedding_df = pd.read_table(path + filename , sep=' ', header=None)
    for index, row in embedding_df.iterrows():
        words[row[0]] = row[1:]
    
    
    embedding_vectors = np.stack(list(words.values()))
    embedding_vocabulary = np.stack(list(words.keys()))
    embeddings.append({'name':name, 'filename': filename, 
                            'words': words, ' embedding_vectors':  embedding_vectors,
                       'embedding_vocabulary': embedding_vocabulary})
        




FileNotFoundError: [Errno 2] No such file or directory: './data/bow2.words.bz2'

In [5]:

import pandas as pd
import numpy as np
import pickle
from itertools import islice
from sklearn import preprocessing  # Two samples, with 3 dimensions.



path = './data/' # Ensure path exists with relevant data 

embedding_names = ['bow2','bow5']

embedding_names = ['embeddings_new','embeddings_new'] # Due to memory issues the datasets are 
                                    # loaded 2 at a time

#
# embeddings: list of dicts holding the data for each embedding
# dict: {name - embedding name,
#       filename - embedding data filename
#       words - dict holding the words  and their vectors
#    
embeddings = []


for name in embedding_names:
    # words: dict, holds all words and their vectors for each embedding
    words = {}
    words_normalized = {}
    filename = name + '.txt'
    embedding_df = pd.read_table(path + filename , sep=' ', header=1)
    for index, row in embedding_df.iterrows():
        words[row[0]] = row[1:]
    
    
    embedding_vectors = np.stack(list(words.values()))
    embedding_vocabulary = np.stack(list(words.keys()))
    embeddings.append({'name':name, 'filename': filename, 
                            'words': words, ' embedding_vectors':  embedding_vectors,
                       'embedding_vocabulary': embedding_vocabulary})
        




In [8]:
embeddings[0]['words']

{'Telecom': -0.137403    0.195948
 -0.396998     1.53496
 0.365169     0.358192
 0.183505     -1.56956
 0.138352     0.460271
 Name: 1398, dtype: object, 'reasons': -0.137403   -0.0764921
 -0.396998    -0.832885
 0.365169        1.2322
 0.183505     -0.557189
 0.138352        0.7982
 Name: 204, dtype: object, 'nuclear': -0.137403   -0.671467
 -0.396998    -0.26891
 0.365169     0.861873
 0.183505    -0.107214
 0.138352     0.736565
 Name: 1113, dtype: object, 'manner': -0.137403    -0.610678
 -0.396998     -1.47537
 0.365169      0.632648
 0.183505     -0.159905
 0.138352     0.0470464
 Name: 473, dtype: object, 'price': -0.137403     -0.42916
 -0.396998       1.0411
 0.365169     -0.802735
 0.183505      0.649124
 0.138352    -0.0918343
 Name: 1414, dtype: object, 'clause': -0.137403      1.91073
 -0.396998     -1.82051
 0.365169     0.0957393
 0.183505     -0.174326
 0.138352     0.0373683
 Name: 588, dtype: object, 'property': -0.137403    0.0344744
 -0.396998     0.229577
 0.365169

## Quantitative Comparison

## Compute correlation of similarity score against human judgements

### Evaluation against Simlex

In [17]:
#
# Compute the cosine similarity for each word pair in the simlex
#

import analyze
import imp
analyze = imp.reload(analyze)
    

cosine_similarities = []


path = './data/'
simlexf = 'SimLex-999/SimLex-999.txt'
simlex = pd.read_table(path + simlexf)
simlex_sim =  np.array(simlex["SimLex999"]).astype(np.float)

MEN_f = 'MEN/MEN_dataset_natural_form_full'
MEN = pd.read_table(path + MEN_f, header=None, sep=' ')
MEN_sim =  np.array(MEN.ix[:,2]).astype(np.float)

similarity_evaluation = pd.DataFrame({})



word_pairs = simlex

spearman = []
pearson = []

datasets = {'simlex': simlex_sim, 'MEN': MEN_sim}


def compute_correlations(similarities, predicted):
        
    df = pd.DataFrame({'control': similarities, 'predicted': predicted})
    return( {'pearson': df.corr(method = 'pearson')['control'][1], 
            'spearman': df.corr(method = 'spearman')['control'][1]})
    

# Calculate cosine similarity between each pair of words in the simlex

words_pairs = simlex 
dataset = 'simlex'
for embedding in embeddings:
    word_vectors = embedding['words']
    cosine_similarities = []
    cosine_similarities = analyze.evaluate_similarity(word_pairs, word_vectors)
    print(cosine_similarities)
    embedding.update({'cosine_similarities':cosine_similarities})

    cors = compute_correlations(simlex_sim, cosine_similarities)
    pearson.append(cors['pearson'])
    spearman.append(cors['spearman'])
    
similarity_evaluation = pd.DataFrame({'embedding': embedding_names, 
                                      'spearman': spearman,
                                     'pearson': pearson})


print(similarity_evaluation)



    

[nan, nan, 0.58324271521611415, nan, nan, nan, nan, 0.56280458381752796, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.7846060821996248, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, -0.16946834592851981, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.3819635955057023, nan, nan, nan, nan, nan, nan, nan, 0.40228343515113435, nan, nan, nan, nan, nan, nan, nan, nan, nan,

### Evaluation against MEN

In [None]:


spearman = []
pearson = []   

word_pairs = MEN

for embedding in embeddings:
    word_vectors = embedding['words']
    cosine_similarities = []
    cosine_similarities = analyze.evaluate_similarity(word_pairs, word_vectors)
    embedding.update({'cosine_similarities':cosine_similarities})

    cors = compute_correlations(MEN_sim, cosine_similarities)
    pearson.append(cors['pearson'])
    spearman.append(cors['spearman'])
    
        
similarity_evaluation = pd.DataFrame({'embedding': embedding_names, 
                                      'spearman': spearman,
                                     'pearson': pearson})


    
print(similarity_evaluation)


## Qualitative Comparison

## Look at the scores for example word-pairs for each embedding

In [None]:


# Qualitative Comparison
# Choose a few random  word pair indices to compare the score for the three embeddings
# Choose MEN or simlex dataset

dataset = simlex[['word1','word2', 'SimLex999']]   # MEN or simlex
dataset = MEN   # MEN or simlex

indices = [0,1,940]
for ind in indices:
    print('\n',dataset.ix[ind,0], dataset.ix[ind,1] )
    cos_sims = dataset.ix[:,2]
    normalized =(cos_sims - min(cos_sims))/(max(cos_sims)-min(cos_sims))
    print('human', normalized[ind])
    
    for embedding in embeddings:
        cos_sims = embedding['cosine_similarities']
        normalized =(cos_sims - min(cos_sims))/(max(cos_sims)-min(cos_sims))
        print("%s %.2f"%(embedding['name'],normalized[ind]))


In [None]:

def closest_words(embedding_vectors,embedding_vocabulary, word, k, exclude=[]):
    #
    # Obtain the k most similar words based on cosine similarity
    #
    D = pairwise_distances(embedding_vectors, word.reshape(1, -1), metric='cosine')
    possible_answers=[]
    kwords=k
    if exclude:
        kwords=k+len(exclude)
    for id in D.argsort(axis=0).flatten()[0:kwords]:
        
        # Exclude words in query
        if embedding_vocabulary[id] not in (exclude):
            possible_answers.append(embedding_vocabulary[id])
    return(possible_answers)




### Get the 5 most similar words for a query word

In [None]:
#
# Qualitative checks
#
#
# Look into 5 most similar words
# Choose a word
word_to_check='old'
word_to_check='smart'

for embedding in embeddings:
    words = embedding['words']
    most_similar = closest_words(embedding['embedding_vectors'],
                                 embedding['embedding_vocabulary'], 
                                 words[word_to_check], 6, exclude=[])
    
    print(most_similar)

