In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import wordnet as wn
import nltk
import numpy as np
from scipy.spatial.distance import cosine
import operator

In [None]:
# Input: GloVe vectors in GloVe's own text format.
glove_file = "../data/glove.6B.100d.100K.txt"
# Output: the same vectors rewritten in word2vec text format.
original_file = "../data/glove.6B.100d.100K.w2v.txt"

# glove2word2vec writes `original_file` and returns a 2-tuple, unpacked here
# as the vector count `n` and the embedding `dimension`.
n, dimension = glove2word2vec(
    glove_file,
    original_file,
)

In [None]:
# Load the converted embeddings (text format, hence binary=False) into a
# KeyedVectors lookup that is used as a module-level name by later cells.
wv = KeyedVectors.load_word2vec_format(
    original_file,
    binary=False,
)

Q1: Implement the Lesk algorithm using word vectors ([Basile et al. 2014](https://www.aclweb.org/anthology/C/C14/C14-1151.pdf)), where we measure the similarity between a gloss $g = \{ g_1, \ldots, g_G \}$ and a context $c = \{ c_1, \ldots, c_C \}$ as the cosine similarity between the sums of their words' distributed representations:

$$
\cos \left(\sum_{i=1}^G g_i, \sum_{i=1}^C c_i  \right)
$$

* The gloss for a synset can be found in `synset.definition()`; be sure to tokenize it appropriately.  
* You can find the cosine *distance* (not similarity) between two vectors using the `scipy.spatial.distance.cosine(vector_one, vector_two)` function; the cosine similarity is 1 minus this distance.
* `wn.synsets(word, pos=part_of_speech)` gets you a list of the synsets for a word with a specific part of speech (e.g., "n" for noun)

In [None]:
def _sum_word_vectors(tokens, target_word):
    """Sum the embeddings of `tokens`, skipping the target word and OOV tokens.

    Args:
        tokens: iterable of token strings.
        target_word: the word being disambiguated; occurrences of it
            (compared case-insensitively) are left out of the sum.

    Returns:
        A 1-D float array of length `wv.vector_size`; all zeros when no
        token contributed.
    """
    # Size the accumulator from the embeddings themselves rather than from a
    # separate global set by another cell.
    total = np.zeros(wv.vector_size)
    target = target_word.lower()

    for token in tokens:
        if token.lower() == target:
            continue
        if token in wv:
            total += wv[token]
        else:
            # Fall back to the lowercase form so capitalised tokens are not
            # silently dropped when the vocabulary only holds lowercase entries.
            lowered = token.lower()
            if lowered in wv:
                total += wv[lowered]
    return total


def lesk(word, sentence, part_of_speech):
    """Rank the WordNet senses of `word` by embedding-based Lesk similarity.

    The context vector is the sum of the embeddings of the words in
    `sentence`; each sense's gloss vector is the sum of the embeddings of the
    words in its definition. The target word itself is excluded from both.
    Senses are scored by cosine similarity between the two vectors and printed
    from most to least similar as "<score>\\t<synset>\\t<definition>".

    Args:
        word: the word to disambiguate.
        sentence: the sentence in which `word` occurs.
        part_of_speech: WordNet part-of-speech tag passed to `wn.synsets`
            (e.g. "n" for noun).

    Returns:
        None; the ranking is printed.
    """
    context_vector = _sum_word_vectors(nltk.word_tokenize(sentence), word)
    context_is_empty = not context_vector.any()

    vals = {}
    for synset in wn.synsets(word, pos=part_of_speech):
        gloss_tokens = nltk.word_tokenize(synset.definition())
        gloss_vector = _sum_word_vectors(gloss_tokens, word)

        if context_is_empty or not gloss_vector.any():
            # The cosine is undefined (0/0) for an all-zero vector; scoring it
            # 0.0 ("no evidence") keeps NaN out of the ranking below.
            vals[synset] = 0.0
        else:
            # scipy returns cosine *distance*; similarity is 1 - distance.
            vals[synset] = 1 - cosine(context_vector, gloss_vector)

    sorted_x = sorted(vals.items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_x:
        print("%.3f\t%s\t%s" % (v, k, k.definition()))


Execute the following two cells to check whether your implementation distinguishes between two senses of "bank": the financial institution and the side of a river.

In [None]:
# Context pointing at the financial-institution sense of "bank".
lesk(
    word="bank",
    sentence="I deposited my money into my savings account at the bank",
    part_of_speech="n",
)

In [None]:
# Context pointing at the riverside sense of "bank".
lesk(
    word="bank",
    sentence="I ran along the river bank",
    part_of_speech="n",
)