In [5]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import wordnet as wn
import nltk
import numpy as np
from scipy.spatial.distance import cosine
import operator

In [6]:
# Source embeddings in GloVe's text format.
glove_file="../data/glove.42B.300d.50K.txt"
# Destination for the same embeddings rewritten in word2vec text format.
original_file="../data/glove.42B.300d.50K.w2v.txt"
# Convert the file on disk. The call returns a pair that is unpacked here as
# (n, dimension) -- per the gensim docs, the number of vectors and their size.
n, dimension = glove2word2vec(glove_file, original_file)

In [7]:
# Load the converted (plain-text, non-binary) word2vec file into a
# KeyedVectors object that maps each word to its embedding.
wv = KeyedVectors.load_word2vec_format(original_file, binary=False)

Q1: Implement the Lesk algorithm using word vectors \([Basile et al. 2014](https://www.aclweb.org/anthology/C/C14/C14-1151.pdf)\), where we measure the similarity between a gloss g = $\{ g_1, \ldots, g_G \}$ and context c = $\{ c_1, \ldots, c_C \}$ as the cosine similarity between the sum of distributed representations:

$$
\cos \left(\sum_{i=1}^G g_i, \sum_{i=1}^C c_i  \right)
$$

* The gloss for a synset can be found in `synset.definition()`; be sure to tokenize it appropriately.  
* You can find the cosine *distance* (not similarity) between two vectors using the `scipy.spatial.distance.cosine(vector_one, vector_two)` function; the cosine similarity is then `1 - distance`.
* `wn.synsets(word, pos=part_of_speech)` gets you a list of the synsets for a word with a specific part of speech (e.g., "n" for noun)

In [17]:
def _sum_word_vectors(tokens, excluded_word):
    """Sum the embeddings of all in-vocabulary tokens except `excluded_word`.

    Returns a float64 array of length `wv.vector_size`; it is all zeros when
    no token contributes.
    """
    total = np.zeros(wv.vector_size)
    for token in tokens:
        if token != excluded_word and token in wv:
            total += wv[token]
    return total


def lesk(word, sentence, part_of_speech, lowercase=False):
    """Rank the WordNet senses of `word` with the embedding-based Lesk algorithm.

    The context (the tokens of `sentence`) and each candidate gloss (the
    tokens of `synset.definition()`) are each represented by the sum of their
    word vectors; a sense is scored by the cosine similarity between its gloss
    vector and the context vector. Results are printed, best first, one sense
    per line as ``score<TAB>synset<TAB>definition``.

    Args:
        word: The ambiguous target word. It is excluded from both the context
            and the gloss sums so it cannot inflate every score equally.
        sentence: The sentence in which `word` occurs.
        part_of_speech: WordNet POS tag used to restrict the candidate
            synsets (e.g. "n" for noun).
        lowercase: If True, lowercase all tokens (and the target word) before
            the vocabulary lookup. Useful when the embedding vocabulary is
            lowercase-only, where capitalised tokens such as "I" would
            otherwise be dropped. Defaults to False, the original behaviour.

    Returns:
        None. The ranking is written to stdout.
    """
    def tokenize(text):
        tokens = nltk.word_tokenize(text)
        return [token.lower() for token in tokens] if lowercase else tokens

    target = word.lower() if lowercase else word

    context_vector = _sum_word_vectors(tokenize(sentence), target)
    # An all-zero vector has no direction: scipy's cosine() would produce NaN,
    # and NaN scores make the ranking below arbitrary.
    context_is_empty = not context_vector.any()

    scores = {}
    for synset in wn.synsets(word, pos=part_of_speech):
        gloss_vector = _sum_word_vectors(tokenize(synset.definition()), target)
        if context_is_empty or not gloss_vector.any():
            # No usable evidence for this comparison: treat as "no similarity".
            scores[synset] = 0.0
        else:
            # scipy returns cosine *distance*; similarity is its complement.
            scores[synset] = 1 - cosine(context_vector, gloss_vector)

    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    for synset, score in ranked:
        print("%.3f\t%s\t%s" % (score, synset, synset.definition()))


Does your implementation distinguish between these two senses of "bank"?

In [18]:
# Financial context: the money-related senses of "bank" should rank highest.
lesk("bank", "I deposited my money into my savings account at the bank", "n")

0.891	Synset('savings_bank.n.02')	a container (usually with a slot in the top) for keeping money at home
0.889	Synset('depository_financial_institution.n.01')	a financial institution that accepts deposits and channels the money into lending activities
0.878	Synset('bank.n.06')	the funds held by a gambling house or the dealer in some gambling games
0.875	Synset('bank.n.07')	a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
0.859	Synset('bank.n.09')	a building in which the business of banking transacted
0.859	Synset('bank.n.05')	a supply or stock held in reserve for future use (especially in emergencies)
0.829	Synset('bank.n.04')	an arrangement of similar objects in a row or in tiers
0.828	Synset('bank.n.01')	sloping land (especially the slope beside a body of water)
0.812	Synset('bank.n.10')	a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
0.807	Synset('bank.n.03')	a long ridge or pile

In [16]:
# River context: the "sloping land" sense (bank.n.01) should rank first.
lesk("bank", "I ran along the river bank", "n")

0.857	Synset('bank.n.01')	sloping land (especially the slope beside a body of water)
0.845	Synset('bank.n.07')	a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
0.821	Synset('bank.n.06')	the funds held by a gambling house or the dealer in some gambling games
0.808	Synset('bank.n.09')	a building in which the business of banking transacted
0.808	Synset('savings_bank.n.02')	a container (usually with a slot in the top) for keeping money at home
0.788	Synset('depository_financial_institution.n.01')	a financial institution that accepts deposits and channels the money into lending activities
0.786	Synset('bank.n.03')	a long ridge or pile
0.783	Synset('bank.n.10')	a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
0.781	Synset('bank.n.05')	a supply or stock held in reserve for future use (especially in emergencies)
0.767	Synset('bank.n.04')	an arrangement of similar objec