# Word Embeddings

In [None]:
from time import time
import gensim
from gensim.models import Word2Vec
from nltk.corpus import brown
from nltk import pos_tag
from nltk import word_tokenize
import json
from stanfordcorenlp import StanfordCoreNLP

In [None]:
# Locate the pruned word2vec sample in NLTK
from nltk.data import find
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

In [None]:
#start = time()
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False) # load in word2vec format
#print(time()-start)

In [None]:
# Get the model's vocabulary
vocab = set(model.vocab)

In [None]:
len(vocab)

In [None]:
# Note: words may have to be converted between lower and title case (and possibly upper case?) to see if they exist in the vocabulary
len(model['avocado']) # dimensions of vector representing word 'avocado' (and all other words)

In [None]:
model.most_similar('wagon', topn=10)

In [None]:
# most similar using the multiplicative combination objective (3CosMul) instead of the additive cosine objective used by most_similar
model.most_similar_cosmul('wagon', topn=15)

In [None]:
# select most "unlike" item
model.doesnt_match(['guitar', 'trumpet', 'violin', 'flute'])

In [None]:
# Capitalization and pluralization can lead to different most_similar results
for w in ('room', 'rooms', 'Room', 'Rooms'):
    print(w,'\t\n', model.most_similar(w),sep="")
    print('-'*50)

In [None]:
# most_similar takes collections of vectors to be added (positive) or subtracted (negative) 
# Can be used to specify an analogy: read as "Paris is to France as Madrid is to ?"
model.most_similar(positive=['France', 'Madrid'], negative=['Paris'], topn=10)

In [None]:
# cosine similarity between two words
model.similarity('bolt','bread')

In [None]:
# cosine similarity between two sets of words
model.n_similarity(['Elena','bought','the','hat','today'], ['He','rode','his','red','wagon'])

In [None]:
model.similar_by_word('roadster',topn=15)

In [None]:
model.similar_by_vector(model['roadster'],topn=15)

### Training a custom Word2Vec model

In [None]:
# How-to train custom Word2Vec model (this is trained with the Brown corpus)
custommodel = Word2Vec(brown.sents(), size=300, window=5, min_count=20)

In [None]:
# To suppress the deprecation warning in custom models or those loaded from saved files, prefix the method call with "wv."
# so to call the most_similar method, use
custommodel.wv.most_similar('wagon',topn=15)  # Note that the answers are much different from those of the pre-trained word2vec model

# Lemmatization

### Extracting lemmas with NLTK

In [None]:
# create an instance of the WordNet Lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# The lemmatizer should be given a POS tag to return the lemma (defaults to nouns otherwise, and only works for simple cases like plurals)
# adjective = 'a'; adverb = 'r'
print(lemmatizer.lemmatize("sleeping",pos='v'))

### Extracting lemmas with StanfordCoreNLP (note: this will also return the part of speech tag)

In [None]:
# Get an instance of StanfordCoreNLP by connecting to the server
nlp = StanfordCoreNLP('http://jupyterlab-nfs-corenlp', port=9000)

In [None]:
txt = "The day was sunny and warm.  I decided I'd go boating."  # text to be annotated
props = {'annotators': 'lemma','outputFormat':'json'} # set annotator to provide lemma and get return as json (otherwise it's a string)
res = nlp.annotate(txt, properties=props)   # apply the annotator: results are in json format
d = json.loads(res)                         # load the json object into a dictionary

In [None]:
# d is the dictionary returned from loading the json response
d

In [None]:
# d['sentences'] is a list of dictionaries, one per sentence - so the second sentence (at index 1) is
first_sent_d = d['sentences'][1]

In [None]:
# each dictionary has a key 'index' (value is integer sentence index) and a key 'tokens' (value is list of token dictionaries)
first_sent_d

In [None]:
# each token dictionary contains (among other things) the original text, its corresponding lemma, and the POS tag:
for tok_d in first_sent_d['tokens']:
    print(tok_d['originalText'], tok_d['lemma'], tok_d['pos'])

# WordNet

In [None]:
# import WordNet
from nltk.corpus import wordnet as wn

In [None]:
wn.synsets('dog')  # Returns the synsets for the word "dog"

In [None]:
wn.synsets('dog', pos=wn.VERB)  # Restrict returned synsets to verbs (also works with NOUN, ADJ, ADV)

In [None]:
# Select a specific synset
dog = wn.synset('dog.n.01')
dog.name() # Identifies the name of the synset of this variable

In [None]:
dog.lemmas() # Outputs full lemmas, including part of speech and sense #

In [None]:
dog.lemma_names() # Outputs lemma names

In [None]:
dog.definition() # The gloss (or definition)

In [None]:
dog.examples() # examples of use(s) in sentences

## Synonyms and Antonyms

In [None]:
def syns_ants(word):
    """Collect the WordNet synonyms and antonyms of ``word``.

    Every synset returned by ``wn.synsets(word)`` is visited (no
    part-of-speech filter is applied). Each lemma name found in those
    synsets counts as a synonym, and every antonym of every lemma counts
    as an antonym.

    Args:
        word: The word to look up in WordNet.

    Returns:
        A ``(synonyms, antonyms)`` tuple of sets of lemma-name strings.
        Both sets are empty when no synset is found for ``word``.
    """
    synonyms = set()
    antonyms = set()
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name())
            # Keep every antonym of the lemma, not only the first one listed.
            antonyms.update(antonym.name() for antonym in lemma.antonyms())
    return synonyms, antonyms

In [None]:
syns, ants = syns_ants("calm")

print("Synonyms:",syns,"\n\nAntonyms:",ants,sep="")

## Hypernyms and Hyponyms

In [None]:
dog.hypernyms() # Hypernyms for dog (e.g., a dog is-a ...)

In [None]:
dog.hyponyms() # Hyponyms for dog (e.g., each of these is-a dog)

In [None]:
dog.hypernym_paths() # All hypernym paths linking dog to the root of the hierarchy (each path is a list of synsets)

In [None]:
# Hypernym Tree
from pprint import pprint  # import pretty print


def hyp(s):
    """Return the hypernyms of synset ``s``; passed to ``tree`` as the relation to follow."""
    return s.hypernyms()


pprint(dog.tree(hyp))  # output hypernym tree


In [None]:
cat = wn.synset('cat.n.01') # get cat synset

In [None]:
cat

In [None]:
dog.common_hypernyms(cat) # what hypernyms are common to both dogs and cats

In [None]:
# Lowest Common Hypernym between two synsets
wn.synset('hairdresser.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'))

### Instance Hypernyms/Hyponyms

In [None]:
city=wn.synset('city.n.01') 
city.instance_hyponyms()[:10] # Lists the first 10 hyponyms of city (individual cities)

In [None]:
aca = wn.synset('acapulco.n.01')  # pick a city
aca.instance_hypernyms() # Lists all its hypernyms

## Holonyms and Meronyms

### Part Holonyms and Meronyms

In [None]:
car=wn.synset('car.n.01') # get synset for car
car.part_meronyms()[:10]  # Returns top 10 entities that compose a car (accelerator, airbag, etc)

In [None]:
accel = wn.synset('accelerator.n.01')  # get the synset for accelerator
accel.part_holonyms()                  # Returns holonyms for the synset: entities that the accelerator is a part-of

### Member Holonyms and Meronyms

In [None]:
forest = wn.synset('forest.n.01')  # get synset for forest
forest.member_meronyms()           # Returns entities that are member-of a forest

In [None]:
tree=wn.synset('tree.n.01')  # get the tree synset
tree.member_holonyms()       # Returns entities that the tree is a member-of

### Substance Holonyms and Meronyms

In [None]:
bread= wn.synset('bread.n.01')  # get synset for bread
bread.substance_meronyms()      # Returns entities that are substances of bread

In [None]:
flour = wn.synset('flour.n.01')  # get synset for flour
flour.substance_holonyms()       # Returns entities that flour is a substance-of

## Similarity Measures

### Path-Based Similarities

In [None]:
love = wn.synset('love.n.01')
romance = wn.synset('romance.n.01')
hate = wn.synset('hate.n.01')

**Path Similarity** (returns 0 to 1)

In [None]:
# similarity is not synonymy
print('Love - Romance',love.path_similarity(romance))
print('Love - Hate',love.path_similarity(hate))
print('Romance - Hate',romance.path_similarity(hate))

**Leacock-Chodorow Similarity**

In [None]:
print('Love - Romance',love.lch_similarity(romance))
print('Love - Hate',love.lch_similarity(hate))
print('Romance - Hate',romance.lch_similarity(hate))

**Wu-Palmer Similarity** (returns 0 to 1)

In [None]:
print('Love - Romance',love.wup_similarity(romance))
print('Love - Hate',love.wup_similarity(hate))
print('Romance - Hate',romance.wup_similarity(hate))

### Information-Content Based Similarities

In [None]:
from nltk.corpus import wordnet_ic as wic  # import that allows loading of information content
ic = wic.ic('ic-brown.dat')                # load information content from the Brown corpus into a variable

**Lin Similarity**

In [None]:
print('Love - Romance',love.lin_similarity(romance, ic))
print('Love - Hate',love.lin_similarity(hate, ic))
print('Romance - Hate',romance.lin_similarity(hate, ic))

**Resnik Similarity**

In [None]:
print('Love - Romance',love.res_similarity(romance, ic))
print('Love - Hate',love.res_similarity(hate, ic))
print('Romance - Hate',romance.res_similarity(hate, ic))

**Jiang-Conrath Similarity**

In [None]:
print('Love - Romance',love.jcn_similarity(romance, ic))
print('Love - Hate',love.jcn_similarity(hate, ic))
print('Romance - Hate',romance.jcn_similarity(hate, ic))