# Word Embeddings

In [8]:
from time import time
import gensim
from gensim.models import Word2Vec
from nltk.corpus import brown
from nltk import pos_tag
from nltk import word_tokenize
import json
from stanfordcorenlp import StanfordCoreNLP

In [9]:
# Locate the pruned word2vec sample in NLTK
from nltk.data import find
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

In [10]:
#start = time()
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False) # load in word2vec format
#print(time()-start)

In [11]:
# Get the model's vocabulary
vocab = set(model.vocab)

In [12]:
len(vocab)

43981

In [13]:
# Note: words may have to be converted between lower and title case (and possibly upper case?) to see if they exist in the vocabulary
len(model['avocado']) # dimensions of vector representing word 'avocado' (and all other words)

300

In [14]:
model.most_similar('wagon', topn=10)

[('wagons', 0.6923391819000244),
 ('buckboard', 0.5725744366645813),
 ('tractor', 0.5646649599075317),
 ('truck', 0.5361799597740173),
 ('boxcar', 0.5161231756210327),
 ('Wagon', 0.5161173939704895),
 ('stagecoach', 0.5105308890342712),
 ('van', 0.5091939568519592),
 ('boxcars', 0.508715808391571),
 ('mule', 0.49818965792655945)]

In [27]:
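# most similar using the multiplicative combination objective (3CosMul); with a single positive word the ranking matches most_similar, but scores are rescaled to (1 + cosine) / 2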
model.most_similar_cosmul('wagon', topn=15)

[('wagons', 0.8461687564849854),
 ('buckboard', 0.7862864136695862),
 ('tractor', 0.7823317050933838),
 ('truck', 0.7680892944335938),
 ('boxcar', 0.758060872554779),
 ('Wagon', 0.7580579519271851),
 ('stagecoach', 0.7552647590637207),
 ('van', 0.7545962333679199),
 ('boxcars', 0.7543572187423706),
 ('mule', 0.7490941286087036),
 ('oxcart', 0.7466381192207336),
 ('chariot', 0.7398384809494019),
 ('buggy', 0.7397239208221436),
 ('locomotive', 0.7383973002433777),
 ('cart', 0.7375895977020264)]

In [28]:
# select most "unlike" item
model.doesnt_match(['guitar', 'trumpet', 'violin', 'flute'])

'trumpet'

In [29]:
# Case and plural variants are looked up as distinct words, so each one
# gets its own list of nearest neighbours
for w in ('room', 'rooms', 'Room', 'Rooms'):
    # word, tab + newline, neighbour list, then a 50-dash separator line
    print(f"{w}\t\n{model.most_similar(w)}", '-' * 50, sep="\n")

room	
[('rooms', 0.7605786323547363), ('upstairs', 0.6226500868797302), ('hallway', 0.6086891889572144), ('downstairs', 0.593078076839447), ('bathroom', 0.5513426065444946), ('kitchenette', 0.5502405166625977), ('basement', 0.5274615287780762), ('lounge', 0.5249817967414856), ('foyer', 0.5173395276069641), ('hallways', 0.5161101222038269)]
--------------------------------------------------
rooms	
[('room', 0.7605786323547363), ('Rooms', 0.634871780872345), ('bedrooms', 0.6270223259925842), ('bathrooms', 0.6007975339889526), ('suites', 0.5960015058517456), ('beds', 0.5761682987213135), ('lounges', 0.5643664598464966), ('kitchenette', 0.561974048614502), ('lounge', 0.5320551991462708), ('floors', 0.529586672782898)]
--------------------------------------------------
Room	
[('Rooms', 0.6796290278434753), ('Ballroom', 0.6294605731964111), ('Lounge', 0.5510711669921875), ('Auditorium', 0.5282275080680847), ('Building', 0.49195849895477295), ('Cafeteria', 0.49163320660591125), ('room', 0.485

In [30]:
# most_similar takes collections of vectors to be added (positive) or subtracted (negative) 
# Can be used to specify an analogy: read as "Paris is to France as Madrid is to ?"
model.most_similar(positive=['France', 'Madrid'], negative=['Paris'], topn=10)

[('Spain', 0.7776165008544922),
 ('Portugal', 0.6343989968299866),
 ('Argentina', 0.5653746724128723),
 ('Spanish', 0.5568934679031372),
 ('Porto', 0.5294696092605591),
 ('Italy', 0.5196953415870667),
 ('Brazil', 0.5041937828063965),
 ('Chile', 0.5033072233200073),
 ('Portuguese', 0.5029925107955933),
 ('Uruguay', 0.5019488334655762)]

In [31]:
# cosine similarity between two words
model.similarity('bolt','bread')

0.09666148

In [32]:
# cosine similarity between two sets of words
model.n_similarity(['Elena','bought','the','hat','today'], ['He','rode','his','red','wagon'])

0.42493808

In [33]:
model.similar_by_word('roadster',topn=15)

[('coupe', 0.8324612379074097),
 ('sedan', 0.6679246425628662),
 ('Bugatti', 0.6121565699577332),
 ('coachwork', 0.6103246212005615),
 ('sportiest', 0.60999596118927),
 ('sedans', 0.5992715954780579),
 ('Mustang', 0.5988028645515442),
 ('runabout', 0.5713080167770386),
 ('styling', 0.5562582612037659),
 ('hotrod', 0.5548204183578491),
 ('Jaguar', 0.5505430102348328),
 ('Giulietta', 0.5487755537033081),
 ('Volkswagens', 0.5426846742630005),
 ('streamliner', 0.5373216867446899),
 ('Chevy', 0.5369911789894104)]

In [34]:
model.similar_by_vector(model['roadster'],topn=15)

[('roadster', 1.0),
 ('coupe', 0.8324612379074097),
 ('sedan', 0.6679246425628662),
 ('Bugatti', 0.6121566295623779),
 ('coachwork', 0.6103246212005615),
 ('sportiest', 0.60999596118927),
 ('sedans', 0.5992715954780579),
 ('Mustang', 0.5988028645515442),
 ('runabout', 0.5713080167770386),
 ('styling', 0.5562582612037659),
 ('hotrod', 0.5548204183578491),
 ('Jaguar', 0.5505430102348328),
 ('Giulietta', 0.5487755537033081),
 ('Volkswagens', 0.5426846742630005),
 ('streamliner', 0.5373216867446899)]

### Training a custom Word2Vec model

In [35]:
# How to train a custom Word2Vec model (this one is trained on the Brown corpus)
custommodel = Word2Vec(brown.sents(), size=300, window=5, min_count=20)

In [36]:
# To suppress the deprecation warning in custom models or those loaded from saved files, prefix the method call with "wv."
# so to call the most_similar method, use
custommodel.wv.most_similar('wagon',topn=15)  # Note that the answers are much different from those of the pre-trained word2vec model

[('shoulders', 0.9726659059524536),
 ('seat', 0.9652628302574158),
 ('flying', 0.9648684859275818),
 ('holding', 0.9638241529464722),
 ('fingers', 0.9613649845123291),
 ('wind', 0.9608621597290039),
 ('coat', 0.9593055248260498),
 ('knee', 0.9570326805114746),
 ('rifle', 0.956610918045044),
 ('knocked', 0.9565622806549072),
 ('beneath', 0.9564557075500488),
 ('shop', 0.9559034705162048),
 ('swung', 0.9555985927581787),
 ('journey', 0.9555788040161133),
 ('breath', 0.9554819464683533)]

# Lemmatization

### Extracting lemmas with NLTK

In [37]:
# create an instance of the WordNet Lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [39]:
# The lemmatizer should be given a POS tag to return the lemma (defaults to nouns otherwise, and only works for simple cases like plurals)
# adjective = 'a'; adverb = 'r'
print(lemmatizer.lemmatize("sleeping",pos='v'))

sleep


### Extracting lemmas with StanfordCoreNLP (note: this will also return the part of speech tag)

In [None]:
# Get an instance of StanfordCoreNLP by connecting to the server
#nlp = StanfordCoreNLP('http://jupyterlab-nfs-corenlp', port=9000)
nlp = StanfordCoreNLP('http://localhost', port=9000)

In [None]:
txt = "The day was sunny and warm.  I decided I'd go boating."  # text to be annotated
props = {'annotators': 'lemma','outputFormat':'json'} # set annotator to provide lemma and get return as json (otherwise it's a string)
res = nlp.annotate(txt, properties=props)   # apply the annotator: results are in json format
d = json.loads(res)                         # load the json object into a dictionary

In [None]:
# d is the dictionary returned from loading the json response
d

In [None]:
# d['sentences'] is a list of dictionaries, one per sentence - index 1 selects the second sentence (note: the variable below is named first_sent_d even though it holds the second sentence)
first_sent_d = d['sentences'][1]

In [None]:
# each dictionary has a key 'index' (value is integer sentence index) and a key 'tokens' (value is list of token dictionaries)
first_sent_d

In [None]:
# For every token dictionary of the selected sentence, show three of its
# fields side by side: the original text, the lemma, and the POS tag
for tok_d in first_sent_d['tokens']:
    print(*(tok_d[field] for field in ('originalText', 'lemma', 'pos')))

# WordNet

In [41]:
# import WordNet
from nltk.corpus import wordnet as wn

In [42]:
wn.synsets('dog')  # Returns the synsets for the word "dog"

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [43]:
wn.synsets('dog', pos='n')  # Restrict returned synsets to nouns (also works with wn.VERB, wn.ADJ, wn.ADV)

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01')]

In [44]:
# Select a specific synset
dog = wn.synset('dog.n.01')
dog.name() # Returns the name (identifier) of this synset

'dog.n.01'

In [45]:
dog.lemmas() # Outputs full lemmas, including part of speech and sense #

[Lemma('dog.n.01.dog'),
 Lemma('dog.n.01.domestic_dog'),
 Lemma('dog.n.01.Canis_familiaris')]

In [46]:
dog.lemma_names() # Outputs lemma names

['dog', 'domestic_dog', 'Canis_familiaris']

In [47]:
dog.definition() # The gloss (or definition)

'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds'

In [48]:
dog.examples() # examples of use(s) in sentences

['the dog barked all night']

## Synonyms and Antonyms

In [49]:
def syns_ants(word):
    """Collect WordNet synonyms and antonyms for ``word``.

    Every synset returned by ``wn.synsets(word)`` is visited, across all parts
    of speech. The name of each lemma in those synsets counts as a synonym,
    and the name of every antonym attached to any of those lemmas counts as
    an antonym.

    Parameters
    ----------
    word : str
        The word to look up in WordNet.

    Returns
    -------
    tuple[set[str], set[str]]
        ``(synonyms, antonyms)`` as sets of lemma names. Multi-word names keep
        WordNet's underscore form (e.g. ``'calm_down'``). Both sets are empty
        when the word has no synsets.
    """
    synonyms = set()
    antonyms = set()
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name())
            # A lemma may list more than one antonym; keep all of them
            # rather than only the first.
            antonyms.update(antonym.name() for antonym in lemma.antonyms())
    return synonyms, antonyms

In [50]:
# Look up "calm" and show both result sets
syns, ants = syns_ants("calm")

print(f"Synonyms:{syns}\n\nAntonyms:{ants}")

Synonyms:{'still', 'tranquilize', 'calm', 'calm_down', 'becalm', 'equanimity', 'simmer_down', 'lull', 'tranquil', 'tranquillise', 'settle_down', 'sedate', 'chill_out', 'cool_it', 'serene', 'tranquillize', 'cool_off', 'steady', 'quieten', 'unagitated', 'composure', 'calm_air', 'quiet', 'calmness'}

Antonyms:{'stormy', 'discomposure', 'stimulate', 'agitate'}


## Hypernyms and Hyponyms

In [51]:
dog.hypernyms() # Hypernyms for dog (e.g., a dog is-a ...)

[Synset('canine.n.02'), Synset('domestic_animal.n.01')]

In [52]:
dog.hyponyms() # Hyponyms for dog (e.g., each of these is-a dog)

[Synset('basenji.n.01'),
 Synset('corgi.n.01'),
 Synset('cur.n.01'),
 Synset('dalmatian.n.02'),
 Synset('great_pyrenees.n.01'),
 Synset('griffon.n.02'),
 Synset('hunting_dog.n.01'),
 Synset('lapdog.n.01'),
 Synset('leonberg.n.01'),
 Synset('mexican_hairless.n.01'),
 Synset('newfoundland.n.01'),
 Synset('pooch.n.01'),
 Synset('poodle.n.01'),
 Synset('pug.n.01'),
 Synset('puppy.n.01'),
 Synset('spitz.n.01'),
 Synset('toy_dog.n.01'),
 Synset('working_dog.n.01')]

In [53]:
dog.hypernym_paths() # All paths from the root synset (entity.n.01) down to dog.n.01, one list per chain of hypernyms

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('living_thing.n.01'),
  Synset('organism.n.01'),
  Synset('animal.n.01'),
  Synset('chordate.n.01'),
  Synset('vertebrate.n.01'),
  Synset('mammal.n.01'),
  Synset('placental.n.01'),
  Synset('carnivore.n.01'),
  Synset('canine.n.02'),
  Synset('dog.n.01')],
 [Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('living_thing.n.01'),
  Synset('organism.n.01'),
  Synset('animal.n.01'),
  Synset('domestic_animal.n.01'),
  Synset('dog.n.01')]]

In [54]:
# Hypernym Tree
from pprint import pprint  # import pretty print


def hyp(s):
    """Return the direct hypernyms of synset ``s``; this is the relation ``tree`` follows."""
    return s.hypernyms()


# Output the hypernym tree: nested lists of the form [synset, [hypernym, ...], ...]
pprint(dog.tree(hyp))


[Synset('dog.n.01'),
 [Synset('canine.n.02'),
  [Synset('carnivore.n.01'),
   [Synset('placental.n.01'),
    [Synset('mammal.n.01'),
     [Synset('vertebrate.n.01'),
      [Synset('chordate.n.01'),
       [Synset('animal.n.01'),
        [Synset('organism.n.01'),
         [Synset('living_thing.n.01'),
          [Synset('whole.n.02'),
           [Synset('object.n.01'),
            [Synset('physical_entity.n.01'),
             [Synset('entity.n.01')]]]]]]]]]]]]],
 [Synset('domestic_animal.n.01'),
  [Synset('animal.n.01'),
   [Synset('organism.n.01'),
    [Synset('living_thing.n.01'),
     [Synset('whole.n.02'),
      [Synset('object.n.01'),
       [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]


In [55]:
cat = wn.synset('cat.n.01') # get cat synset

In [56]:
cat

Synset('cat.n.01')

In [57]:
dog.common_hypernyms(cat) # what hypernyms are common to both dogs and cats

[Synset('chordate.n.01'),
 Synset('physical_entity.n.01'),
 Synset('whole.n.02'),
 Synset('animal.n.01'),
 Synset('mammal.n.01'),
 Synset('organism.n.01'),
 Synset('carnivore.n.01'),
 Synset('living_thing.n.01'),
 Synset('placental.n.01'),
 Synset('entity.n.01'),
 Synset('vertebrate.n.01'),
 Synset('object.n.01')]

In [58]:
# Lowest Common Hypernym between two synsets
wn.synset('hairdresser.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'))

[Synset('skilled_worker.n.01')]

### Instance Hypernyms/Hyponyms

In [59]:
city=wn.synset('city.n.01') 
city.instance_hyponyms()[:10] # Lists the first 10 instance hyponyms of city (individual cities)

[Synset('aachen.n.01'),
 Synset('aalborg.n.01'),
 Synset('abadan.n.01'),
 Synset('aberdeen.n.04'),
 Synset('abidjan.n.01'),
 Synset('abilene.n.01'),
 Synset('acapulco.n.01'),
 Synset('adana.n.01'),
 Synset('aden.n.01'),
 Synset('agra.n.01')]

In [60]:
aca = wn.synset('acapulco.n.01')  # pick a city
aca.instance_hypernyms() # Lists all its instance hypernyms (the classes it is an instance of)

[Synset('city.n.01'), Synset('port.n.01')]

## Holonyms and Meronyms

### Part Holonyms and Meronyms

In [61]:
car=wn.synset('car.n.01') # get synset for car
car.part_meronyms()[:10]  # Returns the first 10 parts that compose a car (accelerator, air bag, etc.)

[Synset('accelerator.n.01'),
 Synset('air_bag.n.01'),
 Synset('auto_accessory.n.01'),
 Synset('automobile_engine.n.01'),
 Synset('automobile_horn.n.01'),
 Synset('buffer.n.06'),
 Synset('bumper.n.02'),
 Synset('car_door.n.01'),
 Synset('car_mirror.n.01'),
 Synset('car_seat.n.01')]

In [62]:
accel = wn.synset('accelerator.n.01')  # get the synset for accelerator
accel.part_holonyms()                  # Returns holonyms for the synset: entities that the accelerator is a part-of

[Synset('airplane.n.01'), Synset('car.n.01')]

### Member Holonyms and Meronyms

In [63]:
forest = wn.synset('forest.n.01')  # get synset for forest
forest.member_meronyms()           # Returns entities that are member-of a forest

[Synset('tree.n.01'), Synset('underbrush.n.01')]

In [64]:
tree=wn.synset('tree.n.01')  # get the tree synset
tree.member_holonyms()       # Returns entities that the tree is a member-of

[Synset('forest.n.01')]

### Substance Holonyms and Meronyms

In [65]:
bread= wn.synset('bread.n.01')  # get synset for bread
bread.substance_meronyms()      # Returns entities that are substances of bread

[Synset('flour.n.01')]

In [66]:
flour = wn.synset('flour.n.01')  # get synset for flour
flour.substance_holonyms()       # Returns entities that flour is a substance-of

[Synset('bread.n.01'), Synset('dough.n.01'), Synset('pastry.n.02')]

## Similarity Measures

### Path-Based Similarities

In [67]:
love = wn.synset('love.n.01')
romance = wn.synset('romance.n.01')
hate = wn.synset('hate.n.01')

**Path Similarity** (returns 0 to 1)

In [68]:
# similarity is not synonymy: score each labelled pair of synsets
for label, first, second in (
    ('Love - Romance', love, romance),
    ('Love - Hate', love, hate),
    ('Romance - Hate', romance, hate),
):
    print(label, first.path_similarity(second))

Love - Romance 0.16666666666666666
Love - Hate 0.3333333333333333
Romance - Hate 0.16666666666666666


**Leacock-Chodorow Similarity**

In [69]:
# Leacock-Chodorow score for each labelled pair of synsets
for label, first, second in (
    ('Love - Romance', love, romance),
    ('Love - Hate', love, hate),
    ('Romance - Hate', romance, hate),
):
    print(label, first.lch_similarity(second))

Love - Romance 1.845826690498331
Love - Hate 2.538973871058276
Romance - Hate 1.845826690498331


**Wu-Palmer Similarity** (returns 0 to 1)

In [70]:
# Wu-Palmer score for each labelled pair of synsets
for label, first, second in (
    ('Love - Romance', love, romance),
    ('Love - Hate', love, hate),
    ('Romance - Hate', romance, hate),
):
    print(label, first.wup_similarity(second))

Love - Romance 0.6153846153846154
Love - Hate 0.8571428571428571
Romance - Hate 0.6153846153846154


### Information-Content Based Similarities

In [72]:
from nltk.corpus import wordnet_ic as wic  # import that allows loading of information content
ic = wic.ic('ic-brown.dat')                # load information content from the Brown corpus into the variable ic

**Lin Similarity**

In [73]:
# Lin score for each labelled pair of synsets, using the loaded information content
for label, first, second in (
    ('Love - Romance', love, romance),
    ('Love - Hate', love, hate),
    ('Romance - Hate', romance, hate),
):
    print(label, first.lin_similarity(second, ic))

Love - Romance 0.3148881810545228
Love - Hate 0.7071675666744289
Romance - Hate 0.31353592105655803


**Resnik Similarity**

In [74]:
# Resnik score for each labelled pair of synsets, using the loaded information content
for label, first, second in (
    ('Love - Romance', love, romance),
    ('Love - Hate', love, hate),
    ('Romance - Hate', romance, hate),
):
    print(label, first.res_similarity(second, ic))

Love - Romance 3.139908049247135
Love - Hate 6.089739897762795
Romance - Hate 3.139908049247135


**Jiang-Conrath Similarity**

In [75]:
# Jiang-Conrath score for each labelled pair of synsets, using the loaded information content
for label, first, second in (
    ('Love - Romance', love, romance),
    ('Love - Hate', love, hate),
    ('Romance - Hate', romance, hate),
):
    print(label, first.jcn_similarity(second, ic))

Love - Romance 0.07318936425583651
Love - Hate 0.19827794926970765
Romance - Hate 0.07273150289041953
