<a href="https://colab.research.google.com/github/egipot/LP_DeepLearning_NLP/blob/main/1_4_pretrained_word_from_GloVe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python

# Author: http://lazyprogrammer.me
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


In [35]:
# WHERE TO GET THE VECTORS:
# GloVe: https://nlp.stanford.edu/projects/glove/
# Direct link: http://nlp.stanford.edu/data/glove.6B.zip

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances


def dist1(a, b):
    """Return the norm of ``a - b`` (the Euclidean distance for 1-D vectors).

    Both arguments must support element-wise subtraction, e.g. numpy arrays
    of the same shape.
    """
    difference = a - b
    return np.linalg.norm(difference)
def dist2(a, b):
    """Return the cosine distance between ``a`` and ``b``.

    Computed as one minus the cosine similarity: 0 for vectors pointing the
    same way, 1 for orthogonal vectors, 2 for opposite vectors.
    """
    dot_product = a.dot(b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return 1 - dot_product / norm_product


In [36]:

# Pick a distance type.
#   dist   - Python callable, used only by the commented-out brute-force
#            find_analogies below.
#   metric - metric name handed to sklearn's pairwise_distances by the
#            vectorised find_analogies / nearest_neighbors.
# Keep the two in sync when switching between the options.
dist, metric = dist2, 'cosine'
# dist, metric = dist1, 'euclidean'


## more intuitive
# def find_analogies(w1, w2, w3):
#   for w in (w1, w2, w3):
#     if w not in word2vec:
#       print("%s not in dictionary" % w)
#       return

#   king = word2vec[w1]
#   man = word2vec[w2]
#   woman = word2vec[w3]
#   v0 = king - man + woman

#   min_dist = float('inf')
#   best_word = ''
#   for word, v1 in iteritems(word2vec):
#     if word not in (w1, w2, w3):
#       d = dist(v0, v1)
#       if d < min_dist:
#         min_dist = d
#         best_word = word
#   print(w1, "-", w2, "=", best_word, "-", w3)


## faster
def find_analogies(w1, w2, w3):
  """Print the word x that best completes the analogy ``w1 - w2 = x - w3``.

  The target vector is ``word2vec[w1] - word2vec[w2] + word2vec[w3]``; x is
  the vocabulary word closest to it (under the module-level ``metric``) that
  is not one of the three query words.

  Reads the module-level ``word2vec``, ``embedding``, ``idx2word`` and
  ``metric``. Prints the result and returns None. If a query word is missing
  from ``word2vec``, or no candidate word exists, a message is printed
  instead.
  """
  query_words = (w1, w2, w3)
  for w in query_words:
    if w not in word2vec:
      print("%s not in dictionary" % w)
      return

  # e.g. king - man + woman, which should land near "queen"
  v0 = word2vec[w1] - word2vec[w2] + word2vec[w3]

  # reshape(1, -1) / ravel() take their sizes from the data itself, so this
  # function does not rely on the globals D and V being up to date.
  distances = pairwise_distances(v0.reshape(1, -1), embedding, metric=metric).ravel()

  # The query words themselves may be among the closest rows, so look at one
  # more candidate than there are query words (assumes idx2word has no
  # duplicate entries - TODO confirm for other vector files).
  best_word = None
  for idx in distances.argsort()[:len(query_words) + 1]:
    candidate = idx2word[idx]
    if candidate not in query_words:
      best_word = candidate
      break

  # Previously best_word stayed unbound here (UnboundLocalError) when every
  # inspected candidate was a query word, e.g. with a tiny vocabulary.
  if best_word is None:
    print("no analogy found for %s - %s + %s" % query_words)
    return

  print(w1, "-", w2, "=", best_word, "-", w3)


def nearest_neighbors(w, n=5):
  """Print the ``n`` vocabulary words closest to ``w``.

  Distances are computed between ``word2vec[w]`` and every row of the
  module-level ``embedding`` using the module-level ``metric``; row indices
  are mapped back to words through ``idx2word``. Prints the result and
  returns None. If ``w`` is not in ``word2vec`` a message is printed instead.
  """
  if w not in word2vec:
    print("%s not in dictionary:" % w)
    return

  n = max(n, 0)  # a negative n would otherwise slice from the wrong end

  v = word2vec[w]
  # reshape(1, -1) / ravel() take their sizes from the data itself, so this
  # function does not rely on the globals D and V being up to date.
  distances = pairwise_distances(v.reshape(1, -1), embedding, metric=metric).ravel()

  # Fetch one extra candidate and drop the query word by name rather than by
  # position: with ties or floating-point round-off the query word is not
  # guaranteed to sort first, so skipping index 0 could drop a real neighbor
  # and list w as its own neighbor.
  candidates = distances.argsort()[:n + 1]
  neighbors = [idx2word[idx] for idx in candidates if idx2word[idx] != w][:n]

  print("neighbors of: %s" % w)
  for neighbor in neighbors:
    print("\t%s" % neighbor)



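Upload `glove.6B.50d.txt` (extracted from the GloVe 6B zip linked above) to the Colab session so the word vectors can be read from a local path in the cells below.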

In [48]:
# Path of the uploaded GloVe vectors file inside the Colab runtime.
source_file = '/content/glove.6B.50d.txt'

In [49]:
!ls {source_file}  # Should print the filename if it exists

/content/glove.6B.50d.txt


In [51]:
# Print the first line just for checking the file format:
# one word followed by its space-separated vector components.
# The encoding is given explicitly (the loader cell also reads this file as
# UTF-8) so the check does not depend on the platform's default encoding.
with open(source_file, encoding='utf-8') as f:
    first_line = f.readline().strip('\n')
print(first_line)

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581


In [52]:
# Load the pre-trained word vectors into three parallel structures:
#   word2vec  - dict mapping word -> vector
#   idx2word  - list mapping row index -> word
#   embedding - 2-D array whose i-th row is the vector of idx2word[i]
print('Loading word vectors...')
word2vec = {}
idx2word = []
vectors = []  # rows are collected here so `embedding` is only ever an array
with open(source_file, encoding='utf-8') as f:
  # The file is space-separated text in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    if not values:
      # a blank line (e.g. a trailing newline) has no word to index
      continue
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
    vectors.append(vec)
    idx2word.append(word)
print('Found %s word vectors.' % len(word2vec))
embedding = np.array(vectors)
# V = number of rows (vocabulary size), D = vector dimensionality
V, D = embedding.shape



Loading word vectors...
Found 400000 word vectors.


In [53]:
find_analogies('king', 'man', 'woman')

king - man = queen - woman


In [54]:
find_analogies('man', 'woman', 'she')

man - woman = he - she


In [55]:
# Each triple (w1, w2, w3) asks find_analogies to complete: w1 - w2 = ? - w3.
# The queries run in the listed order; two of them appear twice on purpose so
# the printed output stays the same as before.
analogy_queries = [
  ('france', 'paris', 'london'),
  ('france', 'paris', 'rome'),
  ('paris', 'france', 'italy'),
  ('france', 'french', 'english'),
  ('japan', 'japanese', 'chinese'),
  ('japan', 'japanese', 'italian'),
  ('japan', 'japanese', 'australian'),
  ('december', 'november', 'june'),
  ('miami', 'florida', 'texas'),
  ('einstein', 'scientist', 'painter'),
  ('china', 'rice', 'bread'),
  ('man', 'woman', 'aunt'),
  ('man', 'woman', 'sister'),
  ('man', 'woman', 'wife'),
  ('man', 'woman', 'actress'),
  ('man', 'woman', 'mother'),
  ('heir', 'heiress', 'princess'),
  ('nephew', 'niece', 'aunt'),
  ('france', 'paris', 'tokyo'),
  ('france', 'paris', 'beijing'),
  ('february', 'january', 'november'),
  ('france', 'paris', 'rome'),
  ('paris', 'france', 'italy'),
]
for analogy_query in analogy_queries:
  find_analogies(*analogy_query)




france - paris = britain - london
france - paris = italy - rome
paris - france = rome - italy
france - french = england - english
japan - japanese = china - chinese
japan - japanese = italy - italian
japan - japanese = australia - australian
december - november = july - june
miami - florida = houston - texas
einstein - scientist = matisse - painter
china - rice = chinese - bread
man - woman = uncle - aunt
man - woman = brother - sister
man - woman = friend - wife
man - woman = actor - actress
man - woman = father - mother
heir - heiress = queen - princess
nephew - niece = uncle - aunt
france - paris = japan - tokyo
france - paris = china - beijing
february - january = october - november
france - paris = italy - rome
paris - france = rome - italy


In [56]:
nearest_neighbors('woman')

neighbors of: woman
	girl
	man
	mother
	her
	boy


In [57]:
# Print the default number of nearest neighbors for a handful of probe words.
neighbor_queries = ('king', 'france', 'japan', 'einstein', 'nephew', 'february', 'rome')
for neighbor_query in neighbor_queries:
  nearest_neighbors(neighbor_query)

neighbors of: king
	prince
	queen
	ii
	emperor
	son
neighbors of: france
	french
	belgium
	paris
	spain
	netherlands
neighbors of: japan
	japanese
	china
	korea
	tokyo
	taiwan
neighbors of: einstein
	relativity
	bohr
	physics
	heisenberg
	freud
neighbors of: nephew
	cousin
	brother
	grandson
	son
	uncle
neighbors of: february
	october
	december
	january
	august
	september
neighbors of: rome
	naples
	venice
	italy
	turin
	pope


In [58]:
# Own test: asks find_analogies to complete
#   sofia - bulgaria = ? - philippines

find_analogies('sofia', 'bulgaria', 'philippines')

sofia - bulgaria = manila - philippines


In [59]:
# NOTE: word lookup is an exact, case-sensitive dictionary match, so the
# capitalized 'Poland' is reported as missing. Presumably this vocabulary
# is all lowercase (TODO confirm); use lowercase query words.
find_analogies('polish', 'Poland', 'Bulgaria' )

Poland not in dictionary


In [60]:
find_analogies('polish', 'poland', 'bulgaria' )

polish - poland = bulgarian - bulgaria


In [61]:
find_analogies ('rakia', 'bulgaria', 'hungary')

rakia - bulgaria = makanda - hungary


In [63]:
find_analogies ('rakia', 'bulgaria', 'poland')

rakia - bulgaria = melsungen - poland


In [64]:
find_analogies ('rakia', 'bulgaria', 'philippines')

rakia - bulgaria = dactylorhiza - philippines


In [65]:
find_analogies ('bulgarian', 'bulgaria', 'philippines')

bulgarian - bulgaria = philippine - philippines


In [66]:
nearest_neighbors('rakia')

neighbors of: rakia
	makanda
	naem
	wallum
	asbjorn
	rataje


In [67]:
nearest_neighbors('philippines')

neighbors of: philippines
	indonesia
	philippine
	thailand
	peru
	manila


In [68]:
nearest_neighbors('bulgaria')

neighbors of: bulgaria
	romania
	hungary
	lithuania
	ukraine
	latvia


In [69]:
nearest_neighbors('moussaka')

neighbors of: moussaka
	spanakopita
	caponata
	sauerbraten
	omelettes
	frittatas
