In [1]:
# pip install -U future

In [3]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

In [5]:
# WHERE TO GET THE VECTORS:
# GloVe: https://nlp.stanford.edu/projects/glove/
# Direct link: http://nlp.stanford.edu/data/glove.6B.zip

In [6]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

In [7]:
def dist1(a, b):
    """Return the Euclidean (L2) distance between vectors ``a`` and ``b``."""
    difference = a - b
    return np.linalg.norm(difference)
def dist2(a, b):
    """Return the cosine distance (1 - cosine similarity) between ``a`` and ``b``."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    cosine_similarity = a.dot(b) / norm_product
    return 1 - cosine_similarity

In [9]:
# Pick the distance function used by the brute-force search and the metric
# name passed to pairwise_distances by the vectorized helpers.
dist = dist2
metric = 'cosine'
# Euclidean alternative: dist, metric = dist1, 'euclidean'

In [10]:
def find_analogies(w1, w2, w3):
    """Print the word that best completes the analogy ``w1 - w2 = ? - w3``.

    Brute-force variant: walks every entry of the global ``word2vec`` mapping
    and keeps the word whose vector is closest, under the global ``dist``
    function, to ``word2vec[w1] - word2vec[w2] + word2vec[w3]``. The three
    query words themselves are never chosen. If a query word is missing from
    ``word2vec``, a message naming it is printed and the function returns.
    """
    query_words = (w1, w2, w3)
    missing = [w for w in query_words if w not in word2vec]
    if missing:
        # only the first missing word is reported
        print("%s not in dictionary" % missing[0])
        return

    target = word2vec[w1] - word2vec[w2] + word2vec[w3]

    best_word, min_dist = '', float('inf')
    for candidate, vector in iteritems(word2vec):
        if candidate in query_words:
            continue
        d = dist(target, vector)
        if d < min_dist:
            best_word, min_dist = candidate, d
    print(w1, "-", w2, "=", best_word, "-", w3)

In [11]:
## faster
def find_analogies(w1, w2, w3):
  """Print the word that best completes the analogy ``w1 - w2 = ? - w3``.

  Vectorized variant: the distance from the query vector to every row of the
  global ``embedding`` matrix is computed in a single ``pairwise_distances``
  call. Only the four nearest rows are examined, and the first one that is
  not a query word is reported. If a query word is missing from ``word2vec``,
  a message naming it is printed and the function returns.
  """
  query_words = (w1, w2, w3)
  for w in query_words:
    if w not in word2vec:
      print("%s not in dictionary" % w)
      return

  v0 = word2vec[w1] - word2vec[w2] + word2vec[w3]

  # one row of distances (query vs. every vocabulary vector), flattened to length V
  distances = pairwise_distances(v0.reshape(1, D), embedding, metric=metric)
  distances = distances.reshape(V)

  # walk the four closest rows in increasing distance order
  for idx in distances.argsort()[:4]:
    if idx2word[idx] not in query_words:
      best_word = idx2word[idx]
      break

  print(w1, "-", w2, "=", best_word, "-", w3)

In [12]:
def nearest_neighbors(w, n=5):
  """Print the ``n`` vocabulary words whose vectors are closest to ``w``.

  Distances are computed between ``word2vec[w]`` and every row of the global
  ``embedding`` matrix using the global ``metric``. The query word itself is
  never listed as a neighbor.

  Args:
    w: query word; must be a key of ``word2vec``.
    n: number of neighbors to print (default 5).

  Prints a "not in dictionary" message and returns if ``w`` is unknown;
  otherwise prints a header line followed by one tab-indented word per line.
  """
  if w not in word2vec:
    print("%s not in dictionary:" % w)
    return

  v = word2vec[w]
  distances = pairwise_distances(v.reshape(1, D), embedding, metric=metric).reshape(V)
  # Fetch one extra candidate and drop the query word by name instead of
  # assuming it sorts to position 0: with tied or rounded distances another
  # word can come first, which would list `w` as its own neighbor.
  candidates = [idx for idx in distances.argsort()[:n + 1] if idx2word[idx] != w]
  print("neighbors of: %s" % w)
  for idx in candidates[:n]:
    print("\t%s" % idx2word[idx])


In [15]:
# Load the pre-trained word vectors into three parallel structures:
#   word2vec  - word -> vector lookup
#   idx2word  - row index -> word
#   embedding - one vector per row, in the same order as idx2word
print('Loading word vectors...')
word2vec, embedding, idx2word = {}, [], []
# The file is space-separated text, one word per line:
#   word vec[0] vec[1] vec[2] ...
with open('glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    idx2word.append(word)
    embedding.append(vec)
    word2vec[word] = vec
print('Found %s word vectors.' % (len(word2vec),))
# Turn the list of vectors into a 2-D array: V rows (words) by D columns.
embedding = np.array(embedding)
V, D = embedding.shape

Loading word vectors...
Found 400000 word vectors.


In [21]:
# Print the word that completes the analogy: king - man = ? - woman
find_analogies('king', 'man', 'woman')

king - man = queen - woman


In [17]:
# Print the word that completes the analogy: france - paris = ? - london
find_analogies('france', 'paris', 'london')

france - paris = britain - london


In [18]:
# Print the nearest neighbors of 'king' in the embedding space.
nearest_neighbors('king')

neighbors of: king
	prince
	queen
	ii
	emperor
	son


In [19]:
# Print the nearest neighbors of 'france' in the embedding space.
nearest_neighbors('france')

neighbors of: france
	french
	belgium
	paris
	spain
	netherlands


In [26]:
# Print the nearest neighbors of 'fed' in the embedding space.
nearest_neighbors('fed')

neighbors of: fed
	rates
	inflation
	likely
	greenspan
	policymakers


In [28]:
pip install gensim

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting gensim
  Downloading gensim-4.3.1-cp311-cp311-win_amd64.whl (23.9 MB)
                                              0.0/23.9 MB ? eta -:--:--
     -                                        0.9/23.9 MB 29.1 MB/s eta 0:00:01
     ---------                                5.9/23.9 MB 75.9 MB/s eta 0:00:01
     ------------------                     11.5/23.9 MB 108.8 MB/s eta 0:00:01
     -----------------------                14.7/23.9 MB 110.0 MB/s eta 0:00:01
     --------------------------              16.3/23.9 MB 72.6 MB/s eta 0:00:01
     -----------------------------------     21.8/23.9 MB 72.6 MB/s eta 0:00:01
     --------------------------------------- 23.9/23.9 MB 65.6 MB/s eta 0:00:00
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-6.3.0-py3-none-any.whl (56 kB)
                                              0.0/56.8 kB ? eta -:--:--
     --------------------------------------

In [29]:
from gensim.models import KeyedVectors

In [30]:
# warning: takes quite a while
# https://code.google.com/archive/p/word2vec/
# direct link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
# 3 million words and phrases
# D = 300

In [35]:
# Read the binary word2vec-format vectors file into a gensim KeyedVectors object.
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [36]:
def find_analogies(w1, w2, w3):
  """Print the word that best completes the analogy ``w1 - w2 = ? - w3``.

  Uses ``word_vectors.most_similar`` with ``w1`` and ``w3`` as positive terms
  and ``w2`` as the negative term, and reports the top-ranked result.

  If any query word is not in ``word_vectors``, a "not in dictionary" message
  naming it is printed and the function returns without querying.
  """
  for w in (w1, w2, w3):
    # Guard up front: most_similar raises KeyError on an unknown word, which
    # would abort every remaining query in the same cell.
    if w not in word_vectors:
      print("%s not in dictionary" % w)
      return

  r = word_vectors.most_similar(positive=[w1, w3], negative=[w2])
  # r is a ranked list of (word, similarity) pairs; keep the best word only
  print("%s - %s = %s - %s" % (w1, w2, r[0][0], w3))

In [37]:
def nearest_neighbors(w, n=10):
  """Print the ``n`` words most similar to ``w`` according to ``word_vectors``.

  Args:
    w: query word.
    n: number of neighbors to print, passed to ``most_similar`` as ``topn``.
      Defaults to 10, the library default, so ``nearest_neighbors(w)``
      behaves as before.

  If ``w`` is not in ``word_vectors``, a "not in dictionary" message is
  printed and the function returns; otherwise a header line is printed
  followed by one tab-indented word per line.
  """
  # Guard up front: most_similar raises KeyError on an unknown word.
  if w not in word_vectors:
    print("%s not in dictionary" % w)
    return

  r = word_vectors.most_similar(positive=[w], topn=n)
  print("neighbors of: %s" % w)
  # each result is a (word, similarity) pair; only the word is shown
  for word, _ in r:
    print("\t%s" % word)

In [38]:
# Print the word that completes the analogy: king - man = ? - woman
find_analogies('king', 'man', 'woman')

king - man = queen - woman


In [39]:
# Each triple (w1, w2, w3) asks for the word completing "w1 - w2 = ? - w3".
# Queries run in the listed order; the last two repeat earlier entries.
analogy_queries = [
  ('france', 'paris', 'london'),
  ('france', 'paris', 'rome'),
  ('paris', 'france', 'italy'),
  ('france', 'french', 'english'),
  ('japan', 'japanese', 'chinese'),
  ('japan', 'japanese', 'italian'),
  ('japan', 'japanese', 'australian'),
  ('december', 'november', 'june'),
  ('miami', 'florida', 'texas'),
  ('einstein', 'scientist', 'painter'),
  ('china', 'rice', 'bread'),
  ('man', 'woman', 'she'),
  ('man', 'woman', 'aunt'),
  ('man', 'woman', 'sister'),
  ('man', 'woman', 'wife'),
  ('man', 'woman', 'actress'),
  ('man', 'woman', 'mother'),
  ('heir', 'heiress', 'princess'),
  ('nephew', 'niece', 'aunt'),
  ('france', 'paris', 'tokyo'),
  ('france', 'paris', 'beijing'),
  ('february', 'january', 'november'),
  ('france', 'paris', 'rome'),
  ('paris', 'france', 'italy'),
]
for query in analogy_queries:
  find_analogies(*query)

france - paris = england - london
france - paris = italy - rome
paris - france = lohan - italy
france - french = england - english
japan - japanese = tibet - chinese
japan - japanese = italy - italian
japan - japanese = queensland - australian
december - november = september - june
miami - florida = dallas - texas
einstein - scientist = jude - painter
china - rice = dinnerware - bread
man - woman = he - she
man - woman = uncle - aunt
man - woman = brother - sister
man - woman = son - wife
man - woman = actor - actress
man - woman = father - mother
heir - heiress = prince - princess
nephew - niece = uncle - aunt
france - paris = japan - tokyo
france - paris = chinese - beijing
february - january = april - november
france - paris = italy - rome
paris - france = lohan - italy


In [40]:
# Print the nearest neighbors of each query word, in the listed order.
neighbor_queries = ('king', 'france', 'japan', 'einstein',
                    'woman', 'nephew', 'february', 'rome')
for query_word in neighbor_queries:
  nearest_neighbors(query_word)

neighbors of: king
	kings
	queen
	monarch
	crown_prince
	prince
	sultan
	ruler
	princes
	Prince_Paras
	throne
neighbors of: france
	spain
	french
	germany
	europe
	italy
	england
	european
	belgium
	usa
	serbia
neighbors of: japan
	japanese
	tokyo
	america
	europe
	germany
	chinese
	india
	hawaii
	usa
	korea
neighbors of: einstein
	nikki
	lmfao
	albert
	armstrong
	joan
	becky
	mcmahon
	conrad
	lori
	haley
neighbors of: woman
	man
	girl
	teenage_girl
	teenager
	lady
	teenaged_girl
	mother
	policewoman
	boy
	Woman
neighbors of: nephew
	son
	uncle
	brother
	grandson
	cousin
	father
	niece
	younger_brother
	nephews
	stepson
neighbors of: february
	january
	april
	september
	december
	july
	october
	november
	june
	feb
	norway
neighbors of: rome
	athens
	albert
	holmes
	donnie
	italy
	toni
	spain
	jh
	pablo
	malta


In [41]:
# Print the nearest neighbors of 'brazil' in the embedding space.
nearest_neighbors('brazil')

neighbors of: brazil
	argentina
	spain
	usa
	brazilian
	carlos
	portugal
	diego
	france
	barcelona
	switzerland
