# Imports 

In [1]:
from __future__ import print_function, division

from future.utils import iteritems
from builtins import range

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

# Distance Functions

In [2]:
# Euclidean distance
def dist1(a, b):
    return np.linalg.norm(a - b)

# Cosine distance
def dist2(a, b):
    return 1 - a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Distance Metric Configuration

In [3]:
# Choose distance function
dist, metric = dist2, 'cosine'
# dist, metric = dist1, 'euclidean'

# Analogy Solver

In [4]:
def find_analogies(w1, w2, w3):
    """
    Solves analogies of the form:
    w1 - w2 + w3 ≈ ?
    Example: king - man + woman ≈ queen
    """

    for w in (w1, w2, w3):
        if w not in word2vec:
            print("%s not in dictionary" % w)
            return

    v0 = word2vec[w1] - word2vec[w2] + word2vec[w3]

    min_dist = float('inf')
    best_word = None

    for word, v1 in iteritems(word2vec):
        if word not in (w1, w2, w3):
            d = dist(v0, v1)
            if d < min_dist:
                min_dist = d
                best_word = word

    print(w1, "-", w2, "=", best_word, "-", w3)

# Optimized Analogy Solver (Vectorized)

In [5]:
def find_analogies_fast(w1, w2, w3):
    """
    Faster analogy computation using pairwise distances
    """

    for w in (w1, w2, w3):
        if w not in word2vec:
            print("%s not in dictionary" % w)
            return

    v0 = word2vec[w1] - word2vec[w2] + word2vec[w3]

    distances = pairwise_distances(
        v0.reshape(1, D),
        embedding,
        metric=metric
    ).reshape(V)

    idxs = distances.argsort()[:4]

    for idx in idxs:
        word = idx2word[idx]
        if word not in (w1, w2, w3):
            print(w1, "-", w2, "=", word, "-", w3)
            break

# Nearest Neighbors Function

In [6]:
def nearest_neighbors(w, n=5):
    """
    Prints the n nearest neighbors of a word
    """

    if w not in word2vec:
        print("%s not in dictionary" % w)
        return

    v = word2vec[w]

    distances = pairwise_distances(
        v.reshape(1, D),
        embedding,
        metric=metric
    ).reshape(V)

    idxs = distances.argsort()[1:n+1]

    print("Neighbors of:", w)
    for idx in idxs:
        print("\t", idx2word[idx])

# GloVe 50D Model

# Load GloVe 50D Vectors

In [7]:
print("Loading GloVe 50D word vectors...")

word2vec = {}
embedding = []
idx2word = []

with open(
    r"C:\Users\gasse\OneDrive\Desktop\archive1\glove.6B.50d.txt",
    encoding="utf-8"
) as f:

    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype="float32")

        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)

embedding = np.array(embedding)
V, D = embedding.shape

print("Found %s word vectors." % V)

Loading GloVe 50D word vectors...
Found 400000 word vectors.


# Inspect Vocabulary & Vectors

In [8]:
for i, (word, vec) in enumerate(word2vec.items()):
    if i == 20:
        break
    print(f"Word: {word}, Vector sample: {vec[:10]}")

print (embedding[0][:10])
print ('-------------------------------')
print (idx2word[:20])
print ('-------------------------------')
print (len(embedding[0]))

Word: the, Vector sample: [ 0.418       0.24968    -0.41242     0.1217      0.34527    -0.044457
 -0.49688    -0.17862    -0.00066023 -0.6566    ]
Word: ,, Vector sample: [ 0.013441  0.23682  -0.16899   0.40951   0.63812   0.47709  -0.42852
 -0.55641  -0.364    -0.23938 ]
Word: ., Vector sample: [ 0.15164  0.30177 -0.16763  0.17684  0.31719  0.33973 -0.43478 -0.31086
 -0.44999 -0.29486]
Word: of, Vector sample: [ 0.70853  0.57088 -0.4716   0.18048  0.54449  0.72603  0.18157 -0.52393
  0.10381 -0.17566]
Word: to, Vector sample: [ 0.68047  -0.039263  0.30186  -0.17792   0.42962   0.032246 -0.41376
  0.13228  -0.29847  -0.085253]
Word: and, Vector sample: [ 0.26818   0.14346  -0.27877   0.016257  0.11384   0.69923  -0.51332
 -0.47368  -0.33075  -0.13834 ]
Word: in, Vector sample: [ 0.33042   0.24995  -0.60874   0.10923   0.036372  0.151    -0.55083
 -0.074239 -0.092307 -0.32821 ]
Word: a, Vector sample: [ 0.21705  0.46515 -0.46757  0.10082  1.0135   0.74845 -0.53104 -0.26256
  0.16812  0.

# Analogy Experiments

In [9]:
print (find_analogies_fast("king", "man", "woman"))
print ('-------------------------------')
print (find_analogies_fast("france", "paris", "london"))
print ('-------------------------------')
print (find_analogies_fast("japan", "japanese", "chinese"))
print ('-------------------------------')
print (find_analogies_fast("man", "woman", "mother"))
print ('-------------------------------')
print (find_analogies_fast("cairo", "egypt", "syria"))

king - man = queen - woman
None
-------------------------------
france - paris = britain - london
None
-------------------------------
japan - japanese = china - chinese
None
-------------------------------
man - woman = father - mother
None
-------------------------------
cairo - egypt = damascus - syria
None


# Nearest Neighbors

In [10]:
print (nearest_neighbors("king"))
print ('-------------------------------')
print (nearest_neighbors("france"))
print ('-------------------------------')
print (nearest_neighbors("einstein"))
print ('-------------------------------')
print (nearest_neighbors("woman"))
print ('-------------------------------')
print (nearest_neighbors("february"))

Neighbors of: king
	 prince
	 queen
	 ii
	 emperor
	 son
None
-------------------------------
Neighbors of: france
	 french
	 belgium
	 paris
	 spain
	 netherlands
None
-------------------------------
Neighbors of: einstein
	 relativity
	 bohr
	 physics
	 heisenberg
	 freud
None
-------------------------------
Neighbors of: woman
	 girl
	 man
	 mother
	 her
	 boy
None
-------------------------------
Neighbors of: february
	 october
	 december
	 january
	 august
	 september
None


# GloVe 300D Model

# Load GloVe 300D Vectors

In [11]:
print("Loading GloVe 300D word vectors...")

word2vec = {}
embedding = []
idx2word = []

with open(
    r"C:\Users\gasse\OneDrive\Desktop\archive\glove.6B.300d.txt",
    encoding="utf-8"
) as f:

    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype="float32")

        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)

embedding = np.array(embedding)
V, D = embedding.shape

print("Found %s word vectors." % V)

Loading GloVe 300D word vectors...
Found 400000 word vectors.


# Analogy Experiments

In [12]:
print (find_analogies_fast("king", "man", "woman"))
print ('-------------------------------')
print (find_analogies_fast("france", "paris", "rome"))
print ('-------------------------------')
print (find_analogies_fast("japan", "japanese", "italian"))
print ('-------------------------------')
print (find_analogies_fast("heir", "heiress", "princess"))
print ('-------------------------------')
print (find_analogies_fast("nephew", "niece", "aunt"))

king - man = queen - woman
None
-------------------------------
france - paris = italy - rome
None
-------------------------------
japan - japanese = italy - italian
None
-------------------------------
heir - heiress = prince - princess
None
-------------------------------
nephew - niece = uncle - aunt
None


# Nearest Neighbors

In [13]:
print (nearest_neighbors("king"))
print ('-------------------------------')
print (nearest_neighbors("france"))
print ('-------------------------------')
print (nearest_neighbors("japan"))
print ('-------------------------------')
print (nearest_neighbors("woman"))
print ('-------------------------------')
print (nearest_neighbors("rome"))
print ('-------------------------------')
print (nearest_neighbors("bolt"))
print ('-------------------------------')
print (nearest_neighbors("hello"))

Neighbors of: king
	 queen
	 prince
	 monarch
	 kingdom
	 throne
None
-------------------------------
Neighbors of: france
	 french
	 paris
	 belgium
	 spain
	 italy
None
-------------------------------
Neighbors of: japan
	 japanese
	 tokyo
	 korea
	 china
	 asia
None
-------------------------------
Neighbors of: woman
	 girl
	 man
	 mother
	 she
	 her
None
-------------------------------
Neighbors of: rome
	 italy
	 naples
	 turin
	 venice
	 roman
None
-------------------------------
Neighbors of: bolt
	 usain
	 locking
	 crossbow
	 bolts
	 asafa
None
-------------------------------
Neighbors of: hello
	 goodbye
	 hey
	 !
	 dolly
	 muddah
None
