# Loading word embeddings

In [1]:
import io
import numpy as np

In [2]:
def load_vec(emb_path, nmax=50000):
    """Load word embeddings from a whitespace-separated text file.

    The first line of the file is skipped (presumably the ``<count> <dim>``
    header of the word2vec/fastText text format -- confirm against the files
    you load). Every following non-blank line is parsed as
    ``word v1 v2 ... vd``.

    Args:
        emb_path: path of the embedding text file, read as UTF-8 with
            undecodable bytes ignored.
        nmax: stop after this many words have been loaded.

    Returns:
        A tuple ``(embeddings, id2word, word2id)`` where ``embeddings`` is the
        row-wise stack of the loaded vectors, ``word2id`` maps each word to
        its row index and ``id2word`` is the inverse mapping.

    Raises:
        AssertionError: if the same word appears twice in the file.
        ValueError: if the file contains no embedding lines, or a non-blank
            line has no vector part after the word.
    """
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        # Skip the header line; the default avoids StopIteration on an empty file.
        next(f, None)
        for line in f:
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no embedding.
                continue
            word, vect = line.split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    if not vectors:
        # np.vstack([]) would fail with a message that does not name the file.
        raise ValueError('no embeddings found in %s' % emb_path)
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

In [5]:
# Embedding files to load. Judging by the file names, the source side is
# Chinese ('zh') and the target side is English ('en') -- confirm against the
# run that produced them.
src_path = 'dumped/debug/2c288k3usr/vectors-zh.txt'
tgt_path = 'dumped/debug/2c288k3usr/vectors-en.txt'
nmax = 50000  # maximum number of word embeddings to load

# For each side: the embedding matrix plus the id -> word and word -> id lookups.
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

In [12]:
# Display the source-side word -> row-index mapping returned by load_vec.
src_word2id

{'</s>': 0,
 '-': 1,
 '/': 2,
 '【': 3,
 '】': 4,
 '(': 5,
 ')': 6,
 '包': 7,
 '手機殼': 8,
 '揹': 9,
 '+': 10,
 '~': 11,
 '系列': 12,
 '臺': 13,
 'dvd': 14,
 'baby': 15,
 '2': 16,
 '髮': 17,
 '女': 18,
 '日本': 19,
 '魔法': 20,
 'nike': 21,
 '*': 22,
 'plus': 23,
 '運動': 24,
 '小': 25,
 '入': 26,
 '3': 27,
 '時尚': 28,
 '後': 29,
 '灣': 30,
 'ifairies': 31,
 '經典': 32,
 '休閒': 33,
 '製': 34,
 '組': 35,
 '5': 36,
 '6': 37,
 '1': 38,
 '4': 39,
 'o': 40,
 'sports': 41,
 '上衣': 42,
 '的': 43,
 '黑色': 44,
 '_': 45,
 '白': 46,
 'pro': 47,
 'ultra': 48,
 '盒': 49,
 '短褲': 50,
 '10': 51,
 '墊': 52,
 'big': 53,
 '慢跑鞋': 54,
 '貼': 55,
 '外套': 56,
 '男': 57,
 '黑': 58,
 '玫瑰': 59,
 '套裝': 60,
 '卡通': 61,
 '鍊': 62,
 't': 63,
 '斜': 64,
 '褲': 65,
 '耐吉': 66,
 '藍': 67,
 'm': 68,
 '個': 69,
 '片': 70,
 '防水': 71,
 'sony': 72,
 'z5': 73,
 'polo': 74,
 '鞋': 75,
 '鋼筆': 76,
 '12': 77,
 '背心': 78,
 'x': 79,
 '殼': 80,
 's8': 81,
 '）': 82,
 '支': 83,
 '長': 84,
 'logo': 85,
 '#': 86,
 '現貨': 87,
 '完美': 88,
 '筆': 89,
 'new': 90,
 '華碩': 91,
 'adidas': 92,
 

# Get nearest neighbors

In [6]:
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    """Print the K target words most cosine-similar to a source word.

    Args:
        word: query word; must be one of the values of `src_id2word`.
        src_emb: 2-D array of source embeddings, indexed by word id.
        src_id2word: dict mapping source ids to words.
        tgt_emb: 2-D array of candidate embeddings, one row per target id.
        tgt_id2word: dict mapping target ids to words.
        K: number of neighbors to print; values <= 0 print only the header.

    Returns:
        None. One header line is printed, followed by one
        ``score - word`` line per neighbor, best first.

    Raises:
        KeyError: if `word` is not present in `src_id2word`.
    """
    print("Nearest neighbors of \"%s\":" % word)
    word2id = {v: k for k, v in src_id2word.items()}
    word_emb = src_emb[word2id[word]]

    # A zero-norm vector would produce NaN scores, and argsort orders NaN last,
    # so such rows would be reported as the *best* matches. Dividing by 1
    # instead gives them a similarity of 0.
    tgt_norms = np.linalg.norm(tgt_emb, 2, 1)
    tgt_norms = np.where(tgt_norms == 0, 1.0, tgt_norms)
    word_norm = np.linalg.norm(word_emb) or 1.0

    scores = (tgt_emb / tgt_norms[:, None]).dot(word_emb / word_norm)
    # Reverse first, then truncate: slicing with [-K:] returns everything when K == 0.
    k_best = scores.argsort()[::-1][:max(K, 0)]
    for idx in k_best:
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

In [22]:
# Print nearest neighbors within the source space: the source embeddings are
# passed both as the query side and as the candidate side.
src_word = 'plus'
get_nn(src_word, src_embeddings, src_id2word, src_embeddings, src_id2word, K=5)

Nearest neighbors of "plus":
1.0000 - plus
1.0000 - ultra
1.0000 - 手機殼
1.0000 - iphone
1.0000 - asus


In [23]:
# Print nearest neighbors in the target space: the query vector comes from the
# source embeddings and the candidates are the target embeddings.
src_word = 'plus'
get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "plus":
0.9965 - double
0.9965 - bracelet
0.9964 - color
0.9963 - male
0.9963 - computer


# Visualize multilingual embeddings

In [19]:
from sklearn.decomposition import PCA
# Fit a 2-component, whitened PCA on the source and target embeddings stacked
# together, so both sides are projected with the same transform.
pca = PCA(n_components=2, whiten=True)  # alternative: TSNE(n_components=2, n_iter=3000, verbose=2)
pca.fit(np.vstack([src_embeddings, tgt_embeddings]))
# NOTE(review): the recorded run reports 1.00 here with only two components,
# which is unusual for word embeddings -- worth checking the input vectors.
print('Variance explained: %.2f' % pca.explained_variance_ratio_.sum())

Variance explained: 1.00


In [20]:
import matplotlib.pyplot as plt


def plot_similar_word(src_words, src_word2id, src_emb, tgt_words, tgt_word2id, tgt_emb, pca):
    """Scatter-plot selected source and target words in a shared 2-D projection.

    Args:
        src_words: source words to plot; each must be a key of `src_word2id`.
        src_word2id: dict mapping source words to row indices of `src_emb`.
        src_emb: source embedding array, indexed by row.
        tgt_words: target words to plot; each must be a key of `tgt_word2id`.
        tgt_word2id: dict mapping target words to row indices of `tgt_emb`.
        tgt_emb: target embedding array, indexed by row.
        pca: object whose `transform` maps the stacked vectors to an array
            with at least two columns (column 0 -> x, column 1 -> y).

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    n_src = len(src_words)

    # Collect the vectors to project: all source words first, then all target words.
    points = [src_emb[src_word2id[w]] for w in src_words]
    points += [tgt_emb[tgt_word2id[w]] for w in tgt_words]
    labels = list(src_words) + list(tgt_words)

    # Project with the supplied transform and split into plot coordinates.
    projected = pca.transform(points)
    xs, ys = projected[:, 0], projected[:, 1]

    plt.figure(figsize=(10, 8), dpi=80)
    plt.scatter(xs, ys, marker='x')

    # Label every point: source words in blue, target words in red.
    for pos, label in enumerate(labels):
        if pos < n_src:
            color = 'blue'
        else:
            color = 'red'
        plt.annotate(label, xy=(xs[pos], ys[pos]), xytext=(0, 0), textcoords='offset points',
                     fontsize=19, color=color, weight='bold')

    # Leave a small margin around the outermost points.
    margin = 0.2
    plt.xlim(xs.min() - margin, xs.max() + margin)
    plt.ylim(ys.min() - margin, ys.max() + margin)
    plt.title('Visualization of the multilingual word embedding space')

    plt.show()

In [21]:
# Six hand-picked example word pairs to plot (source word / target word).
# NOTE(review): these lists are English / Spanish, while the embeddings loaded
# earlier in this notebook come from 'vectors-zh.txt' and 'vectors-en.txt'.
# Replace them with words that exist in src_word2id / tgt_word2id, otherwise
# the assertions below fail.
src_words = ['university', 'love', 'history', 'tennis', 'research', 'conference']
tgt_words = ['universidad', 'amor', 'historia', u'tenis',  u'investigación', 'conferencia']

# Fail early with a clear message if any word is missing from its vocabulary.
for sw in src_words:
    assert sw in src_word2id, '"%s" not in source dictionary' % sw
for tw in tgt_words:
    # Report the target word itself (the original formatted the stale `sw`).
    assert tw in tgt_word2id, '"%s" not in target dictionary' % tw

plot_similar_word(src_words, src_word2id, src_embeddings, tgt_words, tgt_word2id, tgt_embeddings, pca)

AssertionError: "university" not in source dictionary