In [14]:
import pickle

from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

%matplotlib inline

In [8]:
DATA = '../../../../data'

In [2]:
def get_vectors(embeddings, words):
    """
    Input:
        embeddings: a word 
        fr_embeddings:
        words: a list of words
    Output: 
        X: a matrix where the rows are the embeddings corresponding to the
        rows on the list
    """
    m = len(words)
    X = np.zeros((1, 300))
    for word in words:
        english = word
        eng_emb = embeddings[english]
        X = np.row_stack((X, eng_emb))
    X = X[1:, :]
    return X

In [10]:
data = pd.read_csv(f'{DATA}/capitals.txt', delimiter=' ')
data.columns = ['city1', 'country1', 'city2', 'country2']
data.head()

Unnamed: 0,city1,country1,city2,country2
0,Athens,Greece,Bangkok,Thailand
1,Athens,Greece,Beijing,China
2,Athens,Greece,Berlin,Germany
3,Athens,Greece,Bern,Switzerland
4,Athens,Greece,Cairo,Egypt


In [11]:
data.shape

(4721, 4)

In [19]:
embeddings = KeyedVectors.load_word2vec_format(
    f'{DATA}/GoogleNews-vectors-negative300.bin', binary=True)

In [20]:
f = open(f'{DATA}/capitals.txt', 'r').read()
set_words = set(nltk.word_tokenize(f))
select_words = words = [
    'king', 'queen', 'oil', 'gas', 'happy', 'sad', 'city', 'town', 
    'village', 'country', 'continent', 'petroleum', 'joyful']
for w in select_words:
    set_words.add(w)

In [21]:
def get_word_embeddings(embeddings):
    word_embeddings = {}
    for word in embeddings.vocab:
        if word in set_words:
            word_embeddings[word] = embeddings[word]
    return word_embeddings

In [22]:
word_embeddings = get_word_embeddings(embeddings)
print(len(word_embeddings))
pickle.dump(word_embeddings, open('word_embeddings_subset.pkl', 'wb'))

245


In [23]:
word_embeddings = pickle.load(open('word_embeddings_subset.pkl', 'rb'))
len(word_embeddings)

245

In [24]:
print('Dim:', word_embeddings['Spain'].shape[0])

Dim: 300


The cosine similarity function is:

$$\cos (\theta)=\frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\|\|\mathbf{B}\|}=\frac{\sum_{i=1}^{n} A_{i} B_{i}}{\sqrt{\sum_{i=1}^{n} A_{i}^{2}} \sqrt{\sum_{i=1}^{n} B_{i}^{2}}}\tag{1}$$

In [25]:
def cosine_similarity(A, B):
    '''
    Input:
        A: a numpy array which corresponds to a word vector
        B: A numpy array which corresponds to a word vector
    Output:
        cos: numerical number representing the cosine similarity between 
          A and B.
    '''
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    cos = dot / (norma * normb)
    return cos

In [26]:
king = word_embeddings['king']
queen = word_embeddings['queen']
cosine_similarity(king, queen)

0.6510956

In [28]:
def euclidean(A, B):
    """
    Input:
        A: a numpy array which corresponds to a word vector
        B: A numpy array which corresponds to a word vector
    Output:
        d: numerical number representing the Euclidean distance between A and B.
    """
    d = np.sqrt(((A - B)**2).sum())
    return d

In [29]:
euclidean(king, queen)

2.4796925

In [None]:
def get_country(city1, country1, city2, embeddings):
    """
    Input:
        city1: a string (the capital city of country1)
        country1: a string (the country of capital1)
        city2: a string (the capital city of country2)
        embeddings: a dictionary where the keys are words and values are 
          their embeddings
    Output:
        countries: a dictionary with the most likely country and its 
          similarity score
    """
    group = set((city1, country1, city2))
    city1_emb = get_word_embeddings(city1)
    city2_emb = get_word_embeddings(city2)
    country1_emb = get_word_embeddings(country_1)
    # get embedding of country 2 (it's a combination of the embeddings of 
    # country 1, city 1 and city 2)
    # Remember: King - Man + Woman = Queen
    vec = cou

    # Initialize the similarity to -1 (it will be replaced by a similarities that are closer to +1)
    similarity = -1

    # initialize country to an empty string
    country = ''

    # loop through all words in the embeddings dictionary
    for word in embeddings.keys():

        # first check that the word is not already in the 'group'
        if word not in group:

            # get the word embedding
            word_emb = None

            # calculate cosine similarity between embedding of country 2 and the word in the embeddings dictionary
            cur_similarity = None

            # if the cosine similarity is more similar than the previously best similarity...
            if cur_similarity > similarity:

                # update the similarity to the new, better similarity
                similarity = None

                # store the country as a tuple, which contains the word and the similarity
                country = (None, None)

    ### END CODE HERE ###

    return country