In [14]:
import os
import itertools
import numpy as np

from nltk.cluster.util import cosine_distance

# Using GloVe Embeddings

Pre-trained vectors: Wikipedia 2014 + Gigaword 5
https://nlp.stanford.edu/projects/glove/

Word embeddings are substantially successful in capturing semantic relations among words.
The Euclidean distance (or cosine similarity) between two word vectors provides an effective method for measuring the linguistic or semantic similarity of the corresponding words.

## Prepare GloVe for word embedding

In [17]:
# Download and unpack the GloVe 6B archive in a way that is safe to re-run:
#   wget -nc  -> skip the download when glove.6B.zip is already present
#   unzip -n  -> never overwrite (or prompt about) files that were already extracted
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n glove*.zip

--2019-06-27 20:33:53--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-06-27 20:33:53--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2019-06-27 20:33:54--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-0

In [2]:
def extract_word_vector(vector_path):
    """Load word vectors from a whitespace-separated text file.

    Every non-blank line is read as a token followed by its coefficients,
    e.g. ``the 0.418 0.24968 -0.41242``. Blank or whitespace-only lines
    are skipped instead of raising ``IndexError``.

    Args:
        vector_path: Path of the UTF-8 encoded vector file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array.
        If a token appears on several lines, the last line wins.

    Raises:
        ValueError: If a coefficient cannot be converted to ``float32``.
    """
    word_embeddings = {}
    with open(vector_path, "r", encoding="utf-8") as vector_file:
        for line in vector_file:
            values = line.split()
            if not values:
                # Nothing to parse on an empty line (e.g. a stray trailing newline).
                continue
            word, coefficients = values[0], values[1:]
            word_embeddings[word] = np.asarray(coefficients, dtype="float32")
    return word_embeddings

In [3]:
# Load the 100-d and then the 300-d GloVe tables into memory, in that order.
word_embeddings_100d, word_embeddings_300d = map(
    extract_word_vector,
    ("./glove.6B.100d.txt", "./glove.6B.300d.txt"),
)

The GloVe word embeddings contain 400K different terms. 
GloVe provides vectors in 4 different dimensionalities - 50d, 100d, 200d & 300d.
The difference relates to how much information about the 400K terms is squeezed into each vector.
Vectors in a lower-dimensional space tend to hold more syntactic information.

Lower-dimensional vectors (e.g. 32 dimensions) may contain primarily syntactic information, possibly because the reduced dimensionality leaves less room to hold topical information. Vectors of 300 or 400 dimensions may hold much more topical information. As a very crude illustration: in a very high-dimensional space, basketball and hockey may be relatively far apart in cosine distance, whereas in a lower-dimensional space they may be closer together.


In [6]:
def get_sentence_vector(word_embeddings, sentence, we_dim):
    """Average the word vectors of a tokenised sentence.

    Tokens missing from ``word_embeddings`` contribute a zero vector but
    still count towards the sentence length, so unknown words pull the
    average towards the origin.

    Args:
        word_embeddings: Mapping from token to its embedding vector.
        sentence: Sequence of tokens; an empty sequence is allowed.
        we_dim: Dimensionality of the embedding vectors.

    Returns:
        A numpy array of shape ``(we_dim,)``: the arithmetic mean of the
        token vectors, or all zeros for an empty sentence.
    """
    zero_vector = np.zeros((we_dim, ))

    if not sentence:
        return zero_vector

    # Divide by the true token count. The empty case is handled above, so no
    # epsilon is needed in the denominator (it would only bias the mean).
    embedding_sum = sum(word_embeddings.get(word, zero_vector) for word in sentence)
    return embedding_sum / len(sentence)

In [9]:
def sentence_similarity(vector1, vector2):
    """Return the cosine similarity of two sentence vectors.

    Args:
        vector1: First 1-D vector.
        vector2: Second 1-D vector.

    Returns:
        ``1 - cosine_distance(vector1, vector2)``, or ``0`` when the score is
        undefined (either vector is all zeros, or the result is NaN).
    """
    # An all-zero vector (empty sentence, or no in-vocabulary tokens) has no
    # direction. Short-circuit instead of dividing 0 by 0, which would emit a
    # RuntimeWarning and produce NaN.
    if not np.any(vector1) or not np.any(vector2):
        return 0

    similarity_score = 1 - cosine_distance(vector1, vector2)

    # Still guard against NaN coming from NaN inputs or a vanishing norm.
    if np.isnan(similarity_score):
        similarity_score = 0

    return similarity_score

In [10]:
def build_similarity_matrix(sentences, sentence_vectors, we_dim):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: The sentences being compared; only their count is used.
        sentence_vectors: One numpy vector per sentence, indexed in the same
            order as ``sentences``.
        we_dim: Embedding dimensionality; each vector is reshaped to this
            before being compared.

    Returns:
        A ``(len(sentences), len(sentences))`` numpy array whose ``[i][j]``
        entry is ``sentence_similarity`` of sentences ``i`` and ``j``. The
        diagonal is left at 0.
    """
    sentence_count = len(sentences)

    # Start from zeros so the diagonal (self-similarity) stays 0.
    similarity_matrix = np.zeros((sentence_count, sentence_count))

    # Cosine similarity is symmetric, so score each unordered pair of
    # sentences once and mirror the result across the diagonal.
    for idx1, idx2 in itertools.combinations(range(sentence_count), 2):
        score = sentence_similarity(
            sentence_vectors[idx1].reshape(we_dim),
            sentence_vectors[idx2].reshape(we_dim),
        )
        similarity_matrix[idx1][idx2] = score
        similarity_matrix[idx2][idx1] = score

    return similarity_matrix

In [60]:
def get_glove_similarity_matrix(sample_sentences, word_embeddings, we_dim):
    """Embed every sentence and return their pairwise similarity matrix.

    Args:
        sample_sentences: Sequence of tokenised sentences (lists of words).
        word_embeddings: Mapping from token to embedding vector.
        we_dim: Dimensionality of the embedding vectors.

    Returns:
        The matrix produced by ``build_similarity_matrix`` for the averaged
        sentence vectors.
    """
    # One averaged vector per sentence, in the same order as the input.
    sentence_vectors = [
        get_sentence_vector(word_embeddings, tokens, we_dim)
        for tokens in sample_sentences
    ]

    return build_similarity_matrix(sample_sentences, sentence_vectors, we_dim)

In [108]:
def closest_sentence(sample_sentences, similarity_matrix):
    print(similarity_matrix)
    max_indx = np.argmax(similarity_matrix, axis=0)
    return {" ".join(sentence): " ".join(sample_sentences[max_indx[e]]) for e, sentence in enumerate(sample_sentences)}

# Sports VS Food

In [105]:
# Single-word "sentences": three sports followed by two foods, each split into a token list.
sample_sentences4 = [
    sentence.split(" ")
    for sentence in ["basketball", "taekwondo", "karate", "sushi", "burger"]
]

In [106]:
# Sports vs food in the 100-d space: pairwise similarities, then each word's nearest neighbour.
gmat_100d_4 = get_glove_similarity_matrix(
    sample_sentences4,
    word_embeddings_100d,
    we_dim=100,
)
closest_sentence(sample_sentences4, gmat_100d_4)

[[ 0.          0.44787012  0.44513391  0.03073995  0.04361972]
 [ 0.44787012  0.          0.69275586  0.1619114  -0.07873317]
 [ 0.44513391  0.69275586  0.          0.33181058  0.1346807 ]
 [ 0.03073995  0.1619114   0.33181058  0.          0.48382605]
 [ 0.04361972 -0.07873317  0.1346807   0.48382605  0.        ]]


{'basketball': 'taekwondo',
 'taekwondo': 'karate',
 'karate': 'taekwondo',
 'sushi': 'burger',
 'burger': 'sushi'}

In [107]:
# Same sports-vs-food comparison, now in the 300-d space.
gmat_300d_4 = get_glove_similarity_matrix(
    sample_sentences4,
    word_embeddings_300d,
    we_dim=300,
)
closest_sentence(sample_sentences4, gmat_300d_4)

[[ 0.          0.35349359  0.30225721 -0.04833672  0.03087713]
 [ 0.35349359  0.          0.61689786  0.05764558 -0.11130884]
 [ 0.30225721  0.61689786  0.          0.22528532  0.0337832 ]
 [-0.04833672  0.05764558  0.22528532  0.          0.39702389]
 [ 0.03087713 -0.11130884  0.0337832   0.39702389  0.        ]]


{'basketball': 'taekwondo',
 'taekwondo': 'karate',
 'karate': 'taekwondo',
 'sushi': 'burger',
 'burger': 'sushi'}

## How close are these single words as concepts?
Looking at the similarity matrices of both dimensional spaces, each word's closest match scores between roughly 0.35 and 0.7. 

`taekwondo` is the closest term to `basketball` (0.45) as they both are considered sports.

However `taekwondo` is definitely closer to `karate` (0.69) because both of them are types of martial arts.

# Different sports

In [102]:
# Four single-word "sentences", all of them sports, each split into a token list.
sample_sentences3 = [
    sentence.split(" ")
    for sentence in ["soccer", "tennis", "rugby", "badminton"]
]

In [103]:
# Sports-only comparison in the 100-d space.
gmat_100d_3 = get_glove_similarity_matrix(
    sample_sentences3,
    word_embeddings_100d,
    we_dim=100,
)
closest_sentence(sample_sentences3, gmat_100d_3)

[[0.         0.703636   0.7242489  0.57408756]
 [0.703636   0.         0.56346737 0.77421505]
 [0.7242489  0.56346737 0.         0.53717555]
 [0.57408756 0.77421505 0.53717555 0.        ]]


{'soccer': 'rugby',
 'tennis': 'badminton',
 'rugby': 'soccer',
 'badminton': 'tennis'}

In [109]:
# Sports-only comparison in the 300-d space.
gmat_300d_3 = get_glove_similarity_matrix(
    sample_sentences3,
    word_embeddings_300d,
    we_dim=300,
)
closest_sentence(sample_sentences3, gmat_300d_3)

[[0.         0.5584686  0.53458385 0.43728225]
 [0.5584686  0.         0.44168142 0.65907606]
 [0.53458385 0.44168142 0.         0.39214783]
 [0.43728225 0.65907606 0.39214783 0.        ]]


{'soccer': 'tennis',
 'tennis': 'badminton',
 'rugby': 'soccer',
 'badminton': 'tennis'}

In [None]:
## Similarity (Dissimilarity) of sports in different dimensional spaces?
All 4 terms share the same general concept of `sport`. But there are so many different kinds of sports as we know - martial arts, field ball kicking, racket hitting etc.

In both spaces, `tennis` and `badminton` (racket hitting sports) are each other's closest term, and `rugby` is closest to `soccer` (field ball kicking sports). 
One interesting difference between the two spaces: in the lower-dimensional (100d) space `soccer` is closest to `rugby` (0.72), whereas in the 300d space it is marginally closer to `tennis` (0.56 vs 0.53).

In [63]:
# Three-word "sentences": two about food, two about entertainment, each split into tokens.
sample_sentences = [
    sentence.split(" ")
    for sentence in [
        "greek dish healthy",
        "malaysian food tasty",
        "movie comedy fun",
        "reading fictional entertaining",
    ]
]

In [64]:
# Food vs entertainment sentences in the 100-d space.
# closest_sentence() already prints the matrix, so it is not printed separately
# here (a fresh top-to-bottom run would otherwise show it twice).
gmat_100d_1 = get_glove_similarity_matrix(sample_sentences, word_embeddings_100d, 100)
closest_sentence(sample_sentences, gmat_100d_1)

[[0.         0.64845293 0.41033192 0.43772947]
 [0.64845293 0.         0.41957035 0.33179244]
 [0.41033192 0.41957035 0.         0.71958436]
 [0.43772947 0.33179244 0.71958436 0.        ]]


{'greek dish healthy': 'malaysian food tasty',
 'malaysian food tasty': 'greek dish healthy',
 'movie comedy fun': 'reading fictional entertaining',
 'reading fictional entertaining': 'movie comedy fun'}

In [65]:
# Food vs entertainment sentences in the 300-d space.
# closest_sentence() already prints the matrix, so it is not printed separately
# here (a fresh top-to-bottom run would otherwise show it twice).
gmat_300d_1 = get_glove_similarity_matrix(sample_sentences, word_embeddings_300d, 300)
closest_sentence(sample_sentences, gmat_300d_1)

[[0.         0.50597631 0.23174479 0.25190822]
 [0.50597631 0.         0.24630075 0.18584596]
 [0.23174479 0.24630075 0.         0.5795299 ]
 [0.25190822 0.18584596 0.5795299  0.        ]]


{'greek dish healthy': 'malaysian food tasty',
 'malaysian food tasty': 'greek dish healthy',
 'movie comedy fun': 'reading fictional entertaining',
 'reading fictional entertaining': 'movie comedy fun'}

In [75]:
# Three-word "sentences": two describing a home, two describing people, each split into tokens.
sample_sentences2 = [
    sentence.split(" ")
    for sentence in [
        "clean beautiful flat",
        "tidy comfortable apartment",
        "host responsive helpful",
        "guest great friendly",
    ]
]

In [76]:
# Home vs people sentences in the 100-d space.
# closest_sentence() already prints the matrix, so it is not printed separately
# here (a fresh top-to-bottom run would otherwise show it twice).
gmat_100d = get_glove_similarity_matrix(sample_sentences2, word_embeddings_100d, 100)
closest_sentence(sample_sentences2, gmat_100d)

[[0.         0.67390946 0.43533047 0.59942636]
 [0.67390946 0.         0.31802589 0.48821536]
 [0.43533047 0.31802589 0.         0.61105895]
 [0.59942636 0.48821536 0.61105895 0.        ]]


{'clean beautiful flat': 'tidy comfortable apartment',
 'tidy comfortable apartment': 'clean beautiful flat',
 'host responsive helpful': 'guest great friendly',
 'guest great friendly': 'host responsive helpful'}

In [77]:
# Home vs people sentences in the 300-d space.
# closest_sentence() already prints the matrix, so it is not printed separately
# here (a fresh top-to-bottom run would otherwise show it twice).
gmat_300d = get_glove_similarity_matrix(sample_sentences2, word_embeddings_300d, 300)
closest_sentence(sample_sentences2, gmat_300d)

[[0.         0.5292614  0.25007188 0.41155782]
 [0.5292614  0.         0.14884599 0.2918403 ]
 [0.25007188 0.14884599 0.         0.42941805]
 [0.41155782 0.2918403  0.42941805 0.        ]]


{'clean beautiful flat': 'tidy comfortable apartment',
 'tidy comfortable apartment': 'clean beautiful flat',
 'host responsive helpful': 'guest great friendly',
 'guest great friendly': 'host responsive helpful'}