In [None]:
# plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
# store plot outputs within the notebook document
%matplotlib inline

# efficient numerical computation library
import numpy as np

# word and text similarity modeling library
import gensim.downloader
from gensim.models import KeyedVectors

# GloVe: Global Vectors for Word Representation

GloVe was introduced in a 2014 research paper by Stanford researchers (Pennington, Socher and Manning).

* an unsupervised learning algorithm for obtaining vector representations of words
* trained on aggregated global word-word co-occurrence statistics from a corpus
* typically uses 50-300 dimensions to represent words and word meanings

In [None]:
# Load pretrained GloVe vectors by name through gensim's downloader.
# we are using the 100 dimensional vectors, which still give good results
# NOTE(review): presumably this downloads the vectors on first use — confirm
# download size / caching before running offline
model = gensim.downloader.load('glove-wiki-gigaword-100')
# show what kind of object the downloader returned
print(type(model))

In [None]:
# number of words in the vocabulary
# gensim >= 4: the old `.wv.vocab` was removed; `key_to_index` is the
# word -> row-index mapping that replaces it
len(model.key_to_index)

In [None]:
# peek at ten vocabulary entries starting at index 20000
# gensim >= 4: `index2word` was renamed to `index_to_key`
model.index_to_key[20000:20010]

In [None]:
model['bread'].shape

In [None]:
model['bread']

# Similarity

In [None]:
model.most_similar('bread')

In [None]:
model.most_similar('haircut')

In [None]:
model.similarity('bread','butter')

In [None]:
model.similarity('bread','headache')

In [None]:
model.similarity('river','ship')

In [None]:
model.similarity('river','banana')

# Arithmetic with words

* word vectors showcase interesting linear substructures of the word vector space
* these linear structures reflect similarity and relatedness between words

In [None]:
# Analogy via vector arithmetic: madrid - spain + france = ?
# start with a capital city: + madrid
# subtract the country component: - spain
# add another country component: + france
# result: the nearest words should include the capital of the new country

res = model.most_similar(positive=['madrid','france'], negative=['spain'])
res

In [None]:
def analogy_isto(X1, istoX2, Y1, topn=2):
  """Solve the analogy "X1 is to istoX2 as Y1 is to ?" with vector arithmetic.

  Queries the notebook-level `model` for the words closest to
  istoX2 - X1 + Y1.

  Args:
    X1: word whose component is subtracted (e.g. 'man').
    istoX2: word related to X1 (e.g. 'king').
    Y1: word whose counterpart is wanted (e.g. 'woman').
    topn: how many candidates to return; defaults to 2.

  Returns:
    Whatever `model.most_similar` returns for the top `topn` candidates
    (with the gensim vectors loaded in this notebook: (word, similarity)
    pairs, best match first).
  """
  # request only as many neighbours as we return, instead of fetching the
  # default ten and slicing the rest away
  return model.most_similar(positive=[istoX2, Y1], negative=[X1], topn=topn)
analogy_isto('man','king','woman')

In [None]:
analogy_isto('germany','beer','france') # start with drink, subtract/add country

In [None]:
#analogy_isto('strong','stronger','weak')
analogy_isto('sister','aunt','brother') # start with relative (aunt), subtract/add sibling gender

In [None]:
# display() each result explicitly: a notebook cell only echoes its last
# expression, so the first analogy would otherwise be computed and discarded
display(analogy_isto('brush','painting','camera')) # start with activity, subtract/add tool
display(analogy_isto('university','professor','school')) # start with role, subtract/add institution

In [None]:
#analogy_isto('1','2','3')
#analogy_isto('love','red','hate')
analogy_isto('usa','obama','russia')

In [None]:
analogy_isto('100','200','300')
#analogy_isto('computer','mouse','screen')
#analogy_isto('running','shoe','swimming')
#analogy_isto('running','shoe','tennis')
#analogy_isto('beer','wine','rum')
#analogy_isto('usa','obama','hungary')

In [None]:
# knife + death
model.most_similar(positive=['knife','death'])
#model.most_similar(positive=['hollywood','leonardo'])

In [None]:
# horse - farm
model.most_similar(positive=['horse'], negative=['farm'])

In [None]:
# love - red, house - building
# display() each result explicitly: only a cell's last expression is echoed,
# so the first query's output would otherwise be silently dropped
display(model.most_similar(positive=['love'], negative=['red']))
display(model.most_similar(positive=['house'], negative=['building']))

Can you find other cool/interesting relationships in the data?

# Plotting into 2D - finding clusters

In [None]:
# scikit-learn: machine learning library, used here for dimensionality reduction
# PCA is a linear dimensionality reduction that projects data to a lower-dimensional space
from sklearn.decomposition import PCA

def display_pca_scatterplot(model, words, figsize=(6, 6)):
  """Project the vectors of `words` onto two principal components and plot them.

  Args:
    model: object where `model[w]` returns the vector for word `w`.
    words: sequence of words to plot; each one is looked up with `model[w]`,
      so an unknown word surfaces whatever error that lookup raises.
    figsize: (width, height) of the figure in inches.
  """
  word_vectors = np.array([model[w] for w in words])

  # ask PCA for exactly the two components we plot instead of fitting every
  # component and slicing the rest away; random_state keeps the projection
  # reproducible should scikit-learn choose a randomized solver
  two_dim_vectors = PCA(n_components=2, random_state=0).fit_transform(word_vectors)

  fig, ax = plt.subplots(figsize=figsize)
  ax.scatter(two_dim_vectors[:, 0], two_dim_vectors[:, 1])
  for w, (x, y) in zip(words, two_dim_vectors):
    # small offset so the label does not sit on top of its marker
    ax.annotate(w, (x, y), xytext=(x + 0.05, y + 0.05))

  # a figure should stand on its own when the notebook is skimmed
  ax.set_xlabel('principal component 1')
  ax.set_ylabel('principal component 2')
  ax.set_title('Word vectors projected to 2D with PCA')

In [None]:
# plot three loose groups of words to see whether they land in separate regions
display_pca_scatterplot(
  model,
  [
    # countries
    'hungary', 'spain', 'romania', 'iceland',
    # animals
    'bear', 'snake', 'dolphin',
    # technology
    'computer', 'database', 'video',
  ],
)

Can you find sets of words that map in different regions, based on meanings?

# But why do these work?

* This is not obvious: nothing in the optimization objective explicitly asks for these linear substructures
* There are some theories, but there is no definitive explanation yet :)

# Key learnings


* A surprising result - word meaning can be represented well by large vectors of numbers
* These vectors can be calculated using a "simple" task of calculating and updating distributional similarities



# References

- [Stanford Online: NLP with Deep Learning](https://www.youtube.com/watch?v=rmVRLeJRkl4&list=PLoROMvodv4rOSH4v6133s9LFPRHjEmbmJ&index=1)
- [Gensim - python library for easy use of word vectors and topic modeling](https://radimrehurek.com/gensim/auto_examples/index.html#documentation)