# Visualizzazione dei word vector tramite GENSIM

In [2]:
import numpy as np

# Get the interactive Tools for Matplotlib
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from scipy.spatial import distance
from sklearn.decomposition import PCA


from gensim.models import KeyedVectors
import gensim.downloader as api

Utilizzeremo gensim per visualizzare i vettori di parole. Gensim non è un pacchetto per fare deep learning: è piuttosto una libreria per modellare la similarità tra parole e testi. È nata implementando il topic modelling ed è cresciuta aggiungendo la rappresentazione delle parole tramite SVD e word vector.
È efficiente, scalabile e usata praticamente da tutti.

Useremo i vettori a 100 dimensioni per avere un buon mix tra velocità/consumo di memoria vs. qualità.
Se usassimo vettori a 50 dimensioni avremmo certamente una maggiore velocità, ma le analogie trovate non sarebbero altrettanto buone.
Al contrario, con 300 dimensioni la qualità dei risultati sarebbe maggiore, ma la velocità ne risentirebbe.

In [3]:
# Load the pretrained 'glove-wiki-gigaword-100' word vectors through gensim's downloader API.
model = api.load('glove-wiki-gigaword-100')



In [4]:
# Show which class the downloader returned.
print(type(model))

<class 'gensim.models.keyedvectors.KeyedVectors'>


In [12]:
# Index the model with a word to get its vector, then print it.
bread = model['bread']
print(bread)

[-0.66146    0.94335   -0.72214    0.17403   -0.42524    0.36303
  1.0135    -0.14802    0.25817   -0.20326   -0.64338    0.16632
  0.61518    1.397     -0.094506   0.0041843 -0.18976   -0.55421
 -0.39371   -0.22501   -0.34643    0.32076    0.34395   -0.7034
  0.23932    0.69951   -0.16461   -0.31819   -0.34034   -0.44906
 -0.069667   0.35348    0.17498   -0.95057   -0.2209     1.0647
  0.23231    0.32569    0.47662   -1.1206     0.28168   -0.75172
 -0.54654   -0.66337    0.34804   -0.69058   -0.77092   -0.40167
 -0.069351  -0.049238  -0.39351    0.16735   -0.14512    1.0083
 -1.0608    -0.87314   -0.29339    0.68278    0.61634   -0.088844
  0.88094    0.099809  -0.27161   -0.58026    0.50364   -0.93814
  0.67576   -0.43124   -0.10517   -1.2404    -0.74353    0.28637
  0.29012    0.89377    0.67406    0.86422   -0.30693   -0.14718
  0.078353   0.74013    0.32658   -0.052579  -1.1665     0.87079
 -0.69402   -0.75977   -0.37164   -0.11887    0.18551    0.041883
  0.59352    0.30519   -0.

In [13]:
# Vector for a second word, compared with `bread` in the distance cell.
croissant = model['croissant']
print(croissant)

[-0.25144    0.52157   -0.75452    0.28039   -0.31388    0.274
  1.1971    -0.10519    0.82544   -0.33398   -0.21417    0.22216
  0.14982    0.47384    0.41984    0.69397   -0.25999   -0.44414
  0.58296   -0.30851   -0.076455   0.33468    0.28055   -0.99012
  0.30349    0.39128    0.031526  -0.095395  -0.004745  -0.81347
  0.27869   -0.1812     0.14632   -0.42186    0.13857    1.139
  0.14925   -0.051459   0.37875   -0.2613     0.011081  -0.28881
 -0.38662   -0.3135    -0.1954     0.19248   -0.52995   -0.40674
 -0.25159    0.06272   -0.32724    0.28374   -0.2155    -0.061832
 -0.50134    0.0093959  0.30715    0.3873    -0.74554   -0.45947
  0.40032   -0.1378    -0.26968   -0.3946    -0.64876   -0.47149
 -0.085536   0.092795  -0.034018  -0.61906    0.19123    0.20563
  0.29056   -0.010908   0.15313    0.33144    0.33806    0.061708
  0.20785    0.65348   -0.053222   0.18589    0.32647   -0.11923
  0.42008   -0.26931    0.025489   0.0036535  0.1327    -0.22763
  0.07564    0.55773    0.2

In [15]:
# Cosine distance (1 - cosine similarity) between the two word vectors.
# `distance` comes from the notebook's import cell rather than a mid-notebook import.
distance.cosine(bread, croissant)

0.5135942995548248

In [5]:
# Words ranked most similar to "obama", each paired with its similarity score.
model.most_similar('obama')

[('barack', 0.937216579914093),
 ('bush', 0.927285373210907),
 ('clinton', 0.8960004448890686),
 ('mccain', 0.8875634074211121),
 ('gore', 0.8000321984291077),
 ('hillary', 0.7933662533760071),
 ('dole', 0.7851964831352234),
 ('rodham', 0.7518897652626038),
 ('romney', 0.7488930821418762),
 ('kerry', 0.7472624778747559)]

In [6]:
# Same query for a common noun.
model.most_similar('banana')

[('coconut', 0.7097253799438477),
 ('mango', 0.705482542514801),
 ('bananas', 0.6887733936309814),
 ('potato', 0.6629636287689209),
 ('pineapple', 0.6534532308578491),
 ('fruit', 0.6519855260848999),
 ('peanut', 0.6420576572418213),
 ('pecan', 0.6349173188209534),
 ('cashew', 0.6294420957565308),
 ('papaya', 0.6246591210365295)]

In [7]:
# Rank words by similarity to the negated "banana" vector. The word is passed
# as a list, like the other calls in this notebook, so it is treated as one key
# even on gensim releases that do not wrap a bare `negative` string.
model.most_similar(negative=['banana'])

[('shunichi', 0.49618101119995117),
 ('ieronymos', 0.4736502170562744),
 ('pengrowth', 0.4668096899986267),
 ('höss', 0.4636845588684082),
 ('damaskinos', 0.4617849290370941),
 ('yadin', 0.4617374837398529),
 ('hundertwasser', 0.4588957130908966),
 ('ncpa', 0.4577339291572571),
 ('maccormac', 0.4566109776496887),
 ('rothfeld', 0.4523947238922119)]

In [8]:
# king - man + woman: print the top-ranked word with its score to 4 decimals.
result = model.most_similar(positive=['woman', 'king'], negative=['man'])
best_word, best_score = result[0]
print(f"{best_word}: {best_score:.4f}")

queen: 0.7699


![composition](./images/composition.png)

In [16]:
def analogy(x1, x2, y1, vectors=None):
    """Complete the analogy "x1 is to x2 as y1 is to ?".

    The query adds ``y1`` and ``x2`` and subtracts ``x1``, then returns the
    word ranked first by ``most_similar``.

    Args:
        x1: First word of the reference pair (subtracted).
        x2: Second word of the reference pair (added).
        y1: Word for which the counterpart is sought (added).
        vectors: Optional object exposing a gensim-style
            ``most_similar(positive=..., negative=...)`` method. When omitted,
            the notebook-level ``model`` is used, as before.

    Returns:
        The word (first element) of the top-ranked result.
    """
    if vectors is None:
        # Backward-compatible default: fall back to the notebook's global model.
        vectors = model
    result = vectors.most_similar(positive=[y1, x2], negative=[x1])
    return result[0][0]

In [17]:
# man : king :: woman : ?
analogy('man', 'king', 'woman')

'queen'

In [18]:
# king : man :: queen : ?
analogy('king', 'man', 'queen')

'woman'

In [19]:
# japan : japanese :: australia : ?
analogy('japan', 'japanese', 'australia')

'australian'

In [20]:
# australia : beer :: france : ?
analogy('australia', 'beer', 'france')

'champagne'

In [21]:
# obama : clinton :: reagan : ?
analogy('obama', 'clinton', 'reagan')

'nixon'

In [22]:
# tall : tallest :: long : ?
analogy('tall', 'tallest', 'long')

'longest'

In [23]:
# good : fantastic :: bad : ?
analogy('good', 'fantastic', 'bad')

'terrible'

In [46]:
# My own example: good : husband :: bad : ?
analogy('good', 'husband', 'bad')

'wife'

In [56]:
# "star" can be either an actor or a celestial body (polysemy)
model.most_similar('star')

[('stars', 0.8661765456199646),
 ('superstar', 0.728345513343811),
 ('movie', 0.6531305313110352),
 ('legend', 0.6483873128890991),
 ('actor', 0.6472946405410767),
 ('player', 0.6397407650947571),
 ('best', 0.6355776190757751),
 ('hollywood', 0.6347972750663757),
 ('fame', 0.6246458292007446),
 ('named', 0.6231004595756531)]

In [57]:
# Nearest neighbours of "bank".
model.most_similar('bank')

[('banks', 0.8057132959365845),
 ('banking', 0.7530706524848938),
 ('credit', 0.7037603259086609),
 ('investment', 0.6939943432807922),
 ('financial', 0.6777414083480835),
 ('securities', 0.668834388256073),
 ('lending', 0.664503276348114),
 ('funds', 0.6484885811805725),
 ('ubs', 0.6483405232429504),
 ('finance', 0.6462422609329224)]

In [59]:
# Add "river" to "bank" and subtract "credit" before ranking the neighbours.
model.most_similar(positive=['bank', 'river'], negative=['credit'])

[('rivers', 0.6518788933753967),
 ('danube', 0.6357226967811584),
 ('tigris', 0.6252236366271973),
 ('lake', 0.621152937412262),
 ('estuary', 0.6199785470962524),
 ('rhine', 0.6166054606437683),
 ('canal', 0.6140355467796326),
 ('valley', 0.6131061911582947),
 ('creek', 0.6076534986495972),
 ('euphrates', 0.6059175729751587)]

In [24]:
# Ask the model which of the four words does not belong with the others.
print(model.doesnt_match("breakfast cereal dinner lunch".split()))

cereal


In [50]:
def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors projected onto their first two principal components.

    Args:
        model: Object that maps a word to its vector via ``model[word]`` and
            exposes its vocabulary as ``model.key_to_index``.
        words: Words to plot. When ``None``, the words are taken from the
            model's vocabulary (see ``sample``).
        sample: Only used when ``words`` is ``None``. If greater than 0, that
            many words are drawn with ``np.random.choice`` from the
            vocabulary; otherwise the whole vocabulary is plotted.
    """
    # `is None` instead of `== None`: with a NumPy array of words, `==` is an
    # elementwise comparison and cannot be used as an `if` condition.
    if words is None:
        vocabulary = list(model.key_to_index)
        if sample > 0:
            words = np.random.choice(vocabulary, sample)
        else:
            # `model.vocab` was removed in gensim 4; use the same
            # `key_to_index` vocabulary as the sampling branch.
            words = vocabulary

    word_vectors = np.array([model[w] for w in words])

    # Fit a full PCA and keep only the first two components for the 2-D plot.
    twodim = PCA().fit_transform(word_vectors)[:, :2]

    plt.figure(figsize=(6, 6))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # Small offset so the label does not sit on top of its marker.
        plt.text(x + 0.05, y + 0.05, word)

In [51]:
# Plot hand-picked words from a few semantic groups (drinks, food, animals,
# countries, school). Each word is listed once: a repeated word would be
# plotted twice at the same spot and counted twice when fitting the PCA.
display_pca_scatterplot(model, 
                        ['coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
                         'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs',
                         'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
                         'frog', 'toad', 'ape', 'kangaroo', 'wombat', 'wolf',
                         'france', 'germany', 'hungary', 'luxembourg', 'australia', 'fiji', 'china',
                         'homework', 'assignment', 'problem', 'exam', 'test', 'class',
                         'school', 'college', 'university', 'institute'])

<IPython.core.display.Javascript object>

In [52]:
# Seed NumPy's global generator so the 300 sampled words, and therefore the
# figure, are the same on every run.
np.random.seed(42)
display_pca_scatterplot(model, sample=300)

<IPython.core.display.Javascript object>