# Word2vec tutorial


In [None]:
!pip install gensim
import gensim
import gensim.downloader


In [None]:
#get the interactive tools for matplotlib
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import plotly.graph_objects as go

In [None]:
from sklearn.decomposition import PCA

import numpy as np
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

Gensim isn't really a deep learning package. It's a package for word and text similarity modeling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But it's efficient and scalable, and quite widely used.

## Load model

Word2Vec is a popular algorithm for learning word embeddings, which are dense vector representations of words in a continuous space. These embeddings capture semantic relationships between words and enable various natural language processing tasks. "word2vec-google-news-300" specifically refers to a pre-trained Word2Vec model developed by Google, trained on a vast corpus of news articles.

Here are some key characteristics and points about the "word2vec-google-news-300" model:

1- 300-Dimensional Vectors: The model generates word embeddings in a 300-dimensional vector space. A word's meaning and context are encoded jointly across these dimensions; individual dimensions are generally not interpretable on their own.

2- Semantic Similarity: The model's embeddings are known for capturing semantic relationships between words. Words that are similar in meaning tend to have vectors that are close together in the vector space.

3- Pre-trained on News Corpus: This model is trained on a massive collection of news articles, which means it is particularly good at capturing language related to current events, news topics, and general vocabulary.

4- Generalization: The model's embeddings can be used as features for various NLP tasks, such as sentiment analysis, text classification, and named entity recognition. Its ability to generalize from news articles can make it useful for a wide range of applications.

5- Out-of-Vocabulary Words: While the model can provide embeddings for a vast vocabulary, it might not have embeddings for very rare or specialized words, which could limit its utility in certain domains.

6- Word Analogies: Word2Vec embeddings often exhibit interesting properties, such as the ability to perform word analogies like "king - man + woman = queen." This is due to the vector space capturing relationships between words.

7- Large-Scale Training Data: The quality of the embeddings is partly attributed to the vast amount of news data used for training. This helps the model learn rich and nuanced representations of words.

8- Resource Intensive: The vector embeddings are real-valued numbers, resulting in a relatively high memory requirement to store the model and perform computations. However, these vectors can be loaded into memory to efficiently perform similarity calculations.

9- Pre-trained Model Availability: The "word2vec-google-news-300" model is publicly available and can be downloaded from various sources. It can be used as a starting point for various NLP projects, saving time and computational resources that would be required for training from scratch.



In [None]:
# Load the pre-trained Google News word2vec vectors through gensim's downloader.
PRETRAINED_NAME = 'word2vec-google-news-300'
model = gensim.downloader.load(PRETRAINED_NAME)

## Embedding Vector

In [None]:
# Look up the embedding vector stored for the word "car" (the notebook displays it).
model.get_vector("car")

In [None]:
# Report how many components a single embedding has, using "car" as the probe word.
car_vector = model.get_vector("car")
print("Embedding vector size is {}".format(car_vector.size))

## Most similar to

In [None]:
# Query the entries the model ranks as most similar to 'obama'.
model.most_similar('obama')

In [None]:
# Query the entries the model ranks as most similar to 'banana'.
model.most_similar('banana')

## Vector similarity

In [None]:
# Query with 'woman' and 'king' as positive terms and 'man' as the negative
# term, then print the top hit as "word:score".
result = model.most_similar(negative=['man'], positive=['woman', 'king'])
top_word, top_score = result[0]
print("{}:{:.4f}".format(top_word, top_score))

## Analogy

In [None]:
def analogy(x1, x2, y1):
    """Complete the analogy "x1 is to x2 as y1 is to ?".

    Queries the notebook-level ``model`` with ``y1`` and ``x2`` as positive
    terms and ``x1`` as the negative term, and returns the word of the
    top-ranked entry (its score is discarded).
    """
    candidates = model.most_similar(negative=[x1], positive=[y1, x2])
    best_word, _score = candidates[0]
    return best_word

In [None]:
# france : paris :: spain : ?
analogy('france','paris','spain')

In [None]:
# japan : japanese :: germany : ?
analogy('japan','japanese','germany')

In [None]:
# japan : japanese :: sweden : ?
analogy('japan','japanese','sweden')

In [None]:
# japan : japanese :: canada : ?
analogy('japan','japanese','canada')

In [None]:
# ireland : bread :: france : ?
analogy('ireland','bread','france')

In [None]:
# tall : tallest :: long : ?
analogy('tall','tallest','long')

In [None]:
# good : fantastic :: bad : ?
analogy('good','fantastic','bad')

In [None]:
# good : best :: bad : ?
analogy('good','best','bad')

## Find the intruder

In [None]:
# Ask the model which of these four words does not fit with the others.
meal_words = "breakfast cereal dinner lunch".split()
print(model.doesnt_match(meal_words))

In [None]:
# Ask the model which of these four words does not fit with the others.
car_words = "car wheel driver bread".split()
print(model.doesnt_match(car_words))

## Plot

In [None]:
def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors projected onto their first two PCA components.

    Args:
        model: Word-vector container supporting ``model[word]`` lookup. When
            ``words`` is ``None`` its vocabulary is read from ``index_to_key``
            (gensim >= 4) or, failing that, from the legacy ``vocab`` mapping.
        words: Iterable of words to plot. ``None`` means "take the words from
            the model's vocabulary" (see ``sample``).
        sample: Only used when ``words`` is ``None``. If greater than zero,
            that many words are drawn with ``np.random.choice``; otherwise the
            whole vocabulary is plotted.

    The figure is drawn with matplotlib; nothing is returned.
    """
    if words is None:
        # gensim 4 removed ``KeyedVectors.vocab``; prefer the new attribute and
        # fall back to the old one so older installations keep working.
        try:
            vocabulary = list(model.index_to_key)
        except AttributeError:
            vocabulary = list(model.vocab.keys())

        if sample > 0:
            words = np.random.choice(vocabulary, sample)
        else:
            words = vocabulary

    word_vectors = np.array([model[w] for w in words])
    # Fit a full PCA and keep only the two leading components for plotting.
    twodim = PCA().fit_transform(word_vectors)[:, :2]

    plt.figure(figsize=(6, 6))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # Small offset so the label does not sit on top of its marker.
        plt.text(x + 0.05, y + 0.05, word)

In [None]:
%matplotlib inline
# Words to project, kept in their original order. 'monkey' and 'water' each
# appear twice in this list.
words = [
    'coffee', 'wednesday', 'monday', 'friday', 'sunday', 'tuesday',
    'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
    'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi',
    'meatballs',
    'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
    'italy', 'uniform', 'spain',
    'frog', 'toad', 'monkey', 'ape', 'kangaroo', 'wombat', 'wolf',
    'france', 'germany', 'donkey', 'australia', 'water',
    'homework', 'assignment', 'problem', 'exam', 'test', 'class',
    'school', 'college', 'university', 'institute',
    'six', 'two', 'three', 'four', 'five',
]

# Draw the 2-D PCA scatter plot for the selected words.
display_pca_scatterplot(model, words)

In [None]:
# Interactive 3-D PCA view of the embeddings for `words`.
#
# `words` is expected to be defined by an earlier cell. If it is None, fall
# back to the model's whole vocabulary. The previous fallback referenced an
# undefined `sample` variable and the `model.vocab` attribute that gensim 4
# removed, so it could only raise.
if words is None:
    try:
        words = list(model.index_to_key)
    except AttributeError:
        # Older gensim releases expose the vocabulary as a `vocab` mapping.
        words = list(model.vocab.keys())

word_vectors = np.array([model[w] for w in words])
embeddings_3d = PCA(n_components=3).fit_transform(word_vectors)

# One marker per word; hovering over a marker shows the word itself.
fig = go.Figure(data=[go.Scatter3d(
    x=embeddings_3d[:, 0],
    y=embeddings_3d[:, 1],
    z=embeddings_3d[:, 2],
    mode='markers',
    marker=dict(
        size=3,
        opacity=0.5
    ),
    text=words,
    hoverinfo='text'
)])

# Set plot title and axis labels
fig.update_layout(
    title='Word2Vec Word Embeddings (3D)',
    scene=dict(
        xaxis_title='Dimension 1',
        yaxis_title='Dimension 2',
        zaxis_title='Dimension 3'
    )
)

# Fixed canvas size and background colour.
fig.update_layout(
    autosize=False,
    width=1000,
    height=1000,
    paper_bgcolor="LightSteelBlue",
)


# Show the plot
fig.show()