# Modified from Text Analytics with Python

# Gensim is a super fast library to create feature vectors (embeddings) from text  -- https://radimrehurek.com/gensim/index.html

# First, let's load a pretrained deep learning model: Word2Vec

In [1]:
import gensim.downloader as api
word2vecA = api.load('word2vec-google-news-300')

### The model aims at assigning nearby feature vectors to similar words, allowing linear operations such as:  

In [2]:
# W1 and W2 should be embeddings near each other: W1 ~= W2
W1 = word2vecA["king"] - word2vecA["man"] + word2vecA["woman"]
W2 = word2vecA["queen"]
# You can find that by 
print(word2vecA.most_similar(positive=["king", "woman"], negative=["man"], topn=1))
# Or you can also verify that by searching the two closest words to W1
print(word2vecA.most_similar(positive=[W1], topn=2))

[('queen', 0.7118193507194519)]
[('king', 0.8449392318725586), ('queen', 0.7300518155097961)]


# Second, let's import required packages and use a corpus to train our own model

In [3]:
import re
import nltk
import numpy as np
from   nltk.corpus import gutenberg, stopwords
from   string      import punctuation
import pandas as pd
import matplotlib.pyplot as plt

pd.options.display.max_colwidth = 200
%matplotlib inline

wpt        = nltk.WordPunctTokenizer()
stop_words = stopwords.words('english') # define stopwords

# Create a standard function to normalize documents. It will be used to normalize your own corpus later

def normalize_document(doc): 
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

bible = gutenberg.sents('bible-kjv.txt') 
remove_terms = punctuation + '0123456789'

# convert to lower case and remove punctuation and digits
norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
# apply your standard function for document normalization, and then 
# eliminate sentences with less than two tokens
norm_bible = filter(None, normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

print('Total lines:', len(bible))
print('\nSample line:', bible[10])
print('\nProcessed line:', norm_bible[10])


Total lines: 30103

Sample line: ['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']

Processed line: god said let firmament midst waters let divide waters waters


# Train your Word2Vec model on the selected corpus

In [4]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# tokenize sentences in corpus
tokenized_corpus = [wpt.tokenize(document) for document in norm_bible]

# Set values for various parameters
feature_size   = 300  # embedding dimensionality  
window_context = 30   # context window size: 15 words around the central word                                                                                    
min_word_count = 3    # minimum word count: consider the word when it appears at least 3 times in the corpus      
num_cores      = 12   # number of processors in your machine to speed up process if you have cython 

word2vecB      = Word2Vec(sentences=tokenized_corpus, vector_size=feature_size, window=window_context, \
                          min_count=min_word_count, epochs=50, workers=num_cores)


In [5]:
# create a dictionary with some similar words based on gensim's model
similar_words = {search_term: [item[0] for item in word2vecB.wv.most_similar([search_term], topn=5)]
                for search_term in ['god', 'jesus', 'noah', 'egypt', 'john', 'gospel', 'moses','famine']}
similar_words

{'god': ['lord', 'sworn', 'rebellion', 'covenant', 'glory'],
 'jesus': ['peter', 'john', 'impotent', 'apostles', 'repentance'],
 'noah': ['shem', 'japheth', 'ham', 'methuselah', 'milcah'],
 'egypt': ['pharaoh', 'egyptians', 'bondage', 'flowing', 'rid'],
 'john': ['james', 'baptist', 'peter', 'galilee', 'baptism'],
 'gospel': ['christ', 'preach', 'faith', 'hope', 'godly'],
 'moses': ['aaron', 'congregation', 'joshua', 'sinai', 'gerizim'],
 'famine': ['pestilence', 'blasting', 'sojourn', 'mildew', 'noisome']}

## The same operations now produce different results, due to the change of context

In [6]:
# W1 and W2 should be embeddings near each other: W1 ~= W2
W1 = word2vecB.wv["king"] - word2vecB.wv["man"] + word2vecB.wv["woman"]
W2 = word2vecB.wv["queen"]
# You can find that by 
print(word2vecB.wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1))
# Or you can also verify that by searching the two closest words to W1
print(word2vecB.wv.most_similar(positive=[W1], topn=2))

[('wife', 0.30001696944236755)]
[('woman', 0.6494314074516296), ('king', 0.5991543531417847)]


## Visualize word embeddings

In [7]:
from sklearn.manifold import TSNE

words = sum([[k] + v for k, v in similar_words.items()], [])
wvs = word2vecB.wv[words]

tsne = TSNE(n_components=2, random_state=0, max_iter=10000, perplexity=10)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(14, 8))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

# Create your own corpus 

In [8]:
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
          'I love green eggs, ham, sausages and bacon!',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]
labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals', 'weather', 'animals']

corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 
                          'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
print(corpus_df)

                                                             Document Category
0                                      The sky is blue and beautiful.  weather
1                                   Love this blue and beautiful sky!  weather
2                        The quick brown fox jumps over the lazy dog.  animals
3  A king's breakfast has sausages, ham, bacon, eggs, toast and beans     food
4                         I love green eggs, ham, sausages and bacon!     food
5                    The brown fox is quick and the blue dog is lazy!  animals
6            The sky is very blue and the sky is very beautiful today  weather
7                         The dog is lazy but the brown fox is quick!  animals


# Preprocess it 

In [9]:
norm_corpus = normalize_corpus(corpus)
print(norm_corpus)

['sky blue beautiful' 'love blue beautiful sky'
 'quick brown fox jumps lazy dog'
 'kings breakfast sausages ham bacon eggs toast beans'
 'love green eggs ham sausages bacon' 'brown fox quick blue dog lazy'
 'sky blue sky beautiful today' 'dog lazy brown fox quick']


## Train a Word2Vec model on your sample corpus

In [10]:
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

# Set values for various parameters
feature_size   = 10  # embedding dimensionality  
window_context = 3   # context window size: 1 word around the central word                                                                                    
min_word_count = 1   # minimum word count: consider the word when it appears at least 1 time in the corpus      
num_cores      = 12  # number of processors in your machine to speed up process if you have cython 

word2vecC      = Word2Vec(sentences=tokenized_corpus, vector_size=feature_size, window=window_context, \
                          min_count=min_word_count, epochs=100, workers=num_cores)


In [11]:
# create a dictionary with some similar words
similar_words = {search_term: [item[0] for item in word2vecC.wv.most_similar([search_term], topn=3)]
                for search_term in ['sky', 'fox', 'breakfast']}
similar_words

{'sky': ['blue', 'bacon', 'sausages'],
 'fox': ['quick', 'today', 'love'],
 'breakfast': ['green', 'kings', 'love']}

## Visualize word embeddings

In [12]:
words = word2vecC.wv.index_to_key
wvs   = word2vecC.wv[words]

tsne = TSNE(n_components=2, random_state=0, n_iter=5000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(12, 6))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')



## Sample word embedding

In [13]:
word2vecC.wv['sky']

array([-0.00650653,  0.00165769,  0.05251156,  0.09033262, -0.09172861,
       -0.07161501,  0.06707988,  0.09047023, -0.05170376, -0.03739343],
      dtype=float32)

## Build framework for getting document (sentence) level embeddings

In [14]:
#!pip install gensim # Average of the word vectors of a given sentence

def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector
    
# Compute the embedding of each sentence in a corpus as the average of its word embeddings    
def Doc2Vec(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    #print(vocabulary)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) \
                for tokenized_sentence in corpus]
    return np.array(features)

In [15]:
w2v_feature_array = Doc2Vec(corpus=tokenized_corpus, model=word2vecC, num_features=feature_size)
pd.DataFrame(w2v_feature_array)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.003058,-0.008641,0.035086,0.027984,-0.053435,-0.036183,0.060932,0.014549,-0.045775,-0.059238
1,0.003382,-0.018292,0.047592,-0.003435,-0.022776,-0.019782,0.034021,0.022211,-0.039022,-0.02743
2,-0.02625,0.043631,-0.026681,-0.041218,0.037346,0.000333,0.018762,0.013306,-0.00774,0.025239
3,-0.006139,-0.025693,0.013477,0.019014,0.023917,0.013165,0.01859,-0.005417,0.010433,0.01317
4,0.001664,-0.029173,0.029612,0.024028,0.037718,0.032967,0.019836,0.023164,-0.020708,0.036433
5,0.000229,0.035499,-0.026354,-0.021669,0.023457,-0.011599,0.010943,0.024727,-0.034063,-0.001417
6,-0.001069,-0.008166,0.036277,0.01919,-0.055612,-0.030766,0.061074,0.022239,-0.057121,-0.033951
7,-0.014345,0.045987,-0.022765,-0.039259,0.037476,-0.010353,0.006809,0.027462,-0.024049,0.016994


## Clustering with sentence embeddings

In [16]:
from sklearn.cluster import AffinityPropagation

ap = AffinityPropagation()
ap.fit(w2v_feature_array)
cluster_labels = ap.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,1
1,Love this blue and beautiful sky!,weather,1
2,The quick brown fox jumps over the lazy dog.,animals,2
3,"A king's breakfast has sausages, ham, bacon, eggs, toast and beans",food,0
4,"I love green eggs, ham, sausages and bacon!",food,0
5,The brown fox is quick and the blue dog is lazy!,animals,2
6,The sky is very blue and the sky is very beautiful today,weather,1
7,The dog is lazy but the brown fox is quick!,animals,2


## Visualize the PCA projection of the sentences in each cluster 

In [17]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2, random_state=0)
pcs = pca.fit_transform(w2v_feature_array)
labels = ap.labels_
categories = list(corpus_df['Category'])
plt.figure(figsize=(8, 6))

for i in range(len(labels)):
    label = labels[i]
    color = 'orange' if label == 0 else 'blue' if label == 1 else 'green'
    annotation_label = categories[i]
    x, y = pcs[i]
    plt.scatter(x, y, c=color, edgecolors='k')
    plt.annotate(annotation_label, xy=(x+1e-4, y+1e-3), xytext=(0, 0), textcoords='offset points')

## GloVe Embeddings with spaCy

## First, load a vocabulary with word embeddings created by GloVe

In [18]:
import spacy

# make sure you have a GloVe pretrained model for an english vocabulary by downloading en_vectors_web_md   
!python -m spacy download en_core_web_md

nlp = spacy.load("en_core_web_md") # load the pipeline with the pretrained model

total_vectors = len(nlp.vocab.vectors)
print('Total of word vectors:', total_vectors)

Collecting en-core-web-md==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
Total of word vectors: 20000


In [19]:
# You can parse any sentence and get its word embeddings

parsed_text = nlp("The green frog jumped the black dog")

glove_vectors = [w.vector for w in parsed_text]
for i, w in enumerate(parsed_text):
    print(w, glove_vectors[i])

The [ -7.2681     -0.85717     5.8105      1.9771      8.8147     -5.8579
   3.7143      3.585       4.7987     -4.4251      1.7461     -3.7296
  -5.1407     -1.0792     -2.5555      3.0755      5.0141      5.8525
   7.3378     -2.7689     -5.1641     -1.9879      2.9782      2.1024
   4.4306      0.84355    -6.8742     -4.2949     -0.17294     3.6074
   0.84379     0.33419    -4.8147      0.035683  -13.721      -4.6528
  -1.4021      0.48342     1.2549     -4.0644      3.3278     -0.2159
  -5.1786      3.536      -3.1575     -3.5273     -3.6753      1.5863
  -8.1594     -3.4657      1.5262      4.8135     -3.8428     -3.9082
   0.67549    -0.35787    -1.7806      3.5284     -0.051114   -0.9715
  -0.90553    -1.557       1.2038      4.7708      0.98561    -2.3186
  -7.4899     -9.5389      8.5572      2.742      -3.627       2.7456
  -6.9574     -1.719      -2.9145      1.1838      3.7864      2.0413
  -3.5808      1.4319      0.20528    -0.7064     -5.3556     -2.5911
   4.4922      1

## Visualize GloVe word embeddings of our corpus

In [20]:
unique_words = list(set([word for sublist in [doc.split() for doc in norm_corpus] for word in sublist]))

word_glove_vectors = np.array([nlp(word).vector for word in unique_words])
pd.DataFrame(word_glove_vectors, index=unique_words)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
today,1.5333,1.6226,1.0552,-1.3615,0.97952,-0.82806,1.4854,2.8817,-0.354,0.34457,...,3.2434,-1.5378,2.3661,-2.5112,-1.2672,-1.4808,-1.2825,1.4143,-2.5045,2.7744
brown,-3.8429,0.14068,-3.3584,3.2679,-2.1536,-6.5085,0.23512,6.8845,-0.54349,3.5969,...,-1.592,1.0269,1.6064,-2.8404,-2.2747,2.1002,5.4802,0.83172,-4.3083,-1.0437
kings,-1.8759,-3.1451,0.26432,2.5128,5.1288,1.5155,-3.7334,3.6285,-0.97522,-1.0979,...,5.4135,0.457,-3.2481,-1.4668,0.40547,3.0402,-1.6745,-3.5399,-4.889,-2.0406
breakfast,-0.6866,-1.7267,-3.0013,-1.1101,1.8389,-3.0766,-0.23369,1.0133,-2.0234,3.0337,...,1.1736,-2.7272,1.5723,-2.429,-1.5276,0.98387,-0.098775,2.9516,-1.6079,2.5412
love,2.0565,-3.2259,-5.7364,-6.146,0.15748,-2.4284,7.658,2.7064,-2.211,-0.8999,...,1.5802,1.7597,-0.60806,-6.6107,0.009383,-4.2763,-0.50507,5.0049,-8.5312,-1.4967
lazy,-1.5888,0.73316,1.0746,-2.4521,-0.42517,3.3344,-0.17942,-1.0924,-0.093904,0.058663,...,1.1253,-2.3553,3.936,-3.3751,-0.99934,1.2639,-2.1068,3.0743,-4.429,3.8938
beans,0.93585,-0.45081,-4.956,2.2548,1.2105,-1.6676,0.87959,0.004286,-4.1678,3.96,...,3.1166,1.1007,4.0496,-0.97496,-5.1273,-0.63158,6.8901,-0.34744,2.4419,0.90073
sky,7.1524,3.0356,-8.704,0.93788,-3.6781,-0.035128,1.2568,0.024432,0.28924,4.5054,...,-5.7712,4.7644,4.235,-1.8427,0.36947,1.3165,1.165,-1.3928,-3.9125,0.48484
sausages,-0.44031,-2.975,-1.2263,1.393,2.9121,-2.8421,-0.39008,0.12248,-2.8923,1.7352,...,3.7204,-1.2142,3.3765,-4.1516,-0.40894,-0.13075,1.9397,2.7565,1.4458,1.5704
ham,0.44278,-4.1141,1.2356,4.1873,1.6244,-1.2601,-1.6435,-0.90033,-1.2415,0.86724,...,3.6574,-1.6391,3.903,-6.5612,3.6737,-1.2058,3.8193,3.605,3.6386,4.0362


In [21]:
tsne = TSNE(n_components=2, random_state=0, n_iter=5000, perplexity=3)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(word_glove_vectors)
labels = unique_words

plt.figure(figsize=(12, 6))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')



## Cluster documents with GloVe Embeddings

In [22]:
from sklearn.cluster import KMeans 

doc_glove_vectors = np.array([nlp(str(doc)).vector for doc in norm_corpus])

km = KMeans(n_clusters=3, random_state=0)
km.fit_transform(doc_glove_vectors)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,1
1,Love this blue and beautiful sky!,weather,1
2,The quick brown fox jumps over the lazy dog.,animals,2
3,"A king's breakfast has sausages, ham, bacon, eggs, toast and beans",food,0
4,"I love green eggs, ham, sausages and bacon!",food,0
5,The brown fox is quick and the blue dog is lazy!,animals,2
6,The sky is very blue and the sky is very beautiful today,weather,1
7,The dog is lazy but the brown fox is quick!,animals,2


# Leveraging gensim for building a FastText model

In [23]:
from gensim.models.fasttext import FastText 


tokenized_corpus = [wpt.tokenize(document) for document in norm_bible]

# Set values for various parameters
feature_size   = 300 # Word vector dimensionality  
window_context = 30  # Context window size                                                                                    
min_word_count = 5   # Minimum word count                        
num_cores      = 12  # Number of processors

# train the model 
fast_text = FastText(tokenized_corpus, vector_size=feature_size, window=window_context, 
                    min_count=min_word_count, workers=num_cores, epochs=50) # it does not support n-grams with n
                                                                            # different than 1


In [24]:
# view similar words based on the fast_text model
similar_words = {search_term: [item[0] for item in fast_text.wv.most_similar([search_term], topn=5)]
                  for search_term in ['god', 'jesus', 'noah', 'egypt', 'john', 'gospel', 'moses','famine']}
similar_words

{'god': ['godly', 'goddess', 'gods', 'godliness', 'lord'],
 'jesus': ['ephesus', 'tarsus', 'gaius', 'alphaeus', 'scripture'],
 'noah': ['zanoah', 'manoah', 'adah', 'milcah', 'joah'],
 'egypt': ['egyptian', 'egyptians', 'pharaoh', 'bondage', 'canaan'],
 'john': ['alphaeus', 'galilee', 'herodias', 'james', 'baptist'],
 'gospel': ['christ', 'hope', 'superscription', 'grace', 'revelation'],
 'moses': ['joses', 'moza', 'purposes', 'amos', 'asses'],
 'famine': ['examine', 'familiar', 'family', 'mine', 'pestilence']}

### Visualize their projection by PCA

In [25]:
words = sum([[k] + v for k, v in similar_words.items()], [])
wvs = fast_text.wv[words]

pca = PCA(n_components=2)
np.set_printoptions(suppress=True)
P = pca.fit_transform(wvs)
labels = words

plt.figure(figsize=(18, 10))
plt.scatter(P[:, 0], P[:, 1], c='lightgreen', edgecolors='g')
for label, x, y in zip(labels, P[:, 0], P[:, 1]):
    plt.annotate(label, xy=(x+0.06, y+0.03), xytext=(0, 0), textcoords='offset points')

### Playing with similarity

In [26]:
fast_text.wv['jesus']

array([-0.30584332,  1.4597086 , -0.20075062, -3.5213044 , -3.5032415 ,
        0.6045214 , -0.7558142 ,  1.6340855 , -0.7819558 ,  0.46100932,
       -2.3164396 ,  2.398333  , -0.56278855, -1.6978656 , -1.9863446 ,
       -2.5130289 ,  2.6174915 , -1.2601485 ,  3.0547707 , -0.15412554,
       -2.0985327 ,  2.2201421 , -3.5260022 , -3.184709  ,  0.37583095,
        0.40446222, -0.14917588,  0.40501443,  1.0694941 , -0.06712214,
        1.8951794 , -1.4284976 ,  1.948012  ,  0.04004262, -0.53943264,
        0.8933363 ,  0.6302208 ,  0.87466747, -0.16572584, -0.37349617,
        1.0459887 , -2.779037  ,  2.5061605 ,  0.7256071 ,  2.5760088 ,
       -1.5487748 , -3.5315645 ,  4.838411  , -3.289885  , -1.5516857 ,
        0.22165133, -1.4997333 ,  0.5478979 ,  0.86551076, -1.4852788 ,
        0.30736035,  1.2807195 ,  0.9958289 ,  1.9986396 , -3.071913  ,
       -1.1619956 , -2.427179  ,  0.04490274, -1.311942  , -2.5709636 ,
        0.5152984 ,  1.3082554 , -0.5259133 , -1.1672891 ,  0.78

In [27]:
print(fast_text.wv.similarity(w1='god', w2='satan'))
print(fast_text.wv.similarity(w1='god', w2='jesus'))

-0.011600196
0.11037772


In [28]:
st1 = "christ jesus satan gospel"
print('Odd one out for [',st1, ']:', fast_text.wv.doesnt_match(st1.split()))

st2 = "john peter james judas"
print('Odd one out for [',st2, ']:', fast_text.wv.doesnt_match(st2.split()))

Odd one out for [ christ jesus satan gospel ]: satan
Odd one out for [ john peter james judas ]: judas
