# (1) Implementing word embedding models trained using **our** own data

In [2]:
! pip install wikipedia



In [3]:
from keras.preprocessing.text import Tokenizer
from gensim.models.fasttext import FastText
import numpy as np
import matplotlib.pyplot as plt
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

import wikipedia
import nltk
# Fetch the NLTK resources used by the tokenizers, the lemmatizer and the
# stop-word filter further down in this notebook.
for resource_name in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource_name)

# English stop words, held in a set for fast membership tests.
en_stop = set(stopwords.words('english'))

%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def _wiki_sentences(title):
    """Download the Wikipedia page `title` and split its text into sentences."""
    return sent_tokenize(wikipedia.page(title).content)


artificial_intelligence = _wiki_sentences("Artificial Intelligence")
# machine_learning = _wiki_sentences("Machine Learning")   # left disabled
deep_learning = _wiki_sentences("Deep Learning")
neural_network = _wiki_sentences("Neural Network")

# `artificial_intelligence` doubles as the combined corpus: the sentences of
# the other pages are appended to it in place.
# artificial_intelligence.extend(machine_learning)          # left disabled
for extra_sentences in (deep_learning, neural_network):
    artificial_intelligence.extend(extra_sentences)

In [5]:
import re
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def preprocess_text(document, lemmatizer=None, stop_words=None):
    """Normalize a piece of raw text into a space-separated string of tokens.

    Steps, in order: replace every non-word character with a space, drop
    isolated single letters, collapse whitespace, strip a leading standalone
    ``b``, lowercase, lemmatize each token, then discard stop words and
    tokens of three characters or fewer.

    Args:
        document: Text to clean; it is passed through ``str()`` first.
        lemmatizer: Object with a ``lemmatize(word)`` method. Defaults to the
            module-level ``stemmer``.
        stop_words: Container of words to discard. Defaults to the
            module-level ``en_stop``.

    Returns:
        The surviving tokens joined by single spaces ("" when none survive).
    """
    if lemmatizer is None:
        lemmatizer = stemmer
    if stop_words is None:
        stop_words = en_stop

    # Replace every non-word character (punctuation, symbols) with a space.
    document = re.sub(r'\W', ' ', str(document))

    # Remove single letters that stand alone between whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the very start of the text. The anchor has to
    # be an unescaped '^': the escaped form '\^' would look for a literal
    # caret, which cannot exist here because non-word characters are gone.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Collapse runs of whitespace into one space.
    document = re.sub(r'\s+', ' ', document)

    # Strip a leading standalone 'b' (presumably the prefix left behind by
    # str() of a bytes object -- TODO confirm).
    document = re.sub(r'^b\s+', '', document)

    document = document.lower()

    # Lemmatize first, then filter, so the filters see the lemmatized form.
    lemmas = (lemmatizer.lemmatize(token) for token in document.split())
    tokens = [
        lemma for lemma in lemmas
        if lemma not in stop_words and len(lemma) > 3
    ]

    return ' '.join(tokens)

In [6]:
# Sanity-check the cleaner on one hand-written sentence.
sent = preprocess_text("Artificial intelligence, is the most advanced technology of the present era")
print(sent)

# Clean every sentence collected from Wikipedia, skipping blank ones.
final_corpus = [
    preprocess_text(raw_sentence)
    for raw_sentence in artificial_intelligence
    if raw_sentence.strip()
]

# Turn each cleaned sentence into a list of tokens; this list of token lists
# is the training input for the embedding models below.
word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = list(map(word_punctuation_tokenizer.tokenize, final_corpus))


artificial intelligence advanced technology present


In [7]:
# Hyperparameters handed to the FastText constructor in the training cell.
embedding_size = 60  # passed as `size`: length of each word vector
window_size = 40  # passed as `window`
min_word = 5  # passed as `min_count`
down_sampling = 1e-2  # passed as `sample`

# FAST TEXT

In [8]:
%%time
# Train a FastText model on the tokenized Wikipedia sentences with the
# hyperparameters defined in the previous cell.
# (Comments must stay below `%%time`: a cell magic has to be the first line.)
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` -- confirm the installed version.
ft_model = FastText(word_tokenized_corpus,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

CPU times: user 42.7 s, sys: 436 ms, total: 43.1 s
Wall time: 37.8 s


In [9]:
print(ft_model.wv['artificial'])

[ 0.6932765  -0.2963536  -0.21637559  0.06758623  0.15932657  0.52195823
 -0.24512057 -0.32482928 -0.07606503 -0.08539195  0.03418946  0.81393313
  0.04692402  0.05112171  0.21053742  0.40145722  0.23286313  0.07525598
 -0.06820988  0.50691974  0.41983938 -0.16697241 -0.12868842  0.33029443
 -0.04305311  0.04409973  0.15355237 -0.15530716 -0.16474764  0.16932997
 -0.91203403  0.05106799 -0.4583516   0.30981684  0.11186435 -0.11394573
 -0.43446967  0.1897681  -0.04982232 -0.01309912 -0.32498053 -0.45466
 -0.13298832 -0.03639678 -0.6618926  -0.1780947  -0.149266    0.33501023
  0.28702512 -0.63633794 -0.05051721 -0.34511605  0.12756354  0.13793111
 -0.26423684  0.17532864 -0.09753798  0.07096051 -0.29031393  0.3396976 ]


In [10]:
# For each probe word keep only the words (not the scores) of its five
# nearest neighbours in the FastText vector space.
probe_words = ['artificial', 'intelligence', 'machine', 'network', 'recurrent', 'deep']

semantically_similar_words = {}
for probe in probe_words:
    neighbours = ft_model.wv.most_similar([probe], topn=5)
    semantically_similar_words[probe] = [neighbour[0] for neighbour in neighbours]

for probe, neighbour_words in semantically_similar_words.items():
    print(probe, neighbour_words, sep=":")

artificial:['intelligence', 'superintelligence', 'paradigm', 'reference', 'issn']
intelligence:['artificial', 'superintelligence', 'intelligent', 'general', 'existential']
machine:['definition', 'future', 'springer', 'still', 'perspective']
network:['neural', 'recurrent', 'convolutional', 'deep', 'feedforward']
recurrent:['network', 'neural', 'schmidhuber', 'short', 'memory']
deep:['learning', 'network', 'specifically', 'neural', 'scale']


In [11]:
print(ft_model.wv.similarity(w1='artificial', w2='intelligence'))

0.7923245


In [12]:
import gensim
import pandas as pd

# WORD2VEC

In [13]:
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec

# init callback class
class callback(CallbackAny2Vec):
    """
    Callback that prints the training loss after epoch 0 and then after every
    ``report_every``-th epoch.

    The value read from ``model.get_latest_training_loss()`` is treated as a
    running total: for every reported epoch after the first, the number
    printed is the difference from the previous epoch's reading.

    Args:
        report_every: Number of epochs between two printed reports
            (default 100). Must be at least 1.

    Raises:
        ValueError: If ``report_every`` is smaller than 1.
    """
    def __init__(self, report_every=100):
        if report_every < 1:
            raise ValueError('report_every must be >= 1, got {}'.format(report_every))
        self.epoch = 0
        self.report_every = report_every
        # Created here so the attribute exists before the first epoch ends.
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        """Print the loss if this epoch is a reporting epoch, then advance the counters."""
        loss = model.get_latest_training_loss()

        if self.epoch == 0:
            # First epoch: there is no earlier reading to subtract.
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        elif self.epoch % self.report_every == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))

        self.epoch += 1
        self.loss_previous_step = loss

In [14]:
# Configure the Word2Vec model. No corpus is passed to the constructor, so the
# vocabulary is built and the model is trained by the explicit calls below.
w2v_model = Word2Vec(size = 300,
                     window = 15,
                     min_count = 2,
                     workers = 20,
                     sg = 1,
                     negative = 5,
                     sample = 1e-5)
# build the vocabulary from the tokenized sentences


w2v_model.build_vocab(word_tokenized_corpus)

  
# train the w2v model
#start = time.time()
w2v_model.train(word_tokenized_corpus, 
                total_examples=w2v_model.corpus_count, 
                epochs=1001, # 1001 so that epoch index 1000 is reached and reported by the callback
                report_delay=1,
                compute_loss = True, # required: the callback reads the loss via get_latest_training_loss()
                callbacks=[callback()]) # prints the loss from on_epoch_end
#end = time.time()


#print("elapsedtime in seconds :"+ str(end - start))
# save the word2vec model
#w2v_model.save('C:\\wiki\\word2vec.model')

Loss after epoch 0: 8380.736328125
Loss after epoch 100: 6090.8125
Loss after epoch 200: 5202.25
Loss after epoch 300: 5224.375
Loss after epoch 400: 3977.75
Loss after epoch 500: 3305.0
Loss after epoch 600: 3095.5
Loss after epoch 700: 3187.5
Loss after epoch 800: 2782.75
Loss after epoch 900: 2642.0
Loss after epoch 1000: 2850.0


(1478426, 13491478)

In [15]:
#reloaded_w2v_model = Word2Vec.load('C:\\AmazonReviewsCellPhones\\word2vec.model')
# Vocabulary learned by the Word2Vec model.
words = list(w2v_model.wv.vocab)
print('Vocab size: ' + str(len(words)))

# Five nearest neighbours (with scores) of each probe word.
for w1 in ('artificial', 'intelligence'):
    print("Top 5 words similar to {}:".format(w1),
          w2v_model.wv.most_similar(positive=w1, topn=5))

# Similarity of two hand-picked word pairs.
for first_word, second_word in (('artificial', 'intelligence'), ('machine', 'network')):
    print("Similarity between {} and {}:".format(first_word, second_word)
          + str(w2v_model.wv.similarity(w1=first_word, w2=second_word)))

Vocab size: 1699
Top 5 words similar to artificial: [('intelligence', 0.8088740706443787), ('paradigm', 0.6970927715301514), ('selected', 0.6918691396713257), ('psychology', 0.6886146664619446), ('selection', 0.6807595491409302)]
Top 5 words similar to intelligence: [('artificial', 0.8088740706443787), ('business', 0.7853144407272339), ('mccarthy', 0.7451132535934448), ('regulation', 0.7437580227851868), ('regulating', 0.7382546663284302)]
Similarity between artificial and intelligence:0.8088741
Similarity between machine and network:0.2644036


In [16]:
!pip install glove-python-binary



# **GLOVE**

In [17]:
import nltk
import re
import numpy as np

from glove import Corpus, Glove
from nltk.corpus import gutenberg
from multiprocessing import Pool
from scipy import spatial

In [18]:
# Build the co-occurrence matrix from the tokenized sentences;
# `window` is the context distance counted around each word.
corpus = Corpus()
corpus.fit(word_tokenized_corpus, window=3)

# Fit GloVe vectors with 50 components on that matrix.
glove = Glove(no_components=50, learning_rate=0.05)
glove.fit(matrix=corpus.matrix, epochs=20, verbose=True)

Performing 20 training epochs with 2 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [19]:
# Hand the corpus' word dictionary to the model; later cells look words up
# through `glove.dictionary` and query `glove.most_similar` by word.
glove.add_dictionary(corpus.dictionary) 

In [20]:
# Persist the trained model and read it back. `load` returns a new Glove
# object instead of updating the one it is called on, so the result must be
# bound to a name for the reload to take effect.
glove.save('glove_model')
glove = Glove.load('glove_model')

<glove.glove.Glove at 0x7fe9f02806d0>

In [27]:
# NOTE(review): `number = 5` is requested, yet the recorded output lists only
# four neighbours -- the library appears to count the query word itself; confirm.
glove.most_similar('artificial', number = 5)

[('intelligence', 0.9954028719274185),
 ('general', 0.9751589688527823),
 ('neuron', 0.942457578711369),
 ('biological', 0.8795480814242774)]

In [28]:
def vector_converter(word, model=None):
    """Return the embedding vector stored for `word`.

    Args:
        word: Word to look up.
        model: Object exposing a ``dictionary`` (word -> row index) and
            ``word_vectors`` (indexable by that row index). Defaults to the
            notebook-level ``glove`` model.

    Returns:
        The entry of ``model.word_vectors`` at the index of `word`.

    Raises:
        KeyError: If `word` is not in ``model.dictionary``.
    """
    if model is None:
        model = glove
    idx = model.dictionary[word]
    return model.word_vectors[idx]

In [29]:
v1 = vector_converter('artificial')
v2 = vector_converter('intelligence')

In [39]:
v1

array([-0.01275158,  0.19094547, -0.16517368, -0.0078705 , -0.06852876,
        0.1867336 , -0.05529471, -0.15929799, -0.39391046, -0.20847612,
        0.19480649,  0.21482315, -0.30011154, -0.19262956,  0.32755457,
       -0.35811749,  0.16197272,  0.06802437,  0.21331267,  0.13861714,
       -0.09679476,  0.35493147, -0.06071992, -0.22855641, -0.04897889,
       -0.16547452, -0.45300933, -0.22666161,  0.21669483,  0.19505024,
       -0.18246718,  0.13291124,  0.25918245,  0.31996886, -0.11926836,
        0.0431102 ,  0.02712656,  0.33462299, -0.20646125, -0.15794015,
       -0.24313383, -0.13858669,  0.0454516 ,  0.03470849, -0.0253744 ,
       -0.13682146,  0.28566293,  0.29993809, -0.27727892,  0.09663655])

In [40]:
v2

array([-0.03741554,  0.14837838, -0.13320892,  0.01578165, -0.05847838,
        0.15946866, -0.04142263, -0.18431478, -0.32309879, -0.23160163,
        0.17581459,  0.19895385, -0.24876428, -0.1619936 ,  0.28261536,
       -0.31574723,  0.15893725,  0.10704771,  0.18341188,  0.10587967,
       -0.06381416,  0.30972264, -0.01982153, -0.20297415, -0.05126804,
       -0.14822189, -0.37772032, -0.17962885,  0.20141353,  0.16351597,
       -0.15468125,  0.11927515,  0.21576431,  0.29306675, -0.10212786,
        0.03953097,  0.03982473,  0.28774534, -0.18764283, -0.13255864,
       -0.20910495, -0.13127452,  0.06850415,  0.03811644, -0.05766696,
       -0.11310544,  0.25565167,  0.25120973, -0.24047595,  0.08470625])

In [30]:
from scipy import spatial

In [41]:
1 - spatial.distance.cosine(v1, v2)

0.9954028719274185

# FLAIR

In [32]:
!pip install flair



In [33]:
from flair.embeddings import FlairEmbeddings

In [34]:
flair_embedding_forward = FlairEmbeddings('news-forward')

In [35]:
from flair.data import Sentence
from flair.models import SequenceTagger


In [36]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings

In [37]:
from flair.embeddings import FlairEmbeddings

# init embedding
# NOTE(review): the same 'news-forward' model is already constructed in an
# earlier cell; this second construction looks redundant -- confirm before removing.
flair_embedding_forward = FlairEmbeddings('news-forward')

# create a sentence
# The whole three-sentence paragraph is wrapped in one Sentence object.
sentence = Sentence('NLP drives computer programs that translate text from one language to another respond to spoken commands and summarize large volumes of text rapidly even in real time. There is a good chance you interacted with NLP in the form of voiceoperated GPS systems, digital assistants, speech to text dictation software, customer service chatbots, and other consumer conveniences. But NLP also plays a growing role in enterprise solutions that help streamline business operations, increase employee productivity, and simplify mission-critical business processes.')

# embed words in sentence
# The per-token vectors are read back through `token.embedding` in the next cell.
flair_embedding_forward.embed(sentence)

[Sentence: "NLP drives computer programs that translate text from one language to another respond to spoken commands and summarize large volumes of text rapidly even in real time . There is a good chance you interacted with NLP in the form of voiceoperated GPS systems , digital assistants , speech to text dictation software , customer service chatbots , and other consumer conveniences . But NLP also plays a growing role in enterprise solutions that help streamline business operations , increase employee productivity , and simplify mission-critical business processes ."]

In [38]:
# Show every token of the sentence followed by the embedding attached to it.
for tok in sentence:
    print(tok, tok.embedding, sep='\n')


Token[0]: "NLP"
tensor([ 3.0688e-04,  3.6357e-05, -7.4025e-02,  ..., -3.1189e-03,
        -1.0819e-02,  5.4082e-03])
Token[1]: "drives"
tensor([-0.0006,  0.0003,  0.0083,  ..., -0.0001, -0.0432,  0.0178])
Token[2]: "computer"
tensor([-0.0023, -0.0015,  0.0522,  ..., -0.0067,  0.0250,  0.0136])
Token[3]: "programs"
tensor([-0.0010, -0.0001,  0.0544,  ..., -0.0005, -0.0026,  0.0048])
Token[4]: "that"
tensor([-3.4260e-04,  3.9312e-05,  3.9482e-02,  ..., -2.0388e-02,
        -9.5657e-04,  1.7101e-03])
Token[5]: "translate"
tensor([-2.1845e-03, -1.5394e-04,  1.1978e-01,  ..., -1.8368e-05,
         4.4195e-04,  9.7938e-03])
Token[6]: "text"
tensor([-0.0034,  0.0028,  0.0263,  ..., -0.0020,  0.0050,  0.0063])
Token[7]: "from"
tensor([-6.7279e-05, -3.4755e-04,  4.1317e-02,  ..., -9.6794e-05,
         3.8929e-03, -5.1897e-03])
Token[8]: "one"
tensor([ 0.0004,  0.0002,  0.0076,  ..., -0.0031,  0.0026,  0.0755])
Token[9]: "language"
tensor([-6.5698e-04,  3.9553e-04,  1.2800e-02,  ..., -6.1270e-05

# (2) Glove pretrained model for generating embeddings

In [42]:
import gensim.downloader as api

In [50]:
# to load a glove pretrained model
glove_m = api.load("glove-wiki-gigaword-50")

In [57]:
# Keep the first 20 tokenized sentences as a small working sample. A slice
# replaces the manual index-and-append loop and does not raise when the corpus
# holds fewer than 20 sentences.
corpus_1 = word_tokenized_corpus[:20]



In [61]:
new_corpus = corpus_1[1]

In [70]:
new_corpus

['leading',
 'textbook',
 'define',
 'field',
 'study',
 'intelligent',
 'agent',
 'system',
 'perceives',
 'environment',
 'take',
 'action',
 'maximize',
 'chance',
 'achieving',
 'goal',
 'popular',
 'account',
 'term',
 'artificial',
 'intelligence',
 'describe',
 'machine',
 'mimic',
 'cognitive',
 'function',
 'human',
 'associate',
 'human',
 'mind',
 'learning',
 'problem',
 'solving',
 'however',
 'definition',
 'rejected',
 'major',
 'researcher',
 'application',
 'include',
 'advanced',
 'search',
 'engine',
 'google',
 'recommendation',
 'system',
 'used',
 'youtube',
 'amazon',
 'netflix',
 'understanding',
 'human',
 'speech',
 'siri',
 'alexa',
 'self',
 'driving',
 'tesla',
 'automated',
 'decision',
 'making',
 'competing',
 'highest',
 'level',
 'strategic',
 'game',
 'system',
 'chess']

In [69]:
len(new_corpus)

68

In [63]:
vectors = []

In [64]:
# Look up the pretrained GloVe vector of every token in `new_corpus`.
# The list is rebuilt here instead of being appended to, so re-running this
# cell cannot duplicate entries. Lookups are deliberately not guarded: a token
# missing from `glove_m` raises instead of being skipped, which keeps
# `vectors` aligned one-to-one with `new_corpus`.
vectors = [glove_m[token] for token in new_corpus]




In [67]:
vectors

[array([-0.45816 ,  0.18518 ,  0.35274 ,  0.46199 ,  0.1666  ,  0.22531 ,
        -0.94667 ,  0.072191, -0.020073,  0.22138 ,  0.022843, -0.7057  ,
        -0.68059 ,  0.6923  , -0.41883 , -0.68109 ,  0.35849 , -0.56264 ,
        -0.68847 , -0.073729,  0.070436, -0.17436 , -0.38994 ,  0.090127,
        -0.30945 , -1.554   , -0.13894 , -0.80492 , -0.73123 ,  0.63291 ,
         3.208   ,  0.041802,  0.41536 , -0.26637 ,  0.038696, -0.25567 ,
        -0.44192 ,  0.50113 ,  0.11523 ,  0.10027 , -0.30422 ,  0.3274  ,
        -0.17812 ,  0.10018 ,  0.59222 ,  0.29602 , -0.54511 ,  0.24227 ,
         0.37948 , -0.41335 ], dtype=float32),
 array([-1.163   , -0.13688 , -0.83546 , -0.85329 , -0.39908 ,  0.35    ,
        -0.064561, -1.3242  , -0.1079  ,  0.42883 , -0.25933 ,  0.04346 ,
        -0.10668 ,  0.44067 ,  0.23598 , -0.53208 , -0.65907 ,  0.23817 ,
         0.33575 ,  0.64787 ,  0.63219 , -0.41558 , -0.38247 ,  0.30521 ,
         0.63    , -1.2265  , -1.0514  , -0.32805 , -0.41857 , -0

In [74]:
glove_vectors = dict(zip(new_corpus,vectors))

In [75]:
glove_vectors['action']

array([ 2.4196e-01, -7.4387e-01, -5.1436e-01, -8.9699e-02, -2.6825e-01,
        4.5664e-01,  7.2555e-01, -2.1819e-01,  4.2826e-02,  1.8943e-01,
       -3.2057e-01,  5.7118e-02, -5.9674e-01,  2.2751e-01,  1.2945e-01,
       -2.9623e-01,  6.4950e-01, -2.2105e-01, -5.7708e-01, -4.5436e-01,
        2.5636e-01,  9.9275e-02, -1.1760e-01, -1.6338e-01, -2.3526e-01,
       -1.8493e+00, -1.0322e-01, -3.5198e-01,  2.4535e-03,  3.3223e-02,
        3.3513e+00,  1.5836e-01, -1.0583e+00, -7.2565e-01, -5.7345e-01,
       -5.7453e-02,  4.7996e-01, -9.9358e-01, -8.0074e-01, -6.2818e-01,
       -2.2644e-01,  1.4946e-01, -3.2743e-02, -3.3740e-01,  2.0041e-01,
       -2.3766e-03,  1.2109e-02,  5.5461e-01,  1.5185e-01,  7.0218e-01],
      dtype=float32)

*THE CODE FOR WORD2VEC AND FAST TEXT IS SUBMITTED IN ANOTHER IPYTHON NOTEBOOK BECAUSE, WHEN WE RAN THE PROGRAM IN COLAB, THE RAM AND DISK WERE GETTING EXHAUSTED AND THE RUNTIME RESTARTED WHILE WE WERE IMPORTING THE PRETRAINED MODELS USING api.load.*

# (3) Generating embeddings for 5 pairs of positive and negative samples
# in the data and computing the cosine similarity between the vectors

In [92]:
# Five words closest to 'data'. Queried through `.wv`, like the other Word2Vec
# cells in this notebook; calling most_similar on the model object itself is
# deprecated in gensim 3.x and no longer available in gensim 4.
positive = w2v_model.wv.most_similar('data', topn=5)

  """Entry point for launching an IPython kernel.


In [93]:
# Words close to 'news' + 'book' and away from 'data', used below as the
# "negative" samples. Queried through `.wv`, like the other Word2Vec cells;
# the model-level most_similar is deprecated in gensim 3.x and gone in gensim 4.
negative = w2v_model.wv.most_similar(positive=['news', 'book'], negative=['data'], topn=5)

  """Entry point for launching an IPython kernel.


In [94]:
positive

[('flow', 0.7052559852600098),
 ('health', 0.6705082654953003),
 ('used', 0.6596438884735107),
 ('record', 0.6594346761703491),
 ('annotation', 0.659172534942627)]

In [95]:
negative

[('bibcode', 0.7860730290412903),
 ('oxford', 0.7794829607009888),
 ('fear', 0.7685981392860413),
 ('2015sci', 0.7658615112304688),
 ('discussion', 0.7575021982192993)]

In [102]:
l = len(positive)

In [110]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [112]:
# Pair the i-th positive word with the i-th negative word and print their
# similarity. Each entry is a (word, score) tuple; only the word is needed.
# `.wv.similarity` replaces the deprecated model-level `similarity` call.
for (positive_word, _), (negative_word, _) in zip(positive, negative):
    print('The similarity between {} and {} is:'.format(positive_word, negative_word))
    print(w2v_model.wv.similarity(positive_word, negative_word))

    

The similarity between flow and bibcode is:
0.2226525
The similarity between health and oxford is:
0.26360434
The similarity between used and fear is:
0.27013567
The similarity between record and 2015sci is:
0.3884084
The similarity between annotation and discussion is:
0.12610756


# (4) OBSERVATION

**1) When training word2vec with gensim, the result is a representation of each word in your vocabulary as a vector. The dimension of these vectors equals the size of the network's hidden layer (the `size` parameter).**

**2) The pre-trained word2vec models simply contain a list of those vectors that were pre-trained on a large corpus. You will find pre-trained vectors of various sizes.**

**3) Word embeddings that have already been trained on a vast corpus of text are referred to as pre-trained word embeddings.**

**4) The similarity between the pairs of positive and negative samples is very low, which indicates that the angle between the two word vectors in each pair is large, i.e. the words are dissimilar.**

