In [1]:
import re
import nltk
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [3]:
#let's view the fileids of the gutenberg corpus
from nltk.corpus import gutenberg
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [4]:
#lets choose the bryant stories
bryant_sents = gutenberg.raw(gutenberg.fileids()[5])
bryant_sents = bryant_sents.split('\n')
print('the length of the sentences before cleaning and preprocessing is', len(bryant_sents))

the length of the sentences before cleaning and preprocessing is 5539


In [5]:
#an exmaple line in the sents
print(bryant_sents[7])

     Full of little gentlemen;


In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
#importing necessary libraires to clean the data
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stopWords = stopwords.words('english')
charfilter = re.compile('[a-zA-Z]+')

In [8]:
#now let's tokenize the words
def simple_filter(sent):
    #converting all the tokens to lower case:
    words = sent.split()
    word_lower = [] 
    for word in words:
        word_lower.append(word.lower())
    #let's remove every stopword:
    word_clean = [word for word in word_lower if word not in stopWords]
    #removing all the characters and using only characters
    tokens = list(filter(lambda token : charfilter.match(token),word_clean))
    #stemming all the words
    ntokens = []
    for word in tokens:
        ntokens.append(PorterStemmer().stem(word))
    return tokens

In [9]:
#converting all the bryant data to tokens using our function simple tokenizer we created earlier
sentences = []
for sent in bryant_sents:
    tokens = simple_filter(sent)
    if len(tokens) >0 :
        sentences.append(tokens)

In [10]:
#an example sentence in the data
print(sentences[7])

['messenger', 'small', 'slight,']


In [11]:
#Word2Vec
#training the gensim on the data
#Using the Cbow architecture for the word2Vec
from gensim.models import Word2Vec
model_cbow = Word2Vec(sentences, min_count = 1, size = 50, workers = 3, window = 5, sg = 0)

In [12]:
#Any example to find the vector model of a word
print('the array representation of the word \'gentleman\'\n:',model_cbow['gentleman'], '\n the array representation of the word \'messenger\'\n:', model_cbow['messenger']) 

the array representation of the word 'gentleman'
: [-1.6238875e-03  5.2046544e-05 -1.6655297e-03  5.4726237e-03
 -3.7136106e-04  7.8298887e-03  8.8874046e-03  3.9056756e-03
  2.3949440e-03 -2.3354089e-03  8.2995715e-03 -5.0907698e-03
  2.7501998e-03 -7.2486615e-03 -3.2535163e-03 -4.9176766e-04
  6.1176401e-03 -4.0196595e-03 -8.0045993e-03 -7.2745928e-03
  7.3357988e-03  2.4220531e-03 -6.6526188e-03  6.4516752e-03
  4.6080914e-03  9.7617730e-03  9.7249970e-03 -9.0984041e-03
 -3.1061368e-03 -8.8562965e-03 -9.5619919e-04 -6.9865980e-03
 -6.2448991e-04 -4.6572080e-03 -1.5983528e-03 -7.6128091e-03
 -7.1349647e-03  7.4844486e-03 -2.9184259e-05  7.7572688e-03
  6.1151981e-03  3.0898745e-03 -5.6976695e-03 -1.8392071e-03
 -9.9876989e-03  9.8783998e-03  9.1043683e-03 -6.4186361e-03
  1.0746912e-03 -2.9253131e-03] 
 the array representation of the word 'messenger'
: [-0.00939686  0.00870303 -0.00244509 -0.00375331  0.00025061 -0.00725746
 -0.00057367 -0.0065589   0.00567129  0.00655579  0.0021534

In [13]:
#Computing the similarities of the words
print(model_cbow.similarity('messenger', 'gentleman'))

0.04549653


In [14]:
#Computing the 5 most similar words to the word gentleman
print('the 5 most similar words to \'gentleman\':', model_cbow.most_similar('gentleman')[:5])

the 5 most similar words to 'gentleman': [('minute,', 0.5440585017204285), ('sea;', 0.5238329172134399), ('robber?', 0.5104078650474548), ('brathe', 0.4847576320171356), ('ear.', 0.4814983606338501)]


In [15]:
#let's see how the skipgram model works on the data
model_skipgram = Word2Vec(sentences, min_count = 1, size = 50, workers = 3, window = 5, sg = 1)

In [16]:
#Any example to find the vector model of a word
print('the array representation of the word \'gentleman\'\n:',model_skipgram['gentleman'], '\n the array representation of the word \'messenger\'\n:', model_skipgram['messenger']) 

the array representation of the word 'gentleman'
: [-0.00085873 -0.00140694 -0.0018727   0.00562109 -0.00057548  0.0087606
  0.01137052  0.00344855  0.00234635 -0.00276426  0.0084667  -0.00496227
  0.00250011 -0.00811942 -0.00565087  0.00189225  0.00546402 -0.00316642
 -0.00691166 -0.00694099  0.00651739  0.00117364 -0.0067368   0.00607441
  0.00444426  0.01004354  0.00926617 -0.00950051 -0.00512157 -0.00732022
 -0.00089679 -0.00591263 -0.00055283 -0.00527631  0.00080326 -0.00821549
 -0.00821198  0.00852285  0.00139628  0.00812304  0.006193    0.00217026
 -0.00492435 -0.0007501  -0.0125117   0.00998284  0.00948716 -0.00418544
  0.00092839 -0.00212597] 
 the array representation of the word 'messenger'
: [-0.00138484 -0.00570969 -0.0043577  -0.00236183 -0.00162741  0.00220965
  0.02302744 -0.01022819  0.00569328  0.00262888  0.00447234 -0.0060498
  0.00667456 -0.01681448 -0.02958052  0.01625853 -0.00464369  0.01078462
  0.02002277 -0.00041919 -0.01670301 -0.00887086 -0.00948716  0.00369

In [17]:
#Computing the similarities of the words
print(model_skipgram.similarity('messenger', 'gentleman'))

0.24993962


In [18]:
#Computing the 5 most similar words to the word gentleman
print('the 5 most similar words to \'gentleman\':', model_skipgram.most_similar('gentleman')[:5])

the 5 most similar words to 'gentleman': [('sea;', 0.5623403191566467), ('minute,', 0.5454872250556946), ('ear.', 0.5267316699028015), ('robber?', 0.5168870091438293), ('brathe', 0.5092170238494873)]


In [19]:
#using the glove package for embeddings
!pip install glove-python-binary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting glove-python-binary
  Downloading glove_python_binary-0.2.0-cp37-cp37m-manylinux1_x86_64.whl (948 kB)
[K     |████████████████████████████████| 948 kB 13.9 MB/s 
Installing collected packages: glove-python-binary
Successfully installed glove-python-binary-0.2.0


In [20]:
from glove import Corpus, Glove
corpus = Corpus()
corpus.fit(sentences, window = 5)
glove = Glove(no_components = 50, learning_rate = 0.05)
glove.fit(corpus.matrix, epochs = 30, no_threads = 4, verbose = True)
glove.add_dictionary(corpus.dictionary)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [21]:
#Any example to find the vector model of a word
print('the array representation of the word \'gentleman\'\n:',glove.word_vectors[glove.dictionary['gentleman']],
      '\n the array representation of the word \'messenger\'\n:', glove.word_vectors[glove.dictionary['messenger']]) 

the array representation of the word 'gentleman'
: [-0.00446257 -0.00011563  0.00048778  0.00470994  0.00047404 -0.00650052
  0.00275075 -0.00245812  0.00132315  0.00678317  0.00294232 -0.00963198
 -0.006922    0.00758769  0.00866188  0.00025094  0.00936169  0.00638576
 -0.00684898  0.00252016  0.00416938  0.00057839  0.00449063  0.00807159
 -0.00723738  0.00640753  0.00559421  0.00672373 -0.00988107 -0.00463331
  0.00418348  0.00308651  0.00110704 -0.00814962 -0.00644658  0.00116924
  0.00763553 -0.00058078 -0.00120575  0.00162964  0.00719197  0.00022644
 -0.00230916  0.00795786 -0.0001765  -0.00510027 -0.00476935  0.005888
 -0.00727045 -0.00267948] 
 the array representation of the word 'messenger'
: [-4.59228478e-04 -2.41253158e-03  3.23837646e-03  6.51687244e-03
  6.33845422e-03  2.49419763e-03 -1.27105829e-03 -8.14738685e-03
 -8.26794884e-04 -3.11872711e-03 -5.97164824e-03 -1.38683848e-03
 -1.34232465e-04  1.89124214e-03 -7.07742934e-03 -9.59844083e-03
 -6.32536642e-03  4.26748653

In [22]:
#Computing the similarities of the words
print(glove.most_similar('gentleman', number = 5))

[('cheese,--and', 0.4584710241006303), ('berries', 0.44931665625607436), ('bear-catchers.', 0.4410907196027992), ('drive', 0.4379447740886234)]
