# Playing with word2vec on a Bible corpus

In [3]:
from string import punctuation 
from gensim.models import word2vec
import nltk 
from nltk.stem import WordNetLemmatizer 
import tqdm
import re



In [4]:
import logging

In [5]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

In [38]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

# Which Bible Version

The original King James Version (Project Gutenberg) is written in archaic English.  
Most lemmatization packages don't handle archaic English well.  
The solution is to use the `American King James Version` from http://biblehub.net/.  

*From Amazon*
The American King James Bible replaces archaic Middle English words with modern English words without changing the grammar or sentence structure of the King James version. It contains the Old and New Testaments. The 2018 version corrected "saith" to "says" and several spelling errors.

In [7]:
# Read the whole corpus file and split it on newlines; every resulting line is
# treated as one sentence (verse).
# NLTK alternative kept for reference: gutenberg.sents('bible-kjv.txt')
with open('American_King_James_Version_Only_Sentences.txt', 'r') as corpus_file:
    bible_kjv_sents = corpus_file.read().split('\n')

print("number of sentences (verses): ", len(bible_kjv_sents))

number of sentences (verses):  31102


In [8]:
from numpy.random import randint

# Sanity check: print a few consecutive verses starting at a random position.
# The upper bound comes from the corpus length instead of a hard-coded index,
# so any verse can be sampled and the slice is full whenever the corpus holds
# at least n_preview verses.
n_preview = 3
rand_start = randint(max(len(bible_kjv_sents) - n_preview, 0) + 1)  # randint's upper bound is exclusive
for sent in bible_kjv_sents[rand_start:rand_start + n_preview]:  # phrases or sentences
    print(sent)

Thirty milk camels with their colts, forty cows, and ten bulls, twenty she asses, and ten foals.
And he delivered them into the hand of his servants, every drove by themselves; and said to his servants, Pass over before me, and put a space between drove and drove.
And he commanded the foremost, saying, When Esau my brother meets you, and asks you, saying, Whose are you? and where go you? and whose are these before you?


# Lemmatization and stop-word removal

In contrast to stemming, lemmatization is a lot more powerful. It looks beyond word reduction and considers a language’s full vocabulary to apply a morphological analysis to words, aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.


https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/

In [9]:
import spacy
nlp = spacy.load('en_core_web_md')

In [10]:
sentence = bible_kjv_sents[26450]
print(sentence) 
doc = nlp(sentence)
[token.lemma_ for token in doc][:15]

Therefore said they to him, How were your eyes opened?


['therefore',
 'say',
 'they',
 'to',
 'he',
 ',',
 'how',
 'be',
 'your',
 'eye',
 'open',
 '?']

In [11]:
 for token in doc:
     print(token.lemma_, token.is_stop)

therefore True
say False
they True
to True
he True
, False
how True
be True
your True
eye False
open False
? False


In [10]:
# Lemmatize every verse, keeping only tokens that spaCy does not flag as stop words.
# nlp.pipe streams the verses through the pipeline in batches instead of paying
# the per-call overhead of nlp(sent) for each verse.
# NOTE(review): a later cell reassigns bible_kjv_sents_lema using lematize()
# without stop-word filtering, so this result is superseded when that cell runs.
bible_kjv_sents_lema = []
for doc in tqdm.tqdm(nlp.pipe(bible_kjv_sents), total=len(bible_kjv_sents)):
    bible_kjv_sents_lema.append([token.lemma_ for token in doc if not token.is_stop])

  2%|▏         | 767/31102 [00:07<05:30, 91.89it/s]

In [12]:
import spacy

def lematize(sent, remove_stop=False):
    """Return the lemma of every token in ``sent`` as a list of strings.

    Args:
        sent: Text handed to the module-level ``nlp`` pipeline.
        remove_stop: When true, tokens whose ``is_stop`` flag is set are left out.

    Returns:
        list: The ``lemma_`` value of each kept token, in token order.
    """
    lemmas = []
    for tok in nlp(sent):
        # Only consult is_stop when filtering was requested.
        if remove_stop and tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [13]:
from joblib import Parallel, delayed

# Lemmatize all verses with 8 joblib workers; each dispatched batch holds
# one eighth of the corpus (integer division).
n_workers = 8
verses_per_batch = len(bible_kjv_sents) // n_workers
lemma_jobs = (delayed(lematize)(verse) for verse in bible_kjv_sents)
bible_kjv_sents_lema = Parallel(n_jobs=n_workers, verbose=5, batch_size=verses_per_batch)(lemma_jobs)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 792 tasks      | elapsed:   17.8s
[Parallel(n_jobs=8)]: Done 21744 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done 31102 out of 31102 | elapsed:  2.2min finished


In [14]:
' '.join(bible_kjv_sents_lema[26450])

'therefore say they to he , how be your eye open ?'

# Remove punctuation

Tokens to drop: the `string.punctuation` characters '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~', plus ' ' (pure-space words).

In [15]:
#[[word.lower() for word in sent if word not in punctuation+' '] for sent in bible_kjv_sents_lema]

In [16]:
# Lower-case every lemma and drop tokens made up solely of punctuation and/or
# whitespace characters (empty tokens are dropped too).
# The check is done per character: `word not in punctuation + ' '` is a
# substring test against one long string, so multi-character tokens such as
# '--' or a run of spaces would slip through.
punct_or_space_chars = set(punctuation) | set(' \t\n\r')
bible_kjv_sents_lema_punct = [
    [word.lower() for word in sent if not set(word) <= punct_or_space_chars]
    for sent in bible_kjv_sents_lema
]

In [17]:
for sentence in bible_kjv_sents_lema_punct[:10]:
    print(' '.join(sentence))

in the beginning god create the heaven and the earth
and the earth be without form and void and darkness be on the face of the deep and the spirit of god move on the face of the water
and god say let there be light and there be light
and god see the light that it be good and god divide the light from the darkness
and god call the light day and the darkness he call night and the evening and the morning be the first day
and god say let there be a firmament in the middle of the water and let it divide the water from the water
and god make the firmament and divide the water which be under the firmament from the water which be above the firmament and it be so
and god call the firmament heaven and the evening and the morning be the second day
and god say let the water under the heaven be gather together to one place and let the dry land appear and it be so
and god call the dry land earth and the gathering together of the water call he seas and god see that it be good


# Remove Stop Words

### Removing stop words does not seem really useful for the word2vec goal: the vectors created after removing stop words seem less coherent.

https://stackoverflow.com/a/40447086/1207193

In [43]:
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords

# Build the English stop-word set once. A separate name is used so the imported
# `stopwords` corpus reader is not overwritten by its own word list, and a
# frozenset gives constant-time membership tests in the filter below.
english_stopwords = frozenset(stopwords.words('english'))

# Same verses as bible_kjv_sents_lema_punct, minus the NLTK English stop words.
bible_kjv_sents_lema_punct_stop = [
    [word for word in sent if word not in english_stopwords]
    for sent in bible_kjv_sents_lema_punct
]

In [44]:
for sentence in bible_kjv_sents_lema_punct_stop[:10]:
    print(' '.join(sentence))

beginning god create heaven earth
earth without form void darkness face deep spirit god move face water
god say let light light
god see light good god divide light darkness
god call light day darkness call night evening morning first day
god say let firmament middle water let divide water water
god make firmament divide water firmament water firmament
god call firmament heaven evening morning second day
god say let water heaven gather together one place let dry land appear
god call dry land earth gathering together water call seas god see good


vector_size (int, optional) – Dimensionality of the word vectors.

window (int, optional) – Maximum distance between the current and predicted word within a sentence.

min_count (int, optional) – Ignores all words with total frequency lower than this.

max_vocab_size (int, optional) – Limits the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to None for no limit.

max_final_vocab (int, optional) – Limits the vocab to a target vocab size by automatically picking a matching min_count. If the specified min_count is more than the calculated min_count, the specified min_count will be used. Set to None if not required.

epochs (int, optional) – Number of iterations (epochs) over the corpus. (Formerly: iter)

compute_loss (bool, optional) – If True, computes and stores loss value which can be retrieved using get_latest_training_loss().

shrink_windows (bool, optional) – New in 4.1. Experimental. If True, the effective window size is uniformly sampled from [1, window] for each target word during training, to match the original word2vec algorithm’s approximate weighting of context words by distance. Otherwise, the effective window size is always fixed to window words to either side.

sample (float, optional) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5). 

# Skip-gram or Continuous Bag of Words (CBOW)

https://ai.stackexchange.com/questions/18634/what-are-the-main-differences-between-skip-gram-and-continuous-bag-of-words/18637

So as you're probably already aware of, CBOW and Skip-gram are just mirrored versions of each other. CBOW is trained to predict a single word from a fixed window size of context words, whereas Skip-gram does the opposite, and tries to predict several context words from a single input word.

Intuitively, the first task is much simpler, this implies a much faster convergence for CBOW than for Skip-gram, in the original paper (link below) they wrote that CBOW took hours to train, Skip-gram 3 days.

By the same logic regarding task difficulty, CBOW learns better syntactic relationships between words, while Skip-gram is better at capturing semantic relationships. In practice, this means that for the word 'cat' CBOW would retrieve as closest vectors morphologically similar words like plurals, i.e. 'cats', while Skip-gram would consider morphologically different (but semantically relevant) words like 'dog' much closer to 'cat' in comparison.

A final consideration concerns the sensitivity to rare and frequent words. Because Skip-gram relies on single-word inputs, it is less prone to overfitting frequent words: even if frequent words are presented more times than rare words during training, they still appear individually, while CBOW is prone to overfitting frequent words because they appear several times along with the same context. This advantage also makes Skip-gram more efficient in terms of the number of documents required to achieve good performance, needing far fewer than CBOW (and it is also the reason for Skip-gram's better performance in capturing semantic relationships).


https://stats.stackexchange.com/a/261440/105763

* CBOW learns to predict a word from its context (faster): many -> one


    -> It seems reasonable that it is better to remove stop-words

In [149]:
# Train a skip-gram Word2Vec model on the lemmatized, punctuation-free,
# stop-word-filtered verses.
w2v_params = dict(
    min_count=5,      # ignore words whose total frequency is below 5
    vector_size=200,  # dimensionality of the word vectors
    window=6,         # context words before/after the central word (skip-gram), or the inverse for CBOW
    sample=0.001,     # down-sample frequent words (0.0 disables it); suggested as an alternative to removing stop words
    workers=8,
    sorted_vocab=1,
    sg=1,             # 1 selects skip-gram
    epochs=100,       # number of passes over the corpus
)
bible_kjv_word2vec_model = word2vec.Word2Vec(bible_kjv_sents_lema_punct_stop, **w2v_params)

2021-09-01 11:29:26,358 : INFO : collecting all words and their counts
2021-09-01 11:29:26,359 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-09-01 11:29:26,388 : INFO : PROGRESS: at sentence #10000, processed 120970 words, keeping 4912 word types
2021-09-01 11:29:26,418 : INFO : PROGRESS: at sentence #20000, processed 222263 words, keeping 7238 word types
2021-09-01 11:29:26,448 : INFO : PROGRESS: at sentence #30000, processed 325600 words, keeping 8847 word types
2021-09-01 11:29:26,453 : INFO : collected 8980 word types from a corpus of 337539 raw words and 31102 sentences
2021-09-01 11:29:26,454 : INFO : Creating a fresh vocabulary
2021-09-01 11:29:26,483 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 3876 unique words (43.16258351893096%% of original 8980, drops 5104)', 'datetime': '2021-09-01T11:29:26.483739', 'gensim': '4.0.1', 'python': '3.8.10 (default, May 19 2021, 13:12:57) [MSC v.1916 64 bit (AMD64)]', 'platform': '

In [158]:
bible_kjv_word2vec_model.wv.index_to_key[:10]

['shall', 'lord', 'say', 'god', 'come', 'man', 'son', 'go', 'king', 'make']

In [159]:
len(bible_kjv_word2vec_model.wv.index_to_key) # vocabulary size

3876

In [160]:
# closer_than(key1, key2) lists the keys that are closer to key1 than key2 is;
# the slice keeps at most the first 10. An empty list means no word in the
# vocabulary is closer to 'jesus' than 'christ' is.
bible_kjv_word2vec_model.wv.closer_than('jesus', 'christ')[:10]

[]

In [161]:
bible_kjv_word2vec_model.wv.most_similar(["god"])

[('lord', 0.7502718567848206),
 ('say', 0.5241986513137817),
 ('sanctification', 0.4578445255756378),
 ('sincerity', 0.43164896965026855),
 ('exalted', 0.41754475235939026),
 ('confession', 0.4099636971950531),
 ('fix', 0.4074748456478119),
 ('redeemer', 0.4031759798526764),
 ('saving', 0.4011639356613159),
 ('reprobate', 0.39531105756759644)]

In [162]:
bible_kjv_word2vec_model.wv.most_similar(["christ"])

[('jesus', 0.5467848181724548),
 ('gospel', 0.474446177482605),
 ('abundant', 0.4148310124874115),
 ('faith', 0.402395635843277),
 ('sanctification', 0.3759796917438507),
 ('grace', 0.3718375563621521),
 ('meditation', 0.36150574684143066),
 ('god', 0.3608017861843109),
 ('eternal', 0.36008360981941223),
 ('belove', 0.35845649242401123)]

In [163]:
bible_kjv_word2vec_model.wv.most_similar(["jesus"], topn=10)

[('christ', 0.5467847585678101),
 ('disciple', 0.45068156719207764),
 ('peter', 0.4407723546028137),
 ('john', 0.3930402100086212),
 ('romans', 0.3531630337238312),
 ('galilee', 0.3528057336807251),
 ('nazareth', 0.3473432660102844),
 ('immediately', 0.3422870635986328),
 ('paul', 0.34071412682533264),
 ('faith', 0.34017059206962585)]

In [164]:
bible_kjv_word2vec_model.wv.most_similar(["saul"], topn=10)

[('jonathan', 0.494997501373291),
 ('david', 0.49233680963516235),
 ('samuel', 0.47509056329727173),
 ('abner', 0.44466668367385864),
 ('michal', 0.4349168837070465),
 ('gilboa', 0.427920401096344),
 ('naioth', 0.39455944299697876),
 ('maon', 0.3937312066555023),
 ('jesse', 0.3835592269897461),
 ('javelin', 0.3831195831298828)]

In [165]:
bible_kjv_word2vec_model.wv['jesus']

array([ 0.10450251,  0.14936721,  0.6442774 ,  0.05124003, -0.13037749,
        0.07043558, -0.7090818 , -0.10004777,  0.48946014,  0.5314849 ,
       -0.26606488, -0.35777095,  0.11497089, -0.40360218,  0.0620638 ,
       -0.43668583,  0.10719658, -0.19995257, -0.19785257, -0.33812   ,
       -0.07336547,  0.25279567, -0.31729788,  0.05122983,  0.11655648,
        0.08348206,  0.30296907,  0.03241773, -0.7305899 , -0.16129759,
        0.00459224,  0.1873528 ,  0.09161553, -0.09941293,  0.4632847 ,
        0.05760025, -0.12523936, -0.70911896, -0.17300253,  0.1410986 ,
       -0.40287423,  0.03459988, -0.1407281 ,  0.3561677 ,  0.5421509 ,
       -0.1630545 , -0.15685983,  0.16352233,  0.05579595,  0.24692412,
       -0.41490299, -0.28556854, -0.08312353,  0.05107731, -0.24932218,
       -0.37900877,  0.32055244, -0.49453485,  0.21039945,  0.03431154,
       -0.25031725, -0.06534752, -0.20071128,  0.13566348,  0.07299663,
       -0.18114693,  0.04665125, -0.14220518, -0.17766911,  0.33

In [66]:
# Export the trained word vectors in word2vec format, with the vocabulary
# written to a separate file.
bible_kjv_word2vec_model.wv.save_word2vec_format(
    fname="bible_word2vec_org",
    fvocab="bible_word2vec_vocabulary",
)

2021-09-01 10:35:04,830 : INFO : storing vocabulary in bible_word2vec_vocabulary
2021-09-01 10:35:04,840 : INFO : storing 3985x200 projection weights into bible_word2vec_org


In [67]:
# Save the whole Word2Vec model object to disk under this file name.
bible_kjv_word2vec_model.save("bible_word2vec_gensim")

2021-09-01 10:35:05,409 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'bible_word2vec_gensim', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-09-01T10:35:05.409265', 'gensim': '4.0.1', 'python': '3.8.10 (default, May 19 2021, 13:12:57) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'saving'}
2021-09-01 10:35:05,412 : INFO : not storing attribute cum_table
2021-09-01 10:35:05,423 : INFO : saved bible_word2vec_gensim
