# Playing with word 2 vec Bible corpus

In [189]:
from string import punctuation 
from gensim.models import word2vec
import nltk 
from nltk.stem import WordNetLemmatizer 
import tqdm
import re

In [190]:
import logging

In [191]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

In [192]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/andre/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/andre/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andre/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Which Bible Version

The original King James (gutenberg) uses an old archaic english.  
Most lematization packages don't work for old english.  
Solution is to use the `American King James Version` from http://biblehub.net/.  

*From Amazon*
The American King James Bible replaces archaic Middle English words with modern English words without changing the grammar or sentence structure of the King James version. It contains the Old and New Testaments.The 2018 version corrected "saith" to "says" and several spelling errors.

In [193]:
# bible_kjv_sents = gutenberg.sents('bible-kjv.txt')  # that's what is needed
bible_kjv_sents = None
with open('American_King_James_Version_Only_Sentences.txt', 'r') as file:
    bible_kjv_sents = file.read()

bible_kjv_sents = bible_kjv_sents.split('\n')
print("number of sentences (verses): ", len(bible_kjv_sents))

number of sentences (verses):  31102


In [6]:
from numpy.random import randint
rand_start = randint(30100)
for sent in bible_kjv_sents[rand_start:rand_start+3]: # phrases or sentences
    print(sent)

And Abimelech dwelled at Arumah: and Zebul thrust out Gaal and his brothers, that they should not dwell in Shechem.
And it came to pass on the morrow, that the people went out into the field; and they told Abimelech.
And he took the people, and divided them into three companies, and laid wait in the field, and looked, and, behold, the people were come forth out of the city; and he rose up against them, and smote them.


# Lematization and remove stop-words

In contrast to stemming, lemmatization is a lot more powerful. It looks beyond word reduction and considers a language’s full vocabulary to apply a morphological analysis to words, aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.


https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/

In [119]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [194]:
import spacy
nlp = spacy.load('en_core_web_md')

In [195]:
sentence = bible_kjv_sents[26450]
print(sentence) 
doc = nlp(sentence)
[token.lemma_ for token in doc][:15]

Therefore said they to him, How were your eyes opened?


['therefore',
 'say',
 'they',
 'to',
 'he',
 ',',
 'how',
 'be',
 'your',
 'eye',
 'open',
 '?']

In [15]:
bible_kjv_sents_lema = []
for sent in tqdm.tqdm(bible_kjv_sents):
    doc = nlp(sent)
    bible_kjv_sents_lema.append([token.lemma_ for token in doc if not token.is_stop])

100%|██████████| 31102/31102 [04:59<00:00, 103.70it/s]


In [196]:
import spacy

def lematize(sent, remove_stop=False):
    doc = nlp(sent)
    if remove_stop:
        return [token.lemma_ for token in doc if not token.is_stop]
    else:
        return [token.lemma_ for token in doc]

In [197]:
from joblib import Parallel, delayed

bible_kjv_sents_lema = Parallel(n_jobs=8, verbose=5, max_nbytes=None,
        batch_size=int(len(bible_kjv_sents))//8)(delayed(lematize)(sent) for sent in bible_kjv_sents)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 792 tasks      | elapsed:   29.3s
[Parallel(n_jobs=8)]: Done 21744 tasks      | elapsed:  2.6min
[Parallel(n_jobs=8)]: Done 31102 out of 31102 | elapsed:  3.4min finished


In [198]:
' '.join(bible_kjv_sents_lema[26450])

'therefore say they to he , how be your eye open ?'

# Remove punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' + ' ' pure space words

In [199]:
#[[word.lower() for word in sent if word not in punctuation+' '] for sent in bible_kjv_sents_lema]

In [200]:
bible_kjv_sents_lema_punct = [[word.lower() for word in sent if word not in punctuation+' '] for sent in bible_kjv_sents_lema]

In [201]:
for sentence in bible_kjv_sents_lema_punct[:10]:
    print(' '.join(sentence))

in the beginning god create the heaven and the earth
and the earth be without form and void and darkness be on the face of the deep and the spirit of god move on the face of the water
and god say let there be light and there be light
and god see the light that it be good and god divide the light from the darkness
and god call the light day and the darkness he call night and the evening and the morning be the first day
and god say let there be a firmament in the middle of the water and let it divide the water from the water
and god make the firmament and divide the water which be under the firmament from the water which be above the firmament and it be so
and god call the firmament heaven and the evening and the morning be the second day
and god say let the water under the heaven be gather together to one place and let the dry land appear and it be so
and god call the dry land earth and the gathering together of the water call he seas and god see that it be good


In [202]:
nltk('humble')

TypeError: 'module' object is not callable

In [133]:
for sentence in bible_kjv_sents_lema_punct:
    for word in sentence:
        if word == 'humble':
            print(' '.join(sentence))
            break 

and moses and aaron come in to pharaoh and say to he thus say the lord god of the hebrews how long will you refuse to humble yourself before i let my people go that they may serve i
and that i also have walk contrary to they and have bring they into the land of their enemy if then their uncircumcised heart be humble and they then accept of the punishment of their iniquity
and you shall remember all the way which the lord your god lead you these forty year in the wilderness to humble you and to prove you to know what be in your heart whether you would keep his commandment or no
and he humble you and suffer you to hunger and feed you with manna which you know not neither do your father know that he might make you know that man do not live by bread only but by every word that proceed out of the mouth of the lord do man live
who feed you in the wilderness with manna which your father know not that he might humble you and that he might prove you to do you good at your latter end
and it shal

### Not removing stop words

It seams not really usefull for word 2 vec goal. The vectors created removing stop words seams less coherent. 

https://stackoverflow.com/a/40447086/1207193

In [16]:
# from gensim.parsing.preprocessing import STOPWORDS
# from nltk.corpus import stopwords

# stopwords = stopwords.words('english')
# bible_kjv_sents_lema_punct_stop = [[word for word in sent if word not in stopwords] for sent in bible_kjv_sents_lema_punct]

In [17]:
# for sentence in bible_kjv_sents_lema_punct_stop[:10]:
#     print(' '.join(sentence))

vector_size (int, optional) – Dimensionality of the word vectors.

window (int, optional) – Maximum distance between the current and predicted word within a sentence.

min_count (int, optional) – Ignores all words with total frequency lower than this.

max_vocab_size (int, optional) – Limits the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to None for no limit.

max_final_vocab (int, optional) – Limits the vocab to a target vocab size by automatically picking a matching min_count. If the specified min_count is more than the calculated min_count, the specified min_count will be used. Set to None if not required.

epochs (int, optional) – Number of iterations (epochs) over the corpus. (Formerly: iter)

compute_loss (bool, optional) – If True, computes and stores loss value which can be retrieved using get_latest_training_loss().

shrink_windows (bool, optional) – New in 4.1. Experimental. If True, the effective window size is uniformly sampled from [1, window] for each target word during training, to match the original word2vec algorithm’s approximate weighting of context words by distance. Otherwise, the effective window size is always fixed to window words to either side.

sample (float, optional) – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5). 

# Skip-gram or Continuous Bag of Words (CBOW)

https://ai.stackexchange.com/questions/18634/what-are-the-main-differences-between-skip-gram-and-continuous-bag-of-words/18637

So as you're probably already aware of, CBOW and Skip-gram are just mirrored versions of each other. CBOW is trained to predict a single word from a fixed window size of context words, whereas Skip-gram does the opposite, and tries to predict several context words from a single input word.

Intuitively, the first task is much simpler, this implies a much faster convergence for CBOW than for Skip-gram, in the original paper (link below) they wrote that CBOW took hours to train, Skip-gram 3 days.

For the same logic regarding the task difficulty, CBOW learn better syntactic relationships between words while Skip-gram is better in capturing better semantic relationships. In practice, this means that for the word 'cat' CBOW would retrive as closest vectors morphologically similar words like plurals, i.e. 'cats' while Skip-gram would consider morphologically different words (but semantically relevant) like 'dog' much closer to 'cat' in comparison.

A final consideration to make deals instead with the sensitivity to rare and frequent words. Because Skip-gram rely on single words input, it is less sensitive to overfit frequent words, because even if frequent words are presented more times that rare words during training, they still appear individually, while CBOW is prone to overfit frequent words because they appear several time along with the same context. This advantage over frequent words overfitting leads Skip-gram to be also more efficient in term of documents required to achieve good performances, much less than CBOW (and it's also the reason of the better performances of Skip-gram in capturing semantical relationships).


https://stats.stackexchange.com/a/261440/105763

* CBOW is learning to predict the word by the context. (faster) many -> one


    -> Seams reasonable that's better to remove stop-words

In [134]:
bible_kjv_word2vec_model = word2vec.Word2Vec(bible_kjv_sents_lema_punct, min_count=5, 
        vector_size=300, 
        window=8, # context words to predict before/after the central word skip-gram or the inverse cbow
        sample=0.001,  # downsample frequent words default 0.001 or dont 0.0 - recommended instead of remove stop-words
        workers=8,         
        sorted_vocab=1, 
        sg=1, # sg=1 skip-gram
        epochs=100) 

2023-03-17 22:16:20,240 : INFO : collecting all words and their counts
2023-03-17 22:16:20,241 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-03-17 22:16:20,297 : INFO : PROGRESS: at sentence #10000, processed 283852 words, keeping 5013 word types
2023-03-17 22:16:20,343 : INFO : PROGRESS: at sentence #20000, processed 517118 words, keeping 7370 word types
2023-03-17 22:16:20,390 : INFO : PROGRESS: at sentence #30000, processed 764478 words, keeping 8989 word types
2023-03-17 22:16:20,397 : INFO : collected 9124 word types from a corpus of 792609 raw words and 31102 sentences
2023-03-17 22:16:20,398 : INFO : Creating a fresh vocabulary
2023-03-17 22:16:20,427 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 3978 unique words (43.60% of original 9124, drops 5146)', 'datetime': '2023-03-17T22:16:20.427762', 'gensim': '4.3.1', 'python': '3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-35-generic-x86_64-

In [140]:
bible_kjv_word2vec_model.wv.index_to_key[:10]

['the', 'and', 'of', 'be', 'to', 'he', 'you', 'they', 'i', 'that']

In [141]:
len(bible_kjv_word2vec_model.wv.index_to_key) # vocabulary size

3978

In [142]:
bible_kjv_word2vec_model.wv.closer_than('jesus', 'christ')[:10]

[]

In [143]:
bible_kjv_word2vec_model.wv.most_similar("humble")

[('haughtiness', 0.31562453508377075),
 ('haughty', 0.3148563504219055),
 ('abase', 0.3039110600948334),
 ('resist', 0.2879158854484558),
 ('contrite', 0.28720584511756897),
 ('lowly', 0.28645578026771545),
 ('childless', 0.2776082754135132),
 ('revive', 0.2752573788166046),
 ('humility', 0.2647615671157837),
 ('obedient', 0.2601352632045746)]

In [181]:
nlp("multiply").similarity(nlp("add"))

0.14801288166202323

In [182]:
bible_kjv_word2vec_model.wv.most_similar("haughty")

[('lofty', 0.4268783926963806),
 ('affect', 0.3298184275627136),
 ('humility', 0.32684198021888733),
 ('abase', 0.32326775789260864),
 ('contrite', 0.32101699709892273),
 ('scorner', 0.3173736035823822),
 ('humble', 0.3148563504219055),
 ('pride', 0.31064674258232117),
 ('subject', 0.30551761388778687),
 ('oppress', 0.2982201874256134)]

In [183]:
bible_kjv_word2vec_model.wv.most_similar(["god"])

[('lord', 0.5579831600189209),
 ('i', 0.3982095420360565),
 ('you', 0.385988712310791),
 ('redeemer', 0.3660259246826172),
 ('o', 0.35674259066581726),
 ('have', 0.355631560087204),
 ('to', 0.3511999845504761),
 ('not', 0.34572696685791016),
 ('we', 0.3386997878551483),
 ('savior', 0.33248040080070496)]

In [184]:
bible_kjv_word2vec_model.wv.most_similar(["christ"])

[('jesus', 0.4993652403354645),
 ('gospel', 0.37370216846466064),
 ('faith', 0.3184944987297058),
 ('preach', 0.30671292543411255),
 ('nazareth', 0.30493101477622986),
 ('believe', 0.3039478063583374),
 ('our', 0.29783058166503906),
 ('god', 0.2971987724304199),
 ('grace', 0.2961413264274597),
 ('we', 0.2938659191131592)]

In [185]:
bible_kjv_word2vec_model.wv.most_similar(["jesus"], topn=10)

[('christ', 0.4993651807308197),
 ('disciple', 0.37840142846107483),
 ('peter', 0.35410329699516296),
 ('he', 0.3472472131252289),
 ('nazareth', 0.32248222827911377),
 ('ask', 0.2977120280265808),
 ('galilee', 0.29492613673210144),
 ('john', 0.29178881645202637),
 ('pilate', 0.28626564145088196),
 ('hezekiah', 0.2807839512825012)]

In [186]:
bible_kjv_word2vec_model.wv.most_similar(["saul"], topn=10)

[('david', 0.4558025300502777),
 ('jonathan', 0.41307035088539124),
 ('samuel', 0.3951224386692047),
 ('abner', 0.3669678866863251),
 ('gilboa', 0.33321523666381836),
 ('javelin', 0.3046970069408417),
 ('michal', 0.29548048973083496),
 ('ner', 0.29203811287879944),
 ('maon', 0.28503304719924927),
 ('ziph', 0.2846544682979584)]

In [165]:
bible_kjv_word2vec_model.wv['jesus']

array([ 0.10450251,  0.14936721,  0.6442774 ,  0.05124003, -0.13037749,
        0.07043558, -0.7090818 , -0.10004777,  0.48946014,  0.5314849 ,
       -0.26606488, -0.35777095,  0.11497089, -0.40360218,  0.0620638 ,
       -0.43668583,  0.10719658, -0.19995257, -0.19785257, -0.33812   ,
       -0.07336547,  0.25279567, -0.31729788,  0.05122983,  0.11655648,
        0.08348206,  0.30296907,  0.03241773, -0.7305899 , -0.16129759,
        0.00459224,  0.1873528 ,  0.09161553, -0.09941293,  0.4632847 ,
        0.05760025, -0.12523936, -0.70911896, -0.17300253,  0.1410986 ,
       -0.40287423,  0.03459988, -0.1407281 ,  0.3561677 ,  0.5421509 ,
       -0.1630545 , -0.15685983,  0.16352233,  0.05579595,  0.24692412,
       -0.41490299, -0.28556854, -0.08312353,  0.05107731, -0.24932218,
       -0.37900877,  0.32055244, -0.49453485,  0.21039945,  0.03431154,
       -0.25031725, -0.06534752, -0.20071128,  0.13566348,  0.07299663,
       -0.18114693,  0.04665125, -0.14220518, -0.17766911,  0.33

In [66]:
bible_kjv_word2vec_model.wv.save_word2vec_format("bible_word2vec_org", "bible_word2vec_vocabulary")

2021-09-01 10:35:04,830 : INFO : storing vocabulary in bible_word2vec_vocabulary
2021-09-01 10:35:04,840 : INFO : storing 3985x200 projection weights into bible_word2vec_org


In [67]:
bible_kjv_word2vec_model.save("bible_word2vec_gensim")

2021-09-01 10:35:05,409 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'bible_word2vec_gensim', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-09-01T10:35:05.409265', 'gensim': '4.0.1', 'python': '3.8.10 (default, May 19 2021, 13:12:57) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'saving'}
2021-09-01 10:35:05,412 : INFO : not storing attribute cum_table
2021-09-01 10:35:05,423 : INFO : saved bible_word2vec_gensim
