<a href="https://colab.research.google.com/github/chi-yan/notebooks/blob/master/NLP_(Bible_Text_Encoding_with_the_Universal_Text_Encoder).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This Google Colab notebook uses Natural Language Processing (NLP) techniques
to find the Bible verses that are most similar in meaning to an arbitrary block of text. Each verse is represented numerically as a high-dimensional embedding vector (512 dimensions) produced by the "Universal Sentence Encoder".

Cosine similarity is used to quantify the similarity (in meaning) between sentences. A similarity close to 1 indicates that two sentences are similar in meaning, while a similarity close to 0 indicates that they are unrelated.

References:

http://www.ijstr.org/final-print/dec2019/Lexical-And-Semantic-Analysis-Of-Sacred-Texts-Using-Machine-Learning-And-Natural-Language-Processing.pdf (see Section 3.4)

https://tfhub.dev/google/universal-sentence-encoder/1

In [1]:
import numpy as np # linear algebra
import pandas as pd #
import tensorflow_hub as hub
import csv
import pprint
from tabulate import tabulate
from collections import OrderedDict
from operator import itemgetter
# Colab form field (the trailing #@param annotation renders a free-text input).
# NOTE(review): no cell visible in this notebook reads the global `text`
# (generateSimilarities takes its own `text` argument) — confirm it is still needed.
text = 'cats and dogs' #@param {type:"string", run:"auto"}
 
def cosine_similarity(A, B):
    '''
    Compute the cosine similarity between two vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: A numpy array which corresponds to a word vector
    Output:
        cos: numerical number representing the cosine similarity between A and B,
             i.e. dot(A, B) / (||A|| * ||B||). Returns 0.0 when either vector has
             zero norm, where the ratio would otherwise be undefined (0/0).
    '''
    dot = np.dot(A, B)
    norm_product = np.linalg.norm(A) * np.linalg.norm(B)

    # A zero vector has no direction; report "no similarity" rather than
    # letting numpy emit a RuntimeWarning and hand back nan.
    if norm_product == 0:
        return 0.0

    return dot / norm_product
 
if 'init' not in locals():
  init = True
  !curl https://raw.githubusercontent.com/EswarGitHub/BibleSearch/master/bible_data_set.csv -o bible_data_set.csv
  reader = csv.reader(open('bible_data_set.csv', 'r'))
  d = {}
  for row in reader:
    k,_,_,_,v = row
    d[k] = v[:-2] #    
  embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
  embeddings = embed(list(d.values()))
  versedict = {}
  for i, verse in enumerate(d):
    versedict[i] = verse
 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5010k  100 5010k    0     0  10.1M      0 --:--:-- --:--:-- --:--:-- 10.1M


In [2]:

import pickle
# Serialise the `embeddings` object to embeddings.pickle (relative path, so it
# is written to the current working directory). The `with` block closes the file.
with open('embeddings.pickle', 'wb') as f:
    pickle.dump(embeddings, f)

In [3]:
import pickle
# Serialise `d` (verse reference -> verse text) to data.pickle in the current
# working directory.
with open('data.pickle', 'wb') as f:
    pickle.dump(d, f)

In [4]:
# Display the embeddings tensor: one row per entry of `d`.
embeddings

<tf.Tensor: shape=(31103, 512), dtype=float32, numpy=
array([[ 0.00468304, -0.08392566, -0.01649407, ..., -0.03823555,
         0.00346091, -0.10069176],
       [-0.05283082, -0.00314456,  0.05393992, ..., -0.05066366,
         0.05926256,  0.00790901],
       [-0.04331737,  0.01895819, -0.04516518, ..., -0.00918443,
         0.02742366,  0.04344476],
       ...,
       [-0.03668365, -0.02885344,  0.04155295, ..., -0.01788977,
         0.09710161,  0.07535172],
       [-0.01769449, -0.03582925, -0.00357679, ...,  0.01731504,
        -0.03287546,  0.01774418],
       [-0.01537722, -0.04798327, -0.03328145, ...,  0.02343451,
         0.01298576,  0.00462452]], dtype=float32)>

In [5]:
import pickle
# Serialise `versedict` (row index -> verse reference) to versedict.pickle in
# the current working directory.
with open('versedict.pickle', 'wb') as f:
    pickle.dump(versedict, f)

In [6]:
# Spot-check: look up the stored text of one verse by its reference.
d['Genesis 32:15']

'Thirty milch camels with their colts, forty kine, and ten bulls, twenty she asses, and ten foals.'

In [7]:

def cosine_similarities(A, B): #A is the big matrix containing all embeddings, B is the sentence
    '''
    Cosine similarity between every row of A and the single vector B.

    Input:
        A: 2-D numpy array of shape (n, dim), one embedding per row.
        B: numpy array holding one embedding of dim elements, either as a
           column vector of shape (dim, 1) or as a flat array of shape (dim,).
    Output:
        1-D numpy array of length n whose element i is
        dot(A[i], B) / (||A[i]|| * ||B||).
    '''
    A = np.asarray(A)
    # Flatten B so the dot product is 1-D. Dividing an (n, 1) dot product by
    # an (n,) vector of norms would broadcast to an (n, n) matrix and scale
    # every score by the norm of row 0 instead of by its own row's norm.
    b = np.asarray(B).reshape(-1)
    row_norms = np.linalg.norm(A, axis=1)  # vectorised per-row L2 norm
    return np.dot(A, b) / (row_norms * np.linalg.norm(b))

In [8]:


def generateSimilarities(text, numberOfItems):
  '''
  Rank the stored verses by their similarity to `text`.

  Input:
      text: the query string; it is embedded with the notebook-level `embed` model.
      numberOfItems: how many of the top-ranked rows to return.
  Output:
      A pandas DataFrame with columns 'Verse', 'Text' and 'Similarity', sorted
      by 'Similarity' in descending order and cut to the first numberOfItems
      rows. The index is the verse's row position in `embeddings`.
  '''
  # Embed the query and transpose it before handing it to cosine_similarities.
  query_vector = embed([text]).numpy().T
  scores = cosine_similarities(embeddings.numpy(), query_vector)

  # One [reference, text, score] record per embedded verse.
  rows = [
      [versedict[idx], d[versedict[idx]], scores[idx]]
      for idx in range(len(embeddings))
  ]

  ranked = pd.DataFrame(rows, columns=['Verse', 'Text', 'Similarity'])
  ranked = ranked.sort_values(by=['Similarity'], ascending=False)
  return ranked.head(numberOfItems)

# Widen text columns so long verse texts are shown in full. The fully
# qualified option name is used because pandas warns that abbreviated names
# such as 'max_colwidth' can break when similarly named options are added.
pd.set_option('display.max_colwidth', 500)

# Example query: the five verses ranked most similar to "only son".
generateSimilarities("only son",5)


Unnamed: 0,Verse,Text,Similarity
10563,1 Chronicles 7:27,"Non his son, Jehoshuah his son.",0.454879
26063,John 1:18,"No man hath seen God at any time, the only begotten Son, which is in the bosom of the Father, he hath declared him.",0.422034
564,Genesis 22:16,"And said, By myself have I sworn, saith the LORD, for because thou hast done this thing, and hast not withheld thy son, thine only son:",0.414489
25340,Luke 9:38,"And, behold, a man of the company cried out, saying, Master, I beseech thee, look upon my son: for he is mine only child.",0.39974
27396,Acts 13:33,"God hath fulfilled the same unto us their children, in that he hath raised up Jesus again; as it is also written in the second psalm, Thou art my Son, this day have I begotten thee.",0.386815


In [9]:
# Sanity check: query with the text of a stored verse; that verse itself is
# expected to rank first.
generateSimilarities(d['Genesis 1:1'],5)

Unnamed: 0,Verse,Text,Similarity
1,Genesis 1:1,In the beginning God created the heaven and the earth.,1.0
35,Genesis 2:4,"These are the generations of the heavens and of the earth when they were created, in the day that the LORD God made the earth and the heavens,",0.738225
15846,Psalms 115:15,Ye are blessed of the LORD which made heaven and earth.,0.626305
16626,Proverbs 8:23,"I was set up from everlasting, from the beginning, or ever the earth was.",0.624477
15820,Psalms 113:6,"Who humbleth himself to behold the things that are in heaven, and in the earth!",0.604734


In [164]:
# NOTE(review): `df` is not defined in any cell visible above, and this cell's
# execution count (164) is out of sequence with the rest of the notebook, so it
# presumably relies on state from an earlier session and would raise NameError
# on a fresh kernel — confirm where `df` came from, or replace this cell with a
# generateSimilarities(...) call.
pd.set_option('max_colwidth', 500)
print(df.sort_values(by=['Similarity'], ascending=False))


               Verse                                                                                                                                                       Text  Similarity
26137      John 3:17                                               For God sent not his Son into the world to condemn the world; but that the world through him might be saved.    0.523826
26136      John 3:16              For God so loved the world, that he gave his only begotten Son, that whosoever believeth in him should not perish, but have everlasting life.    0.452966
28148    Romans 8:32                                 He that spared not his own Son, but delivered him up for us all, how shall he not with him also freely give us all things?    0.427742
4730   Numbers 32:12                                           Save Caleb the son of Jephunneh the Kenezite, and Joshua the son of Nun: for they have wholly followed the LORD.    0.427722
23738  Matthew 18:11                                        