# How many names are found in a given version of the Bible in the places where the macula data says they are?

In [1]:
#!/usr/bin/env python3

import csv
from collections import Counter
import json
import numpy as np
import os
import pandas as pd
import re
from pathlib import Path
import sys
import urllib.request

In [2]:
# Root of the local trabina checkout. It defaults to the location this notebook was
# written against; set the TRABINA_ROOT environment variable to run it elsewhere
# without editing every path below.
repo_folder = Path(os.environ.get("TRABINA_ROOT", "D:/GitHub/davidbaines/trabina"))
data_folder = repo_folder / "data"

by_lang_folder = data_folder / "by-lang"
compare_col = "English gloss"
# Output file written by the last cell of the notebook.
matched_names_tsv = data_folder / "matched_eng_web_macula_names.tsv"
macula_json_file = repo_folder / "macula.json"

# Bible text to compare against, fetched over HTTP.
bible_url = r"https://raw.githubusercontent.com/BibleNLP/ebible-corpus/main/corpus/eng-eng-web.txt"

silnlp_assets_folder = repo_folder / "silnlp" / "assets"
silnlp_vref_file = silnlp_assets_folder / "vref.txt"


In [3]:
# Load the macula data from its JSON file into a DataFrame and show it.
macula = pd.read_json(path_or_buf=macula_json_file)
macula

Unnamed: 0,refs,Source,English gloss,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew,Greek source,Greek lemma,Greek normalized,English gloss of Greek,book,chapter_no,verse_no,word_no,ref_only,silnlp_line_number
0,GEN 2:4!8,יְהוָ֥ה,LORD,יְהוָ֥ה,LORD,,耶和华,,,,,GEN,2.0,4.0,8.0,GEN 2:4,35
1,GEN 2:5!15,יְהוָ֤ה,LORD,יְהוָ֤ה,LORD,,耶和华,,,,,GEN,2.0,5.0,15.0,GEN 2:5,36
2,GEN 2:7!2,יְהוָ֨ה,LORD,יְהוָ֨ה,LORD,,耶和华,,,,,GEN,2.0,7.0,2.0,GEN 2:7,38
3,GEN 2:8!2,יְהוָ֧ה,LORD,יְהוָ֧ה,LORD,κύριος,耶和华,,,,,GEN,2.0,8.0,2.0,GEN 2:8,39
4,GEN 2:8!5,עֵ֖דֶן,Eden,עֵ֖דֶן,Eden,εδεμ,伊甸,,,,,GEN,2.0,8.0,5.0,GEN 2:8,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38812,REV 22:13!6,"Ὦ,",Omega,,,,,"Ὦ,",Ὦ,Ὦ,Omega,REV,22.0,13.0,6.0,REV 22:13,31162
38813,REV 22:16!2,Ἰησοῦς,Jesus,,,,,Ἰησοῦς,Ἰησοῦς,Ἰησοῦς,Jesus,REV,22.0,16.0,2.0,REV 22:16,31165
38814,REV 22:16!20,"Δαυείδ,",of David,,,,,"Δαυείδ,",Δαυίδ,Δαυείδ,of David,REV,22.0,16.0,20.0,REV 22:16,31165
38815,REV 22:20!11,Ἰησοῦ.,Jesus,,,,,Ἰησοῦ.,Ἰησοῦς,Ἰησοῦ,Jesus,REV,22.0,20.0,11.0,REV 22:20,31169


In [4]:
def get_bible_text(filename: str) -> list:
    """
    Reads a UTF-8 text file from a local file or URL and returns its lines.

    Inputs:
        filename:  URL (anything whose text starts with 'http') or local
                   filepath; a str or a pathlib.Path is accepted
    Outputs:
        list of str, one entry per line of the file in file order, with the
        line terminator removed. Blank lines are kept as '' so that list
        positions still correspond to line numbers in the file.
    """
    # Accept Path objects too: slicing a Path would raise a TypeError.
    source = str(filename)
    if source[:4] == 'http':
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(source) as response:
            raw_lines = [line.decode('utf-8') for line in response.readlines()]
    else:
        with open(source, 'r', encoding='utf-8') as f:
            raw_lines = f.readlines()
    # Bytes fetched from a URL get no newline translation, so strip '\r' as well;
    # otherwise CRLF files would leave a trailing '\r' on every verse.
    return [line.rstrip('\r\n') for line in raw_lines]

In [5]:
# Read in a Bible
bible_lines = get_bible_text(bible_url)
print(bible_lines[:10])

# Key every verse by its 1-based line number in the downloaded file.
bible_dict = dict(enumerate(bible_lines, start=1))

# Make a Dataframe from the dictionary: index = line number, one 'verse' column.
bible = pd.DataFrame.from_dict(bible_dict, orient='index', dtype=str, columns=['verse'])
is_blank_verse = bible.verse == ''
print(is_blank_verse)

# Discard the lines that carry no text; the remaining rows keep their original line numbers.
bible = bible.drop(index=bible[is_blank_verse].index)
bible


['In the beginning, God created the heavens and the earth.', 'The earth was formless and empty. Darkness was on the surface of the deep and God’s Spirit was hovering over the surface of the waters.', 'God said, “Let there be light,” and there was light.', 'God saw the light, and saw that it was good. God divided the light from the darkness.', 'God called the light “day”, and the darkness he called “night”. There was evening and there was morning, the first day.', 'God said, “Let there be an expanse in the middle of the waters, and let it divide the waters from the waters.”', 'God made the expanse, and divided the waters which were under the expanse from the waters which were above the expanse; and it was so.', 'God called the expanse “sky”. There was evening and there was morning, a second day.', 'God said, “Let the waters under the sky be gathered together to one place, and let the dry land appear;” and it was so.', 'God called the dry land “earth”, and the gathering together of the w

Unnamed: 0,verse
1,"In the beginning, God created the heavens and ..."
2,The earth was formless and empty. Darkness was...
3,"God said, “Let there be light,” and there was ..."
4,"God saw the light, and saw that it was good. G..."
5,"God called the light “day”, and the darkness h..."
...,...
37832,"Who shall tell my Lord? The Lord himself, he h..."
37833,He sent forth his angel and took me from my fa...
37834,My brothers were handsome and tall; but the Lo...
37835,"I went out to meet the Philistine, and he curs..."


In [6]:
# For each English gloss, does it appear in the verse?
# Add the verse to the macula data by matching silnlp_line_number against the
# line-number index of `bible`. This is a left join, so every macula row is kept.
# Any 'verse' column left over from a previous run of this cell is dropped first;
# otherwise re-running would produce verse_x / verse_y and break later cells.
macula = pd.merge(
    macula.drop(columns='verse', errors='ignore'),
    bible,
    how='left',
    left_on='silnlp_line_number',
    right_index=True,
)
macula

Unnamed: 0,refs,Source,English gloss,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew,Greek source,Greek lemma,Greek normalized,English gloss of Greek,book,chapter_no,verse_no,word_no,ref_only,silnlp_line_number,verse
0,GEN 2:4!8,יְהוָ֥ה,LORD,יְהוָ֥ה,LORD,,耶和华,,,,,GEN,2.0,4.0,8.0,GEN 2:4,35,This is the history of the generations of the ...
1,GEN 2:5!15,יְהוָ֤ה,LORD,יְהוָ֤ה,LORD,,耶和华,,,,,GEN,2.0,5.0,15.0,GEN 2:5,36,"No plant of the field was yet in the earth, an..."
2,GEN 2:7!2,יְהוָ֨ה,LORD,יְהוָ֨ה,LORD,,耶和华,,,,,GEN,2.0,7.0,2.0,GEN 2:7,38,Yahweh God formed man from the dust of the gro...
3,GEN 2:8!2,יְהוָ֧ה,LORD,יְהוָ֧ה,LORD,κύριος,耶和华,,,,,GEN,2.0,8.0,2.0,GEN 2:8,39,"Yahweh God planted a garden eastward, in Eden,..."
4,GEN 2:8!5,עֵ֖דֶן,Eden,עֵ֖דֶן,Eden,εδεμ,伊甸,,,,,GEN,2.0,8.0,5.0,GEN 2:8,39,"Yahweh God planted a garden eastward, in Eden,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38812,REV 22:13!6,"Ὦ,",Omega,,,,,"Ὦ,",Ὦ,Ὦ,Omega,REV,22.0,13.0,6.0,REV 22:13,31162,"I am the Alpha and the Omega, the First and th..."
38813,REV 22:16!2,Ἰησοῦς,Jesus,,,,,Ἰησοῦς,Ἰησοῦς,Ἰησοῦς,Jesus,REV,22.0,16.0,2.0,REV 22:16,31165,"I, Jesus, have sent my angel to testify these ..."
38814,REV 22:16!20,"Δαυείδ,",of David,,,,,"Δαυείδ,",Δαυίδ,Δαυείδ,of David,REV,22.0,16.0,20.0,REV 22:16,31165,"I, Jesus, have sent my angel to testify these ..."
38815,REV 22:20!11,Ἰησοῦ.,Jesus,,,,,Ἰησοῦ.,Ἰησοῦς,Ἰησοῦ,Jesus,REV,22.0,20.0,11.0,REV 22:20,31169,"He who testifies these things says, “Yes, I am..."


In [13]:
# Preview: is each row's English gloss a case-insensitive substring of its verse?
# Missing values become '' first, so a blank gloss is always reported as present.
(
    macula[['English gloss', 'verse']]
    .fillna('')
    .apply(lambda pair: pair['English gloss'].lower() in pair['verse'].lower(), axis=1)
)

0        False
1        False
2        False
3        False
4         True
         ...  
38812     True
38813     True
38814     True
38815     True
38816     True
Length: 38817, dtype: bool

In [15]:
# Flag every macula row whose English gloss occurs, ignoring case, as a substring of its verse.
# Missing glosses and verses are replaced by '' before comparing.
# NOTE: '' is a substring of every string, so rows with an empty gloss are always flagged True.
gloss_and_verse = macula[['English gloss', 'verse']].fillna('')
macula['English_gloss_in_eng_web'] = gloss_and_verse.apply(
    lambda pair: pair['English gloss'].lower() in pair['verse'].lower(),
    axis=1,
)
macula

Unnamed: 0,refs,Source,English gloss,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew,Greek source,Greek lemma,Greek normalized,English gloss of Greek,book,chapter_no,verse_no,word_no,ref_only,silnlp_line_number,verse,English_gloss_in_eng_web
0,GEN 2:4!8,יְהוָ֥ה,LORD,יְהוָ֥ה,LORD,,耶和华,,,,,GEN,2.0,4.0,8.0,GEN 2:4,35,This is the history of the generations of the ...,False
1,GEN 2:5!15,יְהוָ֤ה,LORD,יְהוָ֤ה,LORD,,耶和华,,,,,GEN,2.0,5.0,15.0,GEN 2:5,36,"No plant of the field was yet in the earth, an...",False
2,GEN 2:7!2,יְהוָ֨ה,LORD,יְהוָ֨ה,LORD,,耶和华,,,,,GEN,2.0,7.0,2.0,GEN 2:7,38,Yahweh God formed man from the dust of the gro...,False
3,GEN 2:8!2,יְהוָ֧ה,LORD,יְהוָ֧ה,LORD,κύριος,耶和华,,,,,GEN,2.0,8.0,2.0,GEN 2:8,39,"Yahweh God planted a garden eastward, in Eden,...",False
4,GEN 2:8!5,עֵ֖דֶן,Eden,עֵ֖דֶן,Eden,εδεμ,伊甸,,,,,GEN,2.0,8.0,5.0,GEN 2:8,39,"Yahweh God planted a garden eastward, in Eden,...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38812,REV 22:13!6,"Ὦ,",Omega,,,,,"Ὦ,",Ὦ,Ὦ,Omega,REV,22.0,13.0,6.0,REV 22:13,31162,"I am the Alpha and the Omega, the First and th...",True
38813,REV 22:16!2,Ἰησοῦς,Jesus,,,,,Ἰησοῦς,Ἰησοῦς,Ἰησοῦς,Jesus,REV,22.0,16.0,2.0,REV 22:16,31165,"I, Jesus, have sent my angel to testify these ...",True
38814,REV 22:16!20,"Δαυείδ,",of David,,,,,"Δαυείδ,",Δαυίδ,Δαυείδ,of David,REV,22.0,16.0,20.0,REV 22:16,31165,"I, Jesus, have sent my angel to testify these ...",True
38815,REV 22:20!11,Ἰησοῦ.,Jesus,,,,,Ἰησοῦ.,Ἰησοῦς,Ἰησοῦ,Jesus,REV,22.0,20.0,11.0,REV 22:20,31169,"He who testifies these things says, “Yes, I am...",True


In [17]:
# Per English gloss, count the rows where the gloss was found in its verse and the rows where it was not.
hits_by_gloss = macula.groupby('English gloss')['English_gloss_in_eng_web']
found_per_gloss = hits_by_gloss.sum().astype(int)   # True counts as 1
rows_per_gloss = hits_by_gloss.size()

bible_wordcount_found = found_per_gloss.reset_index(name='Found')
# The flag column is strictly boolean, so "not found" is simply the remainder.
bible_wordcount_not_found = (rows_per_gloss - found_per_gloss).reset_index(name='Not_found')
print(bible_wordcount_found)

bible_wordcount = pd.merge(bible_wordcount_found, bible_wordcount_not_found, on='English gloss')
print(bible_wordcount)

# Share of occurrences found. Columns are addressed by name: positional x[1] / x[2]
# on a string-labelled row is deprecated in pandas 2.x. Every group has at least
# one row, so the denominator is never zero.
bible_wordcount['Found_ratio'] = bible_wordcount['Found'] / (
    bible_wordcount['Found'] + bible_wordcount['Not_found']
)


           English gloss  Found
0                           196
1             A Nazarene      1
2            A Samaritan      0
3                  Aaron    315
4                Aaron’s     22
...                  ...    ...
3210     with Samaritans      1
3211          with Simon      1
3212  with [the] Tyrians      0
3213              zaphon      0
3214              ~south      0

[3215 rows x 2 columns]
           English gloss  Found  Not_found
0                           196          0
1             A Nazarene      1          0
2            A Samaritan      0          1
3                  Aaron    315          0
4                Aaron’s     22         11
...                  ...    ...        ...
3210     with Samaritans      1          0
3211          with Simon      1          0
3212  with [the] Tyrians      0          1
3213              zaphon      0          1
3214              ~south      0          2

[3215 rows x 3 columns]


In [19]:
# Save the per-gloss counts as a tab-separated file (the DataFrame index is written too, the pandas default).
bible_wordcount.to_csv(path_or_buf=matched_names_tsv, sep='\t')