# Match Biblical names from multiple versions and across languages.
The JHU trabina project has done this with a list of 1128 names across 531 languages.
It would be good also to 'read' through the projects or extracts we have to do a similar matching.
We have some hand crafted data to get started: The Macula dataset and also the All Biblical Terms and Major Biblical Terms lists from Paratext.

In [1]:
#!/usr/bin/env python3

import csv
from collections import Counter
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import numpy as np
import os
import pandas as pd

from pathlib import Path
import sys

In [2]:
def move_column(df, column_name, column_index):
    col = df.pop(column_name)
    return df.insert(column_index, col.name, col)

In [29]:
data_folder = Path("D:/GitHub/davidbaines/trabina/data")
by_lang_folder = data_folder / "by-lang"
jhu_filename = "eng"
compare_col = "English gloss"
english_names_file = by_lang_folder / jhu_filename
major_terms_out = data_folder / "major_terms.tsv"

hebrew_refs_and_names = data_folder / "hebrew_refs_and_names.tsv"
hebrew_cols = {0: 'ref', 1: 'Hebrew source', 2: 'English gloss of Hebrew', 3:'Greek gloss of Hebrew', 4:'Mandarin gloss of Hebrew'}

greek_refs_and_names = data_folder / "greek_refs_and_names.tsv"
greek_cols = {0:'ref', 1:'Greek source', 2:'Greek lemma', 3:'Greek normalized', 4:'English gloss of Greek'}

In [4]:
def read_tsv(file,column_names):
    df = pd.read_table(file, header=None, dtype=str, sep='\t')
    df.fillna('', inplace=True)
    df.rename(column_names, axis="columns", inplace=True)
    return df

In [5]:
hebrew = read_tsv(hebrew_refs_and_names, hebrew_cols)
hebrew

Unnamed: 0,ref,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew
0,GEN 2:4!8,יְהוָ֥ה,LORD,,耶和华
1,GEN 2:5!15,יְהוָ֤ה,LORD,,耶和华
2,GEN 2:7!2,יְהוָ֨ה,LORD,,耶和华
3,GEN 2:8!2,יְהוָ֧ה,LORD,κύριος,耶和华
4,GEN 2:8!5,עֵ֖דֶן,Eden,εδεμ,伊甸
...,...,...,...,...,...
34182,MAL 3:22!3,מֹשֶׁ֣ה,Moses,μωυσῆ,摩西
34183,MAL 3:22!8,חֹרֵב֙,Horeb,ξωρηβ,何烈山
34184,MAL 3:22!11,יִשְׂרָאֵ֔ל,Israel,ισραηλ,以色列
34185,MAL 3:23!6,אֵלִיָּ֣ה,Elijah,ηλιαν,以利亚


In [6]:
len(hebrew['ref'].unique())

34187

In [7]:
#hebrew.loc[hebrew['Hebrew source'] == 'יְהוָ֧ה']
any(hebrew['ref'].duplicated()) 

False

In [8]:
greek = read_tsv(greek_refs_and_names, greek_cols)
greek

Unnamed: 0,ref,Greek source,Greek lemma,Greek normalized,English gloss of Greek
0,MAT 1:1!3,Ἰησοῦ,Ἰησοῦς,Ἰησοῦ,of Jesus
1,MAT 1:1!4,Χριστοῦ,Χριστός,Χριστοῦ,Christ
2,MAT 1:1!6,Δαυεὶδ,Δαυίδ,Δαυείδ,of David
3,MAT 1:1!8,Ἀβραάμ.,Ἀβραάμ,Ἀβραάμ,of Abraham
4,MAT 1:2!1,Ἀβραὰμ,Ἀβραάμ,Ἀβραάμ,Abraham
...,...,...,...,...,...
4627,REV 22:13!6,"Ὦ,",Ὦ,Ὦ,Omega
4628,REV 22:16!2,Ἰησοῦς,Ἰησοῦς,Ἰησοῦς,Jesus
4629,REV 22:16!20,"Δαυείδ,",Δαυίδ,Δαυείδ,of David
4630,REV 22:20!11,Ἰησοῦ.,Ἰησοῦς,Ἰησοῦ,Jesus


In [9]:
# Don't understand how these refs compare as duplicates.
print(len(greek['ref'].unique()))

#print(greek['ref'].duplicated())
#duplicate_ref = greek.duplicated(subset=['ref'])
#if duplicate_ref.any():
#    print(greek.loc[~duplicate_ref], end='\n')
 
#for line in greek.groupby(['ref']):
#    print(line)

# Ah it looks as though this has read in the list twice.
# Fixed in text file - need to fix BaseX query too.


4632


In [10]:
# Combine the Hebrew and Greek dataframes.
macula = pd.concat([hebrew,greek], sort=False)
macula['refs'] = macula['ref']
macula.set_index(['ref'], inplace = True)

macula = macula.fillna('')
macula['Source'] = macula['Hebrew source'] + macula['Greek source']
macula['English gloss'] = macula['English gloss of Hebrew'] + macula['English gloss of Greek']

move_column(macula,'Source',0)
move_column(macula,'English gloss',1)
move_column(macula,'refs',0)

#source = macula.pop('Source')
#macula.insert(0, source.name, source)
#english = macula.pop('English gloss')
#macula.insert(1, english.name, english)
#refs = macula.pop('refs')
#macula.insert(0,refs.name, refs)

macula

Unnamed: 0_level_0,refs,Source,English gloss,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew,Greek source,Greek lemma,Greek normalized,English gloss of Greek
ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GEN 2:4!8,GEN 2:4!8,יְהוָ֥ה,LORD,יְהוָ֥ה,LORD,,耶和华,,,,
GEN 2:5!15,GEN 2:5!15,יְהוָ֤ה,LORD,יְהוָ֤ה,LORD,,耶和华,,,,
GEN 2:7!2,GEN 2:7!2,יְהוָ֨ה,LORD,יְהוָ֨ה,LORD,,耶和华,,,,
GEN 2:8!2,GEN 2:8!2,יְהוָ֧ה,LORD,יְהוָ֧ה,LORD,κύριος,耶和华,,,,
GEN 2:8!5,GEN 2:8!5,עֵ֖דֶן,Eden,עֵ֖דֶן,Eden,εδεμ,伊甸,,,,
...,...,...,...,...,...,...,...,...,...,...,...
REV 22:13!6,REV 22:13!6,"Ὦ,",Omega,,,,,"Ὦ,",Ὦ,Ὦ,Omega
REV 22:16!2,REV 22:16!2,Ἰησοῦς,Jesus,,,,,Ἰησοῦς,Ἰησοῦς,Ἰησοῦς,Jesus
REV 22:16!20,REV 22:16!20,"Δαυείδ,",of David,,,,,"Δαυείδ,",Δαυίδ,Δαυείδ,of David
REV 22:20!11,REV 22:20!11,Ἰησοῦ.,Jesus,,,,,Ἰησοῦ.,Ἰησοῦς,Ἰησοῦ,Jesus


In [11]:
unique_names = set()
count_names = 0
name_columns = ['Source', 'English gloss of Hebrew', 'Greek gloss of Hebrew', 'Mandarin gloss of Hebrew', 'Greek lemma', 'Greek normalized', 'English gloss of Greek']
for col in name_columns:
    col_unique = set(macula[col].unique())
    print(f"There are {len(col_unique)}\t different names in the '{col}' column.")
    # Count how many unique names in each column.
    count_names += len(col_unique)
    unique = col_unique
    unique_names = unique_names | unique

print(f"Of the {count_names} names in all the name columns {len(unique_names)} are unique. We can match on these across the five languages in the Macula dataset.\n")
print(f"{macula.nunique()}")



There are 10851	 different names in the 'Source' column.
There are 2569	 different names in the 'English gloss of Hebrew' column.
There are 3435	 different names in the 'Greek gloss of Hebrew' column.
There are 2396	 different names in the 'Mandarin gloss of Hebrew' column.
There are 534	 different names in the 'Greek lemma' column.
There are 867	 different names in the 'Greek normalized' column.
There are 760	 different names in the 'English gloss of Greek' column.
Of the 21412 names in all the name columns 20479 are unique. We can match on these across the five languages in the Macula dataset.

refs                        38819
Source                      10851
English gloss                3215
Hebrew source                9534
English gloss of Hebrew      2569
Greek gloss of Hebrew        3435
Mandarin gloss of Hebrew     2396
Greek source                 1319
Greek lemma                   534
Greek normalized              867
English gloss of Greek        760
dtype: int64


In [12]:
def get_name_matrix(folder):
    
    all_names = dict()
    
    folder = Path(folder)
    files = sorted(folder.glob(r'*'))
    #print([file.name[0:3] for file in files])
    
    for file in files:
        with open(file, 'r', encoding='utf-8') as fin:
            names = [name.strip('\n').title() for name in fin.readlines()]
            all_names[file.name] = names   
    
    return all_names

In [13]:
#Get all the names and make a dataframe
all_jhu_names = get_name_matrix(by_lang_folder)
jhu = pd.DataFrame.from_dict(all_jhu_names, dtype=str)
jhu.replace('-','', inplace=True)
# jhu_columns = [col for col in jhu.columns]
# print(jhu_columns)
print(f"{jhu.nunique()}")

aai_aai               483
aak_aak               471
aau_aau               481
abt_maprik            464
aby_aby               456
                     ... 
ukr_1871              491
urd_arabic            479
vie_1926compounds     987
xho_1996             1005
zul_zul               493
Length: 592, dtype: int64


In [14]:
jhu['eng_index'] = jhu['eng']
jhu = jhu.set_index(['eng_index'], drop=True, append=False, inplace=False, verify_integrity=False)
jhu.describe()

  jhu['eng_index'] = jhu['eng']


Unnamed: 0,aai_aai,aak_aak,aau_aau,abt_maprik,aby_aby,acd_acd,ace_ace,acf_acf,acn_acn,acr_cubulcu,...,tpa_tpa,tpi_tpi,tpm_tpm,tsn_1908,tur_2009,ukr_1871,urd_arabic,vie_1926compounds,xho_1996,zul_zul
count,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,...,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0,1129.0
unique,483.0,471.0,481.0,464.0,456.0,478.0,470.0,491.0,471.0,482.0,...,96.0,973.0,953.0,488.0,425.0,491.0,479.0,987.0,1005.0,493.0
top,,,,,,,,,,,...,,,,,,,,,,
freq,556.0,575.0,549.0,577.0,583.0,558.0,568.0,553.0,551.0,555.0,...,1006.0,34.0,70.0,548.0,619.0,553.0,556.0,27.0,51.0,568.0


In [28]:
assets_folder = Path('D:/GitHub/davidbaines/trabina/silnlp/assets')
patterns = ['All', 'Major', 'SilNt'] #'Pt6' doesn't have any glosses.

def read_assets_data(folder, pattern):
    # Function to read in the various PT metadata files. 
    # Each are read in differently.
    
    metadata_file = folder / f"{pattern}-metadata.txt"
    glosses_files = folder.glob(f"*-{pattern}-glosses.txt")
    vrefs_file = folder / f"{pattern}-vrefs.txt"
    print(f"Reading in {pattern} files.")
    
    # The assest folder contains files with pattern from ['Major', 'All', SilNt', 'Pt6']
    # Different sets have different data. 
    
    # Glosses exist for certain languages in separate files.
    # Not all files exist for all patterns. The 'Major' files are as follows:
    # en-Major-glosses.txt, en-Pt6-glosses.txt , en-SilNt-glosses.txt es-Major-glosses.txt fr-Major-glosses.txt, id-Major-glosses.txt Major-metadata.txt , Major-vrefs.txt
    
    # Reading in vrefs is the same for all patterns:
    vrefs = pd.read_csv(vrefs_file,  names=['vrefs'], converters={'vrefs': lambda x: x.split('\t')})     
    #vrefs = pd.read_csv(vrefs_file,header=None).squeeze("columns")
    #vrefs.rename({0: "vrefs"}, axis="columns", inplace=True)
    #vrefs = [vref for vref in vrefs.str.split('\t', expand=True)
    
    if pattern == 'All':
        # This dataset doesn't include sense numbers. Only the first column contains data.
        # The column contains (DC) and (AR) which need to be split off.
        #print(metadata_file)
        
        terms = pd.read_table(metadata_file,header=None, usecols=[0]).squeeze("columns")
        terms.rename('terms')
        
        terms = terms.str.split(' ', expand=True)
        
        terms.rename({0: "term", 1: "note"}, axis="columns", inplace=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
    
    if pattern == 'Major':
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        terms[['term', 'note']] = terms['term'].str.split(' ', 1, expand=True)

        terms[['term', 'sense']] = terms['term'].str.split('-', 1, expand=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
        
    if pattern == 'SilNt':
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        # The domain column is empty.
        terms.drop(columns=['domain'],inplace=True)
        
        
    isos  = list()
    for gloss_file in glosses_files:
        iso = gloss_file.name[:gloss_file.name.find("-")]
        terms[iso] = pd.read_table(gloss_file,header=None, usecols=[0]).squeeze("columns")
        terms[iso] = terms[iso].fillna('')
    return terms, vrefs


all_terms, all_vrefs = read_assets_data(assets_folder, 'All')
major_terms, major_vrefs = read_assets_data(assets_folder, 'Major')
silnt_terms, silnt_vrefs = read_assets_data(assets_folder, 'SilNt')


print(f"\nAll terms:\n{all_terms.nunique()}\n")
print(all_terms)
print()
#print(f"All vrefs:\n{all_vrefs}\n\n")
print(f"Major terms:\n{major_terms.nunique()}\n")
print(major_terms)
print()
#print(f"Major vrefs:\n{major_vrefs}\n\n")

print(f"SilNt terms:\n{silnt_terms.nunique()}\n")
print(silnt_terms)
print()
#print(f"SilNt vrefs:\n{silnt_vrefs}")



Reading in All files.
Reading in Major files.
Reading in SilNt files.

All terms:
term    17350
AR          2
DC          2
en      13195
dtype: int64

            term     AR     DC           en
0            אֵב  False  False          bud
1      אֲבַגְתָא  False  False      Abagtha
2            אבד  False  False       perish
3          אֹבֵד  False  False  destruction
4       אֲבַדֹּה  False  False  destruction
...          ...    ...    ...          ...
20578        ὥρα  False   True             
20579         ὥς  False   True             
20580        ὧδε  False   True             
20581       ὦμος  False   True             
20582        ᾠδή  False   True             

[20583 rows x 4 columns]

Major terms:
term        6083
domain         8
category     565
sense         18
AR             2
DC             2
en          5344
es          4929
fr          5463
id          3411
dtype: int64

            term domain                      category sense     AR     DC  \
0      אֲבַגְתָא   

In [32]:
major_terms.to_csv(r"D:\GitHub\davidbaines\trabina\data\major_terms.tsv", sep = '\t')
major_vrefs.to_csv(r"D:\GitHub\davidbaines\trabina\data\major_vrefs.txt", sep = '\t')

In [55]:
major_vrefs

Unnamed: 0,vrefs
0,[EST 1:10]
1,[JOB 9:26]
2,"[JOB 39:9, PRO 14:4, ISA 1:3]"
3,[NUM 11:5]
4,[2KI 18:2]
...,...
8643,"[1ES 5:12, 1ES 9:30]"
8644,[1ES 5:22]
8645,[JDT 8:1]
8646,[1ES 9:23]


In [64]:
# Read in the OT Bible text
with open(r"D:\GitHub\davidbaines\trabina\data\hbo-hboWLC.txt", 'r', encoding='utf-8') as OT:
    ot_lines = [line.strip('\n') for line in OT.readlines()]
    
with open(r"D:\GitHub\davidbaines\trabina\silnlp\assets\vref.txt" , 'r', encoding='utf-8') as vref:
    ot_vrefs = [line.strip('\n') for line in vref.readlines()]    

In [74]:
print(ot_lines[:10],ot_vrefs[:10])

heb_ot = {ot_vref:verse for ot_vref,verse in zip(ot_vrefs,ot_lines)}
for i, items in enumerate(heb_ot.items()):
    if i > 10:
        break
    else:
        print(items)
# heb_ot is a dictionary with the verse reference as the key and the verse as the value.
# It is the Hebrew OT (Westminster Leningrad codex ) from Paratext extracted line by line by silnlp.

['בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃', 'וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְה֑וֹם וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמָּֽיִם׃', 'וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽיְהִי־אֽוֹר׃', 'וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב וַיַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָא֖וֹר וּבֵ֥ין הַחֹֽשֶׁךְ׃', 'וַיִּקְרָ֨א אֱלֹהִ֤ים ׀ לָאוֹר֙ י֔וֹם וְלַחֹ֖שֶׁךְ קָ֣רָא לָ֑יְלָה וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר י֥וֹם אֶחָֽד׃ פ', 'וַיֹּ֣אמֶר אֱלֹהִ֔ים יְהִ֥י רָקִ֖יעַ בְּת֣וֹךְ הַמָּ֑יִם וִיהִ֣י מַבְדִּ֔יל בֵּ֥ין מַ֖יִם לָמָֽיִם׃', 'וַיַּ֣עַשׂ אֱלֹהִים֮ אֶת־הָרָקִיעַ֒ וַיַּבְדֵּ֗ל בֵּ֤ין הַמַּ֙יִם֙ אֲשֶׁר֙ מִתַּ֣חַת לָרָקִ֔יעַ וּבֵ֣ין הַמַּ֔יִם אֲשֶׁ֖ר מֵעַ֣ל לָרָקִ֑יעַ וַֽיְהִי־כֵֽן׃', 'וַיִּקְרָ֧א אֱלֹהִ֛ים לָֽרָקִ֖יעַ שָׁמָ֑יִם וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר י֥וֹם שֵׁנִֽי׃ פ', 'וַיֹּ֣אמֶר אֱלֹהִ֗ים יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֙יִם֙ אֶל־מָק֣וֹם אֶחָ֔ד וְתֵרָאֶ֖ה הַיַּבָּשָׁ֑ה וַֽיְהִי־כֵֽן׃', 'וַיִּקְרָ֨א אֱלֹהִ֤ים ׀ לַיַּבָּשָׁה֙ אֶ֔רֶץ ו

In [100]:
# Make a Dataframe from the dictionary
hebrew_ot = pd.DataFrame.from_dict(heb_ot, orient='index', dtype=str, columns=['verse'])
#print(hebrew_ot.verse == '')
#print()
#print(hebrew_ot[hebrew_ot.verse == ''])
#print()
#print(hebrew_ot[hebrew_ot.verse == ''].index)

#hebrew_ot.drop(hebrew_ot['verse']=='', inplace=True)
hebrew_ot.drop(hebrew_ot[hebrew_ot.verse == ''].index, inplace=True)
hebrew_ot

Unnamed: 0,verse
GEN 1:1,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...
GEN 1:2,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁך...
GEN 1:3,וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽיְהִי־אֽוֹר׃
GEN 1:4,וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב וַי...
GEN 1:5,וַיִּקְרָ֨א אֱלֹהִ֤ים ׀ לָאוֹר֙ י֔וֹם וְלַחֹ֖ש...
...,...
MAL 3:20,וְזָרְחָ֨ה לָכֶ֜ם יִרְאֵ֤י שְׁמִי֙ שֶׁ֣מֶשׁ צְ...
MAL 3:21,וְעַסּוֹתֶ֣ם רְשָׁעִ֔ים כִּֽי־יִהְי֣וּ אֵ֔פֶר ...
MAL 3:22,זִכְר֕וּ תּוֹרַ֖ת מֹשֶׁ֣ה עַבְדִּ֑י אֲשֶׁר֩ צִ...
MAL 3:23,הִנֵּ֤ה אָֽנֹכִי֙ שֹׁלֵ֣חַ לָכֶ֔ם אֵ֖ת אֵלִיָּ...


In [215]:
# Create a dictionary with a term, and a list of verse references where the term should appear.

terms = [term for term in major_terms['term']]
print(terms[:10])

# Get the list of references
reference_lists = [refs for refs in major_vrefs['vrefs']]
#print(reference_lists[0:10])

terms_and_refs_dict = {term:refs for term, refs in zip(term,reference_lists)}
terms_and_refs_dict

['אֲבַגְתָא', 'אֵבֶה', 'אֵבוּס', 'אֲבַטִּיחַ', 'אֲבִי', 'אֲבִי', 'אֲבִי־עַלְבוֹן', 'אֲבִיאֵל', 'אֲבִיאֵל', 'אֲבִיאָסָף']


{'אֲבַגְתָא': ['EST 1:10'],
 'אֵבֶה': ['JOB 9:26'],
 'אֵבוּס': ['JOB 39:9', 'PRO 14:4', 'ISA 1:3'],
 'אֲבַטִּיחַ': ['NUM 11:5'],
 'אֲבִי': ['JDG 6:11', 'JDG 6:24', 'JDG 8:32'],
 'אֲבִי־עַלְבוֹן': ['2SA 23:31'],
 'אֲבִיאֵל': ['1CH 11:32'],
 'אֲבִיאָסָף': ['EXO 6:24'],
 'אֲבִיגַיִל': ['2SA 17:25', '1CH 2:16', '1CH 2:17'],
 'אֲבִידָן': ['NUM 1:11', 'NUM 2:22', 'NUM 7:60', 'NUM 7:65', 'NUM 10:24'],
 'אֲבִידָע': ['GEN 25:4', '1CH 1:33'],
 'אֲבִיָּה': ['NEH 10:9', 'NEH 12:4', 'NEH 12:17'],
 'אֲבִיָּהוּ': ['2CH 13:20', '2CH 13:21'],
 'אֲבִיהוּא': ['EXO 6:23',
  'EXO 24:1',
  'EXO 24:9',
  'EXO 28:1',
  'LEV 10:1',
  'NUM 3:2',
  'NUM 3:4',
  'NUM 26:60',
  'NUM 26:61',
  '1CH 5:29',
  '1CH 24:1',
  '1CH 24:2'],
 'אֲבִיהוּד': ['1CH 8:3'],
 'אֲבִיהַיִל': ['2CH 11:18'],
 'אֲבִיחַיִל': ['EST 2:15', 'EST 9:29'],
 'אֲבִיטוּב': ['1CH 8:11'],
 'אֲבִיטַל': ['2SA 3:4', '1CH 3:3'],
 'אֲבִיָּם': ['1KI 14:31', '1KI 15:1', '1KI 15:7', '1KI 15:7', '1KI 15:8'],
 'אֲבִימָאֵל': ['GEN 10:28', '1CH 1:22'],
 'אֲב

In [217]:
# Perhaps we can invert this: For every verse which words do we expect to find in it?
# Create a list of verses with the set of terms that should appear in that verse.
print(terms[:10])

#refs_and_terms_df = pd.DataFrame([reference_lists], index = terms)
#refs_and_terms_df.set_index(, drop=True, append=False, inplace=True, verify_integrity=True)

#Are the terms unique?
#print(len(set(terms)) == len(terms))

refs_and_terms_df = pd.DataFrame(zip(terms,reference_lists),columns=['term','refs'])
refs_and_terms_df = refs_and_terms_df.explode('refs')
refs_and_terms_df

['אֲבַגְתָא', 'אֵבֶה', 'אֵבוּס', 'אֲבַטִּיחַ', 'אֲבִי', 'אֲבִי', 'אֲבִי־עַלְבוֹן', 'אֲבִיאֵל', 'אֲבִיאֵל', 'אֲבִיאָסָף']


Unnamed: 0,term,refs
0,אֲבַגְתָא,EST 1:10
1,אֵבֶה,JOB 9:26
2,אֵבוּס,JOB 39:9
2,אֵבוּס,PRO 14:4
2,אֵבוּס,ISA 1:3
...,...,...
8643,Ωλαμος,1ES 9:30
8644,Ωνους,1ES 5:22
8645,Ωξ,JDT 8:1
8646,Ωουδας,1ES 9:23


In [177]:
# Add a column which indicates whether or not the term is found in the verse.
#refs_and_terms_df['term_found_in_verse'] = 
test_term = refs_and_terms_df.term[1]
test_ref  = refs_and_terms_df.refs[1]
print(test_term,test_ref)
print(test_ref in hebrew_ot.index)

# This is equivalent to ``df1.at['a','A']``
# df1.loc['a', 'A']
# Out[54]: 0.13200317033032932

print(hebrew_ot.loc[test_ref,'verse'])
print(test_term in hebrew_ot.loc[test_ref,'verse'])

אֵבֶה JOB 9:26
True
חָ֭לְפוּ עִם־אֳנִיּ֣וֹת אֵבֶ֑ה כְּ֝נֶ֗שֶׁר יָט֥וּשׂ עֲלֵי־אֹֽכֶל׃
False


In [201]:
refs_and_terms_df['ref_exists'] =  refs_and_terms_df.refs.isin(hebrew_ot.index)

refs_and_terms_df['term']

#hebrew_ot.loc['GEN 1:1']
#hebrew_ot


#refs_and_terms_df['term_exists'] = refs_and_terms_df.refs.isin(hebrew_ot.index)
#refs_and_terms_df.refs.isin(hebrew_ot.index[refs_and_terms_df.refs])

#refs_and_terms_df.term  # in hebrew_ot.loc[refs_and_terms_df.refs,'verse']
#refs_and_terms_df.refs.isin(hebrew_ot.index)

0       אֲבַגְתָא
1           אֵבֶה
2          אֵבוּס
2          אֵבוּס
2          אֵבוּס
          ...    
8643       Ωλαμος
8644        Ωνους
8645           Ωξ
8646       Ωουδας
8647       Ὡσαίας
Name: term, Length: 104232, dtype: object

In [113]:
#How many of the 8648 major terms are in the All terms data?
major_terms['all_terms_exact'] = major_terms['term'].map(all_terms['term'].value_counts())
major_terms['all_terms_exact'] = major_terms['all_terms_exact'].fillna(0)
major_terms.sort_values('all_terms_exact',ascending=False)
major_terms

Unnamed: 0,term,domain,category,sense,AR,DC,en,es,fr,id,all_terms_exact
0,אֲבַגְתָא,PN,person,,False,False,Abagtha,Abagtá,Avagta,Abagta,1.0
1,אֵבֶה,FL,grasses,,False,False,papyrus,papiro,papyrus,pandan,1.0
2,אֵבוּס,RE,containers; animal husbandry,,False,False,manger,pesebre,mangeoire,palungan,1.0
3,אֲבַטִּיחַ,FL,fruits,,False,False,melon,melón,melon,semangka,1.0
4,אֲבִי,PN,person,,False,False,Abi,Abí,Avi,Abi,1.0
...,...,...,...,...,...,...,...,...,...,...,...
8643,Ωλαμος,PN,person,,False,True,Ono,,,,1.0
8644,Ωνους,PN,person,,False,True,Ox,,,,1.0
8645,Ωξ,PN,person,,False,True,Judah,,,,1.0
8646,Ωουδας,PN,person,,False,True,Jeshaiah,,,,1.0


In [17]:
col = major_terms['all_terms_exact'] 
count = col[col != 0].count()
print(f"There are {count} major-metadata terms that appear exactly in the All metadata file.")
print(f"There are {len(major_terms) - count} major-metadata terms that don't appear exactly in the All metadata file.")


There are 8065 major-metadata terms that appear exactly in the All metadata file.
There are 583 major-metadata terms that don't appear exactly in the All metadata file.


In [18]:
def count_matches(reference_col, source_col):
    matches = source_col.map(reference_col.value_counts()).fillna(0).astype(int)
    # Return count of values greater than 0 
    return matches[matches > 0].count()

#This is very slow.
#def find_matches(reference_col, source_col):
#    matches = [source for source in source_col if source in reference_col.unique()]
#    return matches

# This is also slow.
def find_matches(reference_col, source_col):
    matches = source_col.map(reference_col.value_counts()).fillna(0).astype(int)
    # Return values greater than 0 
    return matches[matches > 0]

# This is almost instant.
def find_matches(reference_col, source_col):
    return set(source_col).intersection(set(reference_col))

def report_matches(ref_df, ref_name, ref_columns, search_dict):
        
    for name, col in search_dict.items():
        all_matches = set()
        match_count = 0
        unique_values = col.unique()
        print(f"Searching for {len(unique_values)} terms from: '{name}' in the {ref_name} data.")
        for ref_column in ref_columns:
            matches = find_matches(ref_df[ref_column],col)
            match_count += len(matches)
            print(f"There are {len(matches)} found in the '{ref_column}' column.")
            all_matches = all_matches.union(matches)

        print(f"{match_count} '{name}' matched of which {len(all_matches)} are unique.\n")

In [None]:
# Make a dataframe from terms_and_refs
#terms_and_refs = pd.DataFrame.from_dict(terms_and_refs_dict, orient='index', dtype=str, columns=['verse_refs'])

# This idea from https://stackoverflow.com/questions/33504424/pandas-dataframe-from-dictionary-with-lists
# Makes one row, with the key as column labels.
#terms_and_refs = pd.DataFrame([terms_and_refs_dict])

#terms_and_refs = pd.DataFrame(reference_lists, index=terms) 
#terms_and_refs

In [19]:
# Count how many terms from All, Major and JHU (eng) occur in the Macula data.
# Macula Index(['Original unicode', 'Greek lemma', 'Greek normalized', 'Greek gloss', 'English gloss', 'Mandarin gloss']

terms_macula         = macula['Source']
terms_macula_english = macula['English gloss']

terms_all     = all_terms['term']
terms_major   = major_terms['term']
terms_jhu_eng = jhu['eng']

print(f"There are {count_matches(terms_macula,terms_all)}   all_terms out of {len(terms_all)} found in the Macula {terms_macula.name} column.")
print(f"There are {count_matches(terms_macula,terms_major)} major_terms out of {len(terms_major)} found in the Macula {terms_macula.name} column.")
print(f"There are {count_matches(terms_macula_english,terms_jhu_eng)} jhu_eng terms out of {len(terms_jhu_eng)} found in the Macula {terms_macula.name} column.\n")

# How many of these Original language terms are found exactly as a key in other lists?
macula_search_columns = ['Source', 'Greek lemma', 'Greek normalized', 'Greek gloss of Hebrew']
search = {'All terms':terms_all, 'Major terms':terms_major}
report_matches(macula, "macula", macula_search_columns, search)

There are 89   all_terms out of 20583 found in the Macula Source column.
There are 174 major_terms out of 8648 found in the Macula Source column.
There are 893 jhu_eng terms out of 1129 found in the Macula Source column.

Searching for 17350 terms from: 'All terms' in the macula data.
There are 82 found in the 'Source' column.
There are 98 found in the 'Greek lemma' column.
There are 52 found in the 'Greek normalized' column.
There are 60 found in the 'Greek gloss of Hebrew' column.
292 'All terms' matched of which 202 are unique.

Searching for 6083 terms from: 'Major terms' in the macula data.
There are 80 found in the 'Source' column.
There are 94 found in the 'Greek lemma' column.
There are 52 found in the 'Greek normalized' column.
There are 19 found in the 'Greek gloss of Hebrew' column.
245 'Major terms' matched of which 156 are unique.



Macula columns:
Source                      10851
English gloss                3215
Hebrew source                9534
English gloss of Hebrew      2569
Greek gloss of Hebrew        3435
Mandarin gloss of Hebrew     2396
Greek source                 1319
Greek lemma                   534
Greek normalized              867
English gloss of Greek        760

In [20]:
macula_search_columns = ['English gloss']
search = {'JHU eng terms':terms_jhu_eng}
report_matches(macula, "macula",  macula_search_columns, search)

macula_search_columns = ['Source', 'Greek lemma', 'Greek normalized', 'Greek gloss of Hebrew']
search = {'JHU grc_accented_terms' : jhu['grc_accented'], 'JHU ell_helenic1 terms' : jhu['ell_hellenic1']}
report_matches(macula, "macula", macula_search_columns, search)

Searching for 1129 terms from: 'JHU eng terms' in the macula data.
There are 893 found in the 'English gloss' column.
893 'JHU eng terms' matched of which 893 are unique.

Searching for 651 terms from: 'JHU grc_accented_terms' in the macula data.
There are 6 found in the 'Source' column.
There are 4 found in the 'Greek lemma' column.
There are 9 found in the 'Greek normalized' column.
There are 1 found in the 'Greek gloss of Hebrew' column.
20 'JHU grc_accented_terms' matched of which 12 are unique.

Searching for 500 terms from: 'JHU ell_helenic1 terms' in the macula data.
There are 201 found in the 'Source' column.
There are 152 found in the 'Greek lemma' column.
There are 283 found in the 'Greek normalized' column.
There are 1 found in the 'Greek gloss of Hebrew' column.
637 'JHU ell_helenic1 terms' matched of which 330 are unique.



In [21]:
matches_es = set(jhu['spa_blph']).intersection(set(major_terms['es']))
print(f"There are {len(set(jhu['spa_blph']))} unique words in the JHU Spanish list.")
print(f"{len(matches_es)} of the words in the JHU Spanish list match those in the Major metadata.")

[word for i, word in enumerate(matches_es) if i <10]


There are 495 unique words in the JHU Spanish list.
388 of the words in the JHU Spanish list match those in the Major metadata.


['',
 'Grecia',
 'Agar',
 'Zeus',
 'Fariseo',
 'Tiatira',
 'Carpo',
 'Judea',
 'Lázaro',
 'España']

In [25]:
matches = set(jhu['cmn_sf_ncv']).intersection(set(macula['Mandarin gloss of Hebrew']))

print(f"There are {len(set(jhu['cmn_sf_ncv']))} terms in the JHU CMN list and {len(set(macula['Mandarin gloss of Hebrew']))} Mandarin glosses in Macula.")
print(f"There are {len(matches)} that match.")
print(matches)


There are 919 terms in the JHU CMN list and 2396 Mandarin glosses in Macula.
There are 14 that match.
{'', '拉麦', '撒迦利亚', '塞特', '主', '说', '以利', '闪', '以撒', '亚拿突人', '玛土撒拉', '底波拉', '雅列', '扫罗'}


### Checking names in translations.
Given a list of names from the Major Terms data, and the verse references for the names check that the names appear in the extract in the expected verses.


In [None]:
merged = macula.reset_index().merge(jhu, how="left", left_on='English gloss' ,right_on='eng', indicator=True).set_index('ref')

# In the '_merge' column there are two values 'both' for those rows in jhu that matched on 'eng' and left_only for those that didn't.
# Replace 'both' with 'matched on eng and English gloss'
merged['_merge'] = merged['_merge'].replace('both','matched on eng and English gloss')
move_column(merged,'eng',3)
merged

In [None]:
# Mandarin column is 'cmn_sf_ncv'
# Greek column is 'grc_accented'
# Hebrew column is 'heb_2009'

search_dict  = {'eng':['English gloss'], 'heb_2009':['Hebrew source'], 'grc_accented': ['Greek lemma', 'Greek source','Greek normalized'], 'cmn_sf_ncv' :['Mandarin gloss of Hebrew']}
search_dict  = {'eng':['English gloss']}

# This requires too much memory (12.8 GiB) 
# For those rows for which a match hasn't been found, see if we can match on other column pairs.
#for search_for, search_in_cols in search_dict.items():
#    for search_col in search_in_cols:
#        print(search_for, search_col)
#        merged = macula.reset_index().merge(jhu, how="left", left_on=search_col ,right_on=search_for, indicator=True).set_index('ref')
#        # In the '_merge' column there are two values 'both' for those rows in jhu that matched on 'eng' and left_only for those that didn't.
#        # Replace 'both' with an indication of the two columns used in the match'
#        merged['_merge'] = merged['_merge'].replace('both',f'matched on {search_for} and {search_col}')

matches = macula
#print(macula.index.is_monotonic)

# https://stackoverflow.com/questions/70460010/map-index-of-one-dataframe-to-column-of-another-dataframe
# df1['payment'] = df1.index.map(dict(zip(df2.index,df2['paid value'])))

# Add a match column to macula for each possible match in JHU.
# Create lookup tables maybe.
    # Find the unique list of glosses from Macula and from JHU in a given language.
    # 
for search_for, search_in_cols in search_dict.items():
    for search_col in search_in_cols:
        print(search_for, search_col)
        #matches[f'match {search_for} in {search_col}']
        # 
        print(jhu.index[jhu[search_for] == matches[search_col]])
