# Match Biblical names from multiple versions and across languages.
The JHU trabina project has done this with a list of 1128 names across 531 languages.

It would also be good to 'read' through the projects or extracts we have and do a similar matching.

We have some hand-crafted data to get started: the Macula dataset, and the All Biblical Terms and Major Biblical Terms lists from Paratext.

In [1]:
#!/usr/bin/env python3

import csv
from collections import Counter
import numpy as np
import os
import pandas as pd
from pathlib import Path
import sys

def get_macula_df(file, sep="\t"):
    """Read the Macula names table into an all-string DataFrame.

    Args:
        file: Path (or file-like object) of the delimited Macula export.
        sep: Field delimiter; defaults to a tab.

    Returns:
        A DataFrame whose columns are all read as ``str`` and whose missing
        cells have been replaced by the empty string.
    """
    # The Macula names file used in this notebook carries these columns:
    # 'ref', 'Original unicode', 'Hebrew Original', 'Aramaic Original',
    # 'Greek Original', 'Greek lemma', 'Greek normalized', 'Greek gloss',
    # 'English gloss', 'Mandarin gloss'
    return pd.read_csv(file, dtype=str, sep=sep).fillna('')


In [2]:

# Locations of the JHU per-language name lists and the Macula names export.
# NOTE(review): data_folder is an absolute path on one machine; adjust it before running elsewhere.
data_folder = Path("D:/GitHub/trabina/data")
by_lang_folder = data_folder / "by-lang"
macula_data_tsv = data_folder / "macula_names.tsv"
updated_macula_data_tsv = data_folder / "updated_macula_names.tsv"

# The JHU English list and the Macula column it is compared against.
jhu_filename = "eng"
compare_col = "English gloss"
english_names_file = by_lang_folder / jhu_filename

macula_df = get_macula_df(macula_data_tsv)

# Drop the verse reference and the per-language original columns, then
# collapse rows that are identical across all remaining columns.
simple_df = (
    macula_df
    .drop(columns=['ref', 'Hebrew Original', 'Aramaic Original', 'Greek Original'])
    .drop_duplicates(subset=None, keep='first')
)

print(f"\nThe simple macula dataframe has these columns:\n{simple_df.columns}")
simple_df


The simple macula dataframe has these columns:
Index(['Original unicode', 'Greek lemma', 'Greek normalized', 'Greek gloss',
       'English gloss', 'Mandarin gloss'],
      dtype='object')


Unnamed: 0,Original unicode,Greek lemma,Greek normalized,Greek gloss,English gloss,Mandarin gloss
0,יְהוָ֥ה,,,,LORD,耶和华
1,יְהוָ֤ה,,,,LORD,耶和华
2,יְהוָ֨ה,,,,LORD,耶和华
3,יְהוָ֧ה,,,κύριος,LORD,耶和华
4,עֵ֖דֶן,,,εδεμ,Eden,伊甸
...,...,...,...,...,...,...
38793,ΒΑΒΥΛΩΝ,Βαβυλών,ΒΑΒΥΛΩΝ,,Babylon,
38801,Ἰησοῦ,Ἰησοῦς,Ἰησοῦ,,,
38802,Χριστοῦ,Χριστός,Χριστοῦ,,,
38805,Γὼγ,Γώγ,Γώγ,,Gog,


In [3]:
def get_name_matrix(folder):
    """Load every name list found directly inside ``folder``.

    Each regular file is read as UTF-8 with one name per line. The trailing
    newline is removed and the name is title-cased with ``str.title``.
    Sub-directories are skipped, because ``glob('*')`` yields them too and
    trying to open one as a file raises an error.

    Args:
        folder: Directory (str or Path) holding one name-list file per
            language/version.

    Returns:
        dict mapping each file's name to the list of its title-cased names,
        in file order. Files are visited in sorted path order.
    """
    all_names = {}

    for file in sorted(Path(folder).glob('*')):
        # Only regular files can be name lists; ignore directories.
        if not file.is_file():
            continue
        with open(file, 'r', encoding='utf-8') as fin:
            all_names[file.name] = [line.strip('\n').title() for line in fin]

    return all_names

In [16]:
# Get all the names and make a dataframe.
# get_name_matrix returns {file name: [names]}; from_dict turns each file name
# into a column (see the column listing printed below), so every list must
# have the same length.
all_jhu_names = get_name_matrix(by_lang_folder)
jhu_df = pd.DataFrame.from_dict(all_jhu_names, dtype=str)

print(f"\nThe JHU dataframe has these columns:\n {jhu_df.columns}")  
jhu_df


The JHU dataframe has these columns:
 Index(['aai_aai', 'aak_aak', 'aau_aau', 'abt_maprik', 'aby_aby', 'acd_acd',
       'ace_ace', 'acf_acf', 'acn_acn', 'acr_cubulcu',
       ...
       'tpa_tpa', 'tpi_tpi', 'tpm_tpm', 'tsn_1908', 'tur_2009', 'ukr_1871',
       'urd_arabic', 'vie_1926compounds', 'xho_1996', 'zul_zul'],
      dtype='object', length=592)


Unnamed: 0,aai_aai,aak_aak,aau_aau,abt_maprik,aby_aby,acd_acd,ace_ace,acf_acf,acn_acn,acr_cubulcu,...,tpa_tpa,tpi_tpi,tpm_tpm,tsn_1908,tur_2009,ukr_1871,urd_arabic,vie_1926compounds,xho_1996,zul_zul
0,Aaron,Erono,Aron,Eron,Eroni,Aron,Harun,Éronn,Aron,Aaron,...,-,Aron,Aarɔn,Arone,Harun,Аарон,ہارون,A-Rôn,Uaron,Ku-Aroni
1,Abaddon,Abadonoyɨ,Abadon,Abadon,Abadoni,Abadon,Abadon,Abadonn,Abadon,Abadon,...,-,Abadon,Abadɔn,Abatone,Abadon,Авадон,ابدون,A-Ba-Đôn,Uapoliyon,Lingu-Abadoni
2,-,-,-,-,-,-,-,-,-,-,...,-,Abarim,Abarim,-,-,-,-,A-Ba-Rim,,-
3,Abba,Ápe,Abba,Wao,Ufane,Sɛi,Oe,Papa,Aba,Chawesaj,...,Ama'U,Aba,Kote,Aba,Abba,Авва,ابّا,A-Ba,Tata,Aba
4,-,-,-,-,-,-,-,-,-,-,...,-,Apdon,Abdɔn,-,-,-,-,Áp-Đôn,Uabdon,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,-,-,-,-,-,-,-,-,-,-,...,-,Soba,Zoba,-,-,-,-,Xô-Ba,Wasezobha,-
1125,-,-,-,-,-,-,-,-,-,-,...,-,Sora,Zora,-,-,-,-,Xô-Ra,Ezora,-
1126,Zerubbabel,Serababero,Serubabel,Serababel,Serubabeo,Serubabelɛ,Zerubabel,Zèròbabèl,Zerubabe,Zorobabel,...,-,Serubabel,Zɛrubabɛl,Serubabele,Zerubabel,Заровавель,زرُبابل,Xô-Rô-Ba-Bên,Uzerubhabheli,Uzorobabeli
1127,-,-,-,-,-,-,-,-,-,-,...,-,Suf,Zuf,-,-,-,-,Xu-Phơ,Kazufi,-


In [17]:
# Use the English names as the row index so other data can be joined on them.
# Guarded so the cell can be re-run: once "eng" has been moved into the index
# it is no longer a column, and a second set_index on it would raise KeyError.
if "eng" in jhu_df.columns:
    jhu_df = jhu_df.set_index(["eng"], drop=True, append=False, inplace=False, verify_integrity=False)
jhu_df.describe()

Unnamed: 0,aai_aai,aak_aak,aau_aau,abt_maprik,aby_aby,acd_acd,ace_ace,acf_acf,acn_acn,acr_cubulcu,...,tpa_tpa,tpi_tpi,tpm_tpm,tsn_1908,tur_2009,ukr_1871,urd_arabic,vie_1926compounds,xho_1996,zul_zul
count,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129,...,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129
unique,483,472,481,465,456,478,470,491,471,482,...,96,974,953,488,425,492,480,988,1006,493
top,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
freq,556,573,549,576,583,558,568,553,551,555,...,1006,33,70,548,619,551,554,26,50,568


In [20]:
# Show the row labels: after set_index these are the English names (index name "eng").
jhu_df.index

Index(['Aaron', 'Abaddon', 'Abarim', 'Abba', 'Abdon', 'Abednego', 'Abed-Nego',
       'Abel', 'Abiathar', 'Abib',
       ...
       'Zion', 'Ziph', 'Zippor', 'Zoan', 'Zoar', 'Zobah', 'Zorah', 'Zorobabel',
       'Zuph', 'Zur'],
      dtype='object', name='eng', length=1129)

In [6]:
# Inner-join the JHU names (indexed by English name) with the Macula rows
# whose 'English gloss' is exactly the same string. Columns of both frames
# are retained. The merge indicator column is renamed to "matched_on_eng";
# with an inner join every row it holds is "both".
merged_df = pd.merge(
    jhu_df,
    simple_df,
    how='inner',
    left_index=True,
    right_on='English gloss',
    indicator=True,
).rename(columns={"_merge": "matched_on_eng"})
merged_df

Unnamed: 0,aai_aai,aak_aak,aau_aau,abt_maprik,aby_aby,acd_acd,ace_ace,acf_acf,acn_acn,acr_cubulcu,...,vie_1926compounds,xho_1996,zul_zul,Original unicode,Greek lemma,Greek normalized,Greek gloss,English gloss,Mandarin gloss,matched_on_eng
2686,Aaron,Erono,Aron,Eron,Eroni,Aron,Harun,Éronn,Aron,Aaron,...,A-Rôn,Uaron,Ku-Aroni,אַהֲרֹ֤ן,,,ααρων,Aaron,亚伦,both
2708,Aaron,Erono,Aron,Eron,Eroni,Aron,Harun,Éronn,Aron,Aaron,...,A-Rôn,Uaron,Ku-Aroni,אַהֲרֹ֔ן,,,ααρων,Aaron,亚伦,both
2711,Aaron,Erono,Aron,Eron,Eroni,Aron,Harun,Éronn,Aron,Aaron,...,A-Rôn,Uaron,Ku-Aroni,אַֽהֲרֹ֔ן,,,ααρων,Aaron,亚伦,both
2714,Aaron,Erono,Aron,Eron,Eroni,Aron,Harun,Éronn,Aron,Aaron,...,A-Rôn,Uaron,Ku-Aroni,אַהֲרֹ֑ן,,,ααρων,Aaron,亚伦,both
2777,Aaron,Erono,Aron,Eron,Eroni,Aron,Harun,Éronn,Aron,Aaron,...,A-Rôn,Uaron,Ku-Aroni,אַהֲרֹן֒,,,ααρων,Aaron,亚伦,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19831,-,-,-,-,-,-,-,-,-,-,...,Xu-Phơ,Kazufi,-,צוּף֙,,,σουφ,Zuph,苏弗,both
6131,-,-,-,-,-,-,-,-,-,-,...,Xu-Rơ,Uzure,-,צ֑וּר,,,σουρ,Zur,苏珥,both
6496,-,-,-,-,-,-,-,-,-,-,...,Xu-Rơ,Uzure,-,צ֤וּר,,,,Zur,苏珥,both
20247,-,-,-,-,-,-,-,-,-,-,...,Xu-Rơ,Uzure,-,צ֥וּר,,,σουρ,Zur,苏珥,both


Not sure that merging like this is useful.

In [7]:
# Read in the All terms data from silnlp
sil_assets_path = Path('D:/GitHub/silnlp/silnlp/assets')
all_terms_file = sil_assets_path / 'All-metadata.txt'

# Only the first column is read; squeeze("columns") turns that one-column
# frame into a Series. Series.rename returns a new Series, so the renamed
# result is assigned here. Previously the rename was only displayed and
# all_terms itself kept its default name.
all_terms = (
    pd.read_table(all_terms_file, header=None, usecols=[0])
    .squeeze("columns")
    .rename('terms')
)
all_terms

0              אֵב
1        אֲבַגְתָא
2              אבד
3            אֹבֵד
4         אֲבַדֹּה
           ...    
20578     ὥρα (DC)
20579      ὥς (DC)
20580     ὧδε (DC)
20581    ὦμος (DC)
20582     ᾠδή (DC)
Name: terms, Length: 20583, dtype: object

In [8]:
# Split each entry on spaces into separate columns (expand=True). In the
# displayed result column 0 holds the term and column 1 an optional marker
# such as "(DC)".
terms_df = all_terms.str.split(' ', expand=True)
terms_df

Unnamed: 0,0,1
0,אֵב,
1,אֲבַגְתָא,
2,אבד,
3,אֹבֵד,
4,אֲבַדֹּה,
...,...,...
20578,ὥρα,(DC)
20579,ὥς,(DC)
20580,ὧδε,(DC)
20581,ὦμος,(DC)


In [9]:
# Name the two split columns, turn the "(AR)" / "(DC)" markers into boolean
# flag columns, and discard the raw marker column.
terms_df = (
    terms_df
    .rename(columns={0: "term", 1: "note"})
    .assign(
        AR=lambda frame: frame['note'] == '(AR)',
        DC=lambda frame: frame['note'] == '(DC)',
    )
    .drop(columns=['note'])
)
terms_df

Unnamed: 0,term,AR,DC
0,אֵב,False,False
1,אֲבַגְתָא,False,False
2,אבד,False,False
3,אֹבֵד,False,False
4,אֲבַדֹּה,False,False
...,...,...,...
20578,ὥρα,False,True
20579,ὥς,False,True
20580,ὧδε,False,True
20581,ὦμος,False,True


In [10]:
# Read in the Major terms data from silnlp
sil_assets_path = Path('D:/GitHub/silnlp/silnlp/assets')
major_terms_file = sil_assets_path / 'Major-metadata.txt'

# The file has no header row; name the three columns that are read.
major_terms = pd.read_table(major_terms_file, header=None)
major_terms = major_terms.rename(columns={0: "term", 1: "domain", 2: 'category'})

# Split off whatever follows the first space as a note (e.g. "(AR)", "(DC)").
# The split limit must be passed as n=...: passing it positionally is
# deprecated in pandas 1.x and raises TypeError in pandas 2.x.
major_terms[['term', 'note']] = major_terms['term'].str.split(' ', n=1, expand=True)

# Split off whatever follows the first '-' as the sense.
major_terms[['term', 'sense']] = major_terms['term'].str.split('-', n=1, expand=True)

# Turn the note into two boolean flags and drop the raw text.
major_terms['AR'] = major_terms['note'] == '(AR)'
major_terms['DC'] = major_terms['note'] == '(DC)'
major_terms = major_terms.drop(columns=['note'])

major_terms

Unnamed: 0,term,domain,category,sense,AR,DC
0,אֲבַגְתָא,PN,person,,False,False
1,אֵבֶה,FL,grasses,,False,False
2,אֵבוּס,RE,containers; animal husbandry,,False,False
3,אֲבַטִּיחַ,FL,fruits,,False,False
4,אֲבִי,PN,person,,False,False
...,...,...,...,...,...,...
8643,Ωλαμος,PN,person,,False,True
8644,Ωνους,PN,person,,False,True
8645,Ωξ,PN,person,,False,True
8646,Ωουδας,PN,person,,False,True


In [11]:
# Save the parsed major terms as a tab-separated file. The row index is written
# as the first column because to_csv's index option is left at its default.
# NOTE(review): this absolute path is under D:\GitHub\davidbaines\trabina, which
# differs from the data_folder defined near the top of the notebook; confirm
# which location is intended.
major_terms.to_csv(r"D:\GitHub\davidbaines\trabina\data\major_terms.txt", sep = '\t')

In [12]:
# How many of the 8648 major terms are in the All terms data?
# For every major term, look up how many All-terms entries equal it exactly
# (0 when there is none), then list the most frequent first.
all_term_counts = terms_df['term'].value_counts()
major_terms['all_terms_exact'] = major_terms['term'].map(all_term_counts).fillna(0)
major_terms.sort_values('all_terms_exact', ascending=False)

Unnamed: 0,term,domain,category,sense,AR,DC,all_terms_exact
6540,σκύβαλον,RE,artifacts,,False,False,2.0
5364,γῆ,RE,nature,2,False,False,2.0
7049,ἀσπίς,RE,warfare,,False,True,2.0
1651,חֵלֶם,PN,person,,False,False,2.0
5351,Γάζα,PN,settlement,,False,False,2.0
...,...,...,...,...,...,...,...
5698,θυμιαστήριον,RE,sacrifices and offerings,,False,False,0.0
4825,שֻׁפִּים,PN,person,2,False,False,0.0
4824,שֻׁפִּים,PN,person,1,False,False,0.0
546,אֶפֶס דַמִּים,PN,settlement,,False,False,0.0


In [13]:
# These major terms aren't found exactly in the All-metadata.
# Unmatched rows are selected with == 0 rather than isna(), because the
# all_terms_exact column had its missing values filled with 0 when it was built.
major_terms.loc[major_terms['all_terms_exact'] == 0]

Unnamed: 0,term,domain,category,sense,AR,DC,all_terms_exact
67,אֶבֶן בֹּהַן בֶּן־רְאוּבֵן,PN,locale,,False,False,0.0
106,אֲדָמִי,PN,settlement,,False,False,0.0
110,אֲדֹנָי,BE,supernatural beings and powers; titles,,False,False,0.0
182,אֶזְבַּי,PN,person,,False,False,0.0
185,אֻזֵן,PN,settlement,,False,False,0.0
...,...,...,...,...,...,...,...
7235,Γαβρια,PN,person,,False,True,0.0
7554,ἤδυσμα,RE,artifacts; perfumes and spices,,False,True,0.0
7586,θυΐσκη,RE,?,,False,True,0.0
7742,καρρον,RE,?,,False,True,0.0


In [14]:
# Tally the major terms with and without an exact match in the All-metadata terms.
col = major_terms['all_terms_exact']
has_match = col != 0
count = col[has_match].count()
print(f"There are {count} major-metadata terms that appear exactly in the All metadata file.")
print(f"There are {len(major_terms) - count} major-metadata terms that don't appear exactly in the All metadata file.")


There are 8065 major-metadata terms that appear exactly in the All metadata file.
There are 583 major-metadata terms that don't appear exactly in the All metadata file.


In [42]:
def count_matches(reference_col, source_col):
    """Count the entries of ``source_col`` that occur in ``reference_col``.

    Args:
        reference_col: Series of values to look the source entries up in.
        source_col: Series of values to test; duplicates are each counted.

    Returns:
        The number of ``source_col`` entries whose value appears at least
        once in ``reference_col``.
    """
    # Occurrences of each distinct reference value.
    occurrences = reference_col.value_counts()
    # Per source entry: its occurrence count in the reference, 0 if absent.
    hits_per_entry = source_col.map(occurrences).fillna(0).astype(int)
    found = hits_per_entry > 0
    return hits_per_entry[found].count()

In [52]:
# Count how many terms from All, Major and JHU (eng) occur in the Macula data.
# Macula columns: 'Original unicode', 'Greek lemma', 'Greek normalized',
# 'Greek gloss', 'English gloss', 'Mandarin gloss'

# Reference columns from the de-duplicated Macula frame.
terms_macula = simple_df['Original unicode']
terms_macula_english = simple_df['English gloss']

# Term lists to look up in the Macula columns.
terms_all = terms_df['term']
terms_major = major_terms['term']
terms_jhu_eng = jhu_df.index.to_series()

all_found = count_matches(terms_macula, terms_all)
major_found = count_matches(terms_macula, terms_major)
jhu_found = count_matches(terms_macula_english, terms_jhu_eng)

print(f"There are {all_found}   all_terms out of {len(terms_all)} found in the Macula data.")
print(f"There are {major_found} major_terms out of {len(terms_major)} found in the Macula data.")
print(f"There are {jhu_found} jhu_eng terms out of {len(terms_jhu_eng)} found in the Macula data.")

# Per-term detail for the major terms: occurrences of each in the Macula
# 'Original unicode' column (0 when absent).
macula_counts = terms_macula.value_counts()
major_matches = terms_major.map(macula_counts).fillna(0).astype(int)
# Number of major terms with at least one occurrence.
print(major_matches[major_matches > 0].count())
print(major_matches)

There are 89   all_terms out of 20583 found in the Macula data.
There are 174 major_terms out of 8648 found in the Macula data.
There are 894 jhu_eng terms out of 1129 found in the Macula data.
174
0       0
1       0
2       0
3       0
4       0
       ..
8643    0
8644    0
8645    0
8646    0
8647    0
Name: term, Length: 8648, dtype: int32
