# Match Biblical names from multiple versions and across languages.
The JHU trabina project has done this with a list of 1128 names across 531 languages.
It would be good also to 'read' through the projects or extracts we have to do a similar matching.
We have some hand crafted data to get started: The Macula dataset and also the All Biblical Terms and Major Biblical Terms lists from Paratext.

In [1]:
#!/usr/bin/env python3

import csv
from collections import Counter
import json
import numpy as np
import os
import pandas as pd
import re
from pathlib import Path
import sys

In [2]:
def move_column(df, column_name, column_index):
    col = df.pop(column_name)
    return df.insert(column_index, col.name, col)

In [3]:
data_folder = Path("D:/GitHub/davidbaines/trabina/data")
by_lang_folder = data_folder / "by-lang"
jhu_filename = "eng"
compare_col = "English gloss"
english_names_file = by_lang_folder / jhu_filename
major_terms_out = data_folder / "major_terms.tsv"

hebrew_refs_and_names = data_folder / "hebrew_refs_and_names.tsv"
hebrew_cols = {0: 'ref', 1: 'Hebrew source', 2: 'English gloss of Hebrew', 3:'Greek gloss of Hebrew', 4:'Mandarin gloss of Hebrew'}

greek_refs_and_names = data_folder / "greek_refs_and_names.tsv"
greek_cols = {0:'ref', 1:'Greek source', 2:'Greek lemma', 3:'Greek normalized', 4:'English gloss of Greek'}

silnlp_assets_folder = Path("D:/GitHub/davidbaines/trabina/silnlp/assets")
silnlp_vref_file = silnlp_assets_folder / "vref.txt"

macula_json_file = Path("D:/GitHub/davidbaines/trabina") / "macula.json"


In [4]:
macula = pd.read_json(macula_json_file)
macula

Unnamed: 0,refs,Source,English gloss,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew,Greek source,Greek lemma,Greek normalized,English gloss of Greek,book,chapter_no,verse_no,word_no,ref_only,silnlp_line_number
0,GEN 2:4!8,יְהוָ֥ה,LORD,יְהוָ֥ה,LORD,,耶和华,,,,,GEN,2.0,4.0,8.0,GEN 2:4,35
1,GEN 2:5!15,יְהוָ֤ה,LORD,יְהוָ֤ה,LORD,,耶和华,,,,,GEN,2.0,5.0,15.0,GEN 2:5,36
2,GEN 2:7!2,יְהוָ֨ה,LORD,יְהוָ֨ה,LORD,,耶和华,,,,,GEN,2.0,7.0,2.0,GEN 2:7,38
3,GEN 2:8!2,יְהוָ֧ה,LORD,יְהוָ֧ה,LORD,κύριος,耶和华,,,,,GEN,2.0,8.0,2.0,GEN 2:8,39
4,GEN 2:8!5,עֵ֖דֶן,Eden,עֵ֖דֶן,Eden,εδεμ,伊甸,,,,,GEN,2.0,8.0,5.0,GEN 2:8,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38616,REV 22:13!6,"Ὦ,",Omega,,,,,"Ὦ,",Ὦ,Ὦ,Omega,REV,22.0,13.0,6.0,REV 22:13,31162
38617,REV 22:16!2,Ἰησοῦς,Jesus,,,,,Ἰησοῦς,Ἰησοῦς,Ἰησοῦς,Jesus,REV,22.0,16.0,2.0,REV 22:16,31165
38618,REV 22:16!20,"Δαυείδ,",of David,,,,,"Δαυείδ,",Δαυίδ,Δαυείδ,of David,REV,22.0,16.0,20.0,REV 22:16,31165
38619,REV 22:20!11,Ἰησοῦ.,Jesus,,,,,Ἰησοῦ.,Ἰησοῦς,Ἰησοῦ,Jesus,REV,22.0,20.0,11.0,REV 22:20,31169


In [5]:
unique_names = set()
count_names = 0
name_columns = ['Source', 'English gloss of Hebrew', 'Greek gloss of Hebrew', 'Mandarin gloss of Hebrew', 'Greek lemma', 'Greek normalized', 'English gloss of Greek']
for col in name_columns:
    col_unique = set(macula[col].unique())
    print(f"There are {len(col_unique)}\t different names in the '{col}' column.")
    # Count how many unique names in each column.
    count_names += len(col_unique)
    unique = col_unique
    unique_names = unique_names | unique

print(f"Of the {count_names} names in all the name columns {len(unique_names)} are unique. We can match on these across the five languages in the Macula dataset.\n")
print(f"{macula.nunique()}")



There are 10751	 different names in the 'Source' column.
There are 2569	 different names in the 'English gloss of Hebrew' column.
There are 3420	 different names in the 'Greek gloss of Hebrew' column.
There are 2396	 different names in the 'Mandarin gloss of Hebrew' column.
There are 534	 different names in the 'Greek lemma' column.
There are 867	 different names in the 'Greek normalized' column.
There are 760	 different names in the 'English gloss of Greek' column.
Of the 21297 names in all the name columns 20364 are unique. We can match on these across the five languages in the Macula dataset.

refs                        38621
Source                      10751
English gloss                3214
Hebrew source                9434
English gloss of Hebrew      2569
Greek gloss of Hebrew        3420
Mandarin gloss of Hebrew     2396
Greek source                 1319
Greek lemma                   534
Greek normalized              867
English gloss of Greek        760
book                  

In [6]:
def get_name_matrix(folder):
    
    all_names = dict()
    
    folder = Path(folder)
    files = sorted(folder.glob(r'*'))
    #print([file.name[0:3] for file in files])
    
    for file in files:
        with open(file, 'r', encoding='utf-8') as fin:
            names = [name.strip('\n').title() for name in fin.readlines()]
            all_names[file.name] = names   
    
    return all_names

In [7]:
#Get all the names and make a dataframe
all_jhu_names = get_name_matrix(by_lang_folder)
jhu = pd.DataFrame.from_dict(all_jhu_names, dtype=str)
jhu.replace('-','', inplace=True)
# jhu_columns = [col for col in jhu.columns]
# print(jhu_columns)
print(f"{jhu.nunique()}")

aai_aai               483
aak_aak               471
aau_aau               481
abt_maprik            464
aby_aby               456
                     ... 
ukr_1871              491
urd_arabic            479
vie_1926compounds     987
xho_1996             1005
zul_zul               493
Length: 592, dtype: int64


In [8]:
assets_folder = Path('D:/GitHub/davidbaines/trabina/silnlp/assets')
patterns = ['All', 'Major', 'SilNt'] #'Pt6' doesn't have any glosses.

def read_assets_data(folder, pattern):
    # Function to read in the various PT metadata files. 
    # Each are read in differently.
    
    metadata_file = folder / f"{pattern}-metadata.txt"
    glosses_files = folder.glob(f"*-{pattern}-glosses.txt")
    vrefs_file = folder / f"{pattern}-vrefs.txt"
    json_file = Path(f"D:/GitHub/davidbaines/trabina/data/{pattern}_terms.json")
    tsv_file  = Path(f"D:/GitHub/davidbaines/trabina/data/{pattern}_terms.tsv")
    
    # The assest folder contains files with pattern from ['Major', 'All', SilNt', 'Pt6']
    # Different sets have different data. 
    
    # Glosses exist for certain languages in separate files.
    # Not all files exist for all patterns. The 'Major' files are as follows:
    # en-Major-glosses.txt, en-Pt6-glosses.txt , en-SilNt-glosses.txt es-Major-glosses.txt fr-Major-glosses.txt, id-Major-glosses.txt Major-metadata.txt , Major-vrefs.txt
    
    # Reading in vrefs is the same for all patterns:
    vrefs = pd.read_csv(vrefs_file,  names=['vrefs'], converters={'vrefs': lambda x: x.split('\t')})     
    #vrefs = pd.read_csv(vrefs_file,header=None).squeeze("columns")
    #vrefs.rename({0: "vrefs"}, axis="columns", inplace=True)
    #vrefs = [vref for vref in vrefs.str.split('\t', expand=True)
    
    if pattern == 'All':
        # This dataset doesn't include sense numbers. Only the first column contains data.
        # The column contains (DC) and (AR) which need to be split off.
        #print(metadata_file)
        
        terms = pd.read_table(metadata_file,header=None, usecols=[0]).squeeze("columns")
        terms.rename('terms')
        
        terms = terms.str.split(' ', expand=True)
        
        terms.rename({0: "term", 1: "note"}, axis="columns", inplace=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
    
    if pattern == 'Major':
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        terms[['term', 'note']] = terms['term'].str.split(' ', 1, expand=True)

        terms[['term', 'sense']] = terms['term'].str.split('-', 1, expand=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
        
    if pattern == 'SilNt':
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        # The domain column is empty.
        terms.drop(columns=['domain'],inplace=True)
        
    isos  = list()
    for gloss_file in glosses_files:
        iso = gloss_file.name[:gloss_file.name.find("-")]
        terms[iso] = pd.read_table(gloss_file,header=None, usecols=[0]).squeeze("columns")
        terms[iso] = terms[iso].fillna('')
        
    terms['vrefs'] = vrefs
    terms.to_json(json_file, orient='records')
    terms.to_csv(tsv_file, sep = '\t')
    
    return terms

In [9]:
all_terms   = read_assets_data(assets_folder, 'All')
major_terms = read_assets_data(assets_folder, 'Major')

print(f"Major terms:")
print(major_terms.head())

Major terms:
         term domain                      category sense     AR     DC  \
0   אֲבַגְתָא     PN                        person  None  False  False   
1       אֵבֶה     FL                       grasses  None  False  False   
2      אֵבוּס     RE  containers; animal husbandry  None  False  False   
3  אֲבַטִּיחַ     FL                        fruits  None  False  False   
4       אֲבִי     PN                        person  None  False  False   

        en       es         fr        id                          vrefs  
0  Abagtha   Abagtá     Avagta    Abagta                     [EST 1:10]  
1  papyrus   papiro    papyrus    pandan                     [JOB 9:26]  
2   manger  pesebre  mangeoire  palungan  [JOB 39:9, PRO 14:4, ISA 1:3]  
3    melon    melón      melon  semangka                     [NUM 11:5]  
4      Abi      Abí        Avi       Abi                     [2KI 18:2]  


In [10]:
def read_terms_from_json(pattern):
    json_file = Path(f"D:/GitHub/davidbaines/trabina/data/{pattern}_terms.json")

    with open(json_file, 'r', encoding = 'utf-8') as json_f:
        json_data = json.load(json_f)
        
    return pd.DataFrame(json_data)

In [11]:
all_terms_from_json = read_terms_from_json('all')
print(all_terms_from_json)

major_terms_from_json = read_terms_from_json('major')
print(major_terms_from_json)


            term     AR     DC           en  \
0            אֵב  False  False          bud   
1      אֲבַגְתָא  False  False      Abagtha   
2            אבד  False  False       perish   
3          אֹבֵד  False  False  destruction   
4       אֲבַדֹּה  False  False  destruction   
...          ...    ...    ...          ...   
20578        ὥρα  False   True                
20579         ὥς  False   True                
20580        ὧδε  False   True                
20581       ὦμος  False   True                
20582        ᾠδή  False   True                

                                                   vrefs  
0                                   [JOB 8:12, SNG 6:11]  
1                                             [EST 1:10]  
2      [EXO 10:7, LEV 23:30, LEV 26:38, NUM 16:33, NU...  
3                                 [NUM 24:20, NUM 24:24]  
4                                            [PRO 27:20]  
...                                                  ...  
20578  [JDT 13:4, WIS 

In [12]:
# Read in the OT Bible text
with open(r"D:\GitHub\davidbaines\trabina\data\hbo-hboWLC.txt", 'r', encoding='utf-8') as OT:
    ot_lines = [line.strip('\n') for line in OT.readlines()]
    
with open(r"D:\GitHub\davidbaines\trabina\silnlp\assets\vref.txt" , 'r', encoding='utf-8') as vref:
    ot_vrefs = [line.strip('\n') for line in vref.readlines()]    

In [13]:
print(ot_lines[:10],ot_vrefs[:10])

heb_ot = {ot_vref:verse for ot_vref,verse in zip(ot_vrefs,ot_lines)}
for i, items in enumerate(heb_ot.items()):
    if i > 10:
        break
    else:
        print(items)
# heb_ot is a dictionary with the verse reference as the key and the verse as the value.
# It is the Hebrew OT (Westminster Leningrad codex ) from Paratext extracted line by line by silnlp.

['בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃', 'וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְה֑וֹם וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמָּֽיִם׃', 'וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽיְהִי־אֽוֹר׃', 'וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב וַיַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָא֖וֹר וּבֵ֥ין הַחֹֽשֶׁךְ׃', 'וַיִּקְרָ֨א אֱלֹהִ֤ים ׀ לָאוֹר֙ י֔וֹם וְלַחֹ֖שֶׁךְ קָ֣רָא לָ֑יְלָה וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר י֥וֹם אֶחָֽד׃ פ', 'וַיֹּ֣אמֶר אֱלֹהִ֔ים יְהִ֥י רָקִ֖יעַ בְּת֣וֹךְ הַמָּ֑יִם וִיהִ֣י מַבְדִּ֔יל בֵּ֥ין מַ֖יִם לָמָֽיִם׃', 'וַיַּ֣עַשׂ אֱלֹהִים֮ אֶת־הָרָקִיעַ֒ וַיַּבְדֵּ֗ל בֵּ֤ין הַמַּ֙יִם֙ אֲשֶׁר֙ מִתַּ֣חַת לָרָקִ֔יעַ וּבֵ֣ין הַמַּ֔יִם אֲשֶׁ֖ר מֵעַ֣ל לָרָקִ֑יעַ וַֽיְהִי־כֵֽן׃', 'וַיִּקְרָ֧א אֱלֹהִ֛ים לָֽרָקִ֖יעַ שָׁמָ֑יִם וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר י֥וֹם שֵׁנִֽי׃ פ', 'וַיֹּ֣אמֶר אֱלֹהִ֗ים יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֙יִם֙ אֶל־מָק֣וֹם אֶחָ֔ד וְתֵרָאֶ֖ה הַיַּבָּשָׁ֑ה וַֽיְהִי־כֵֽן׃', 'וַיִּקְרָ֨א אֱלֹהִ֤ים ׀ לַיַּבָּשָׁה֙ אֶ֔רֶץ ו

In [14]:
# Make a Dataframe from the dictionary
hebrew_ot = pd.DataFrame.from_dict(heb_ot, orient='index', dtype=str, columns=['verse'])
#print(hebrew_ot.verse == '')
#print()
#print(hebrew_ot[hebrew_ot.verse == ''])
#print()
#print(hebrew_ot[hebrew_ot.verse == ''].index)

#hebrew_ot.drop(hebrew_ot['verse']=='', inplace=True)
hebrew_ot.drop(hebrew_ot[hebrew_ot.verse == ''].index, inplace=True)
hebrew_ot

Unnamed: 0,verse
GEN 1:1,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...
GEN 1:2,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁך...
GEN 1:3,וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽיְהִי־אֽוֹר׃
GEN 1:4,וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב וַי...
GEN 1:5,וַיִּקְרָ֨א אֱלֹהִ֤ים ׀ לָאוֹר֙ י֔וֹם וְלַחֹ֖ש...
...,...
MAL 3:20,וְזָרְחָ֨ה לָכֶ֜ם יִרְאֵ֤י שְׁמִי֙ שֶׁ֣מֶשׁ צְ...
MAL 3:21,וְעַסּוֹתֶ֣ם רְשָׁעִ֔ים כִּֽי־יִהְי֣וּ אֵ֔פֶר ...
MAL 3:22,זִכְר֕וּ תּוֹרַ֖ת מֹשֶׁ֣ה עַבְדִּ֑י אֲשֶׁר֩ צִ...
MAL 3:23,הִנֵּ֤ה אָֽנֹכִי֙ שֹׁלֵ֣חַ לָכֶ֔ם אֵ֖ת אֵלִיָּ...


In [15]:
# Create a dictionary with a term, and a list of verse references where the term should appear.
terms = [term for term in major_terms['term']]
print(f"These are the first 10 terms in the major_terms dataframe. {terms[:10]}")

# Get the list of references
reference_lists = [refs for refs in major_terms['vrefs']]
#print(reference_lists[0:10])

terms_and_refs_dict = {term:refs for term, refs in zip(terms,reference_lists)}
print(f"These are the references where the term 'אֵבוּס' is found. {terms_and_refs_dict['אֵבוּס']}")

These are the first 10 terms in the major_terms dataframe. ['אֲבַגְתָא', 'אֵבֶה', 'אֵבוּס', 'אֲבַטִּיחַ', 'אֲבִי', 'אֲבִי', 'אֲבִי־עַלְבוֹן', 'אֲבִיאֵל', 'אֲבִיאֵל', 'אֲבִיאָסָף']
These are the references where the term 'אֵבוּס' is found. ['JOB 39:9', 'PRO 14:4', 'ISA 1:3']


In [16]:
# Perhaps we can invert this: For every verse which words do we expect to find in it?
# That would limit the length of the df to the number of verses and the width to the number
# of words in the longest verse.

# Create a list of verses with the set of terms that should appear in that verse.
print(terms[:10])

#refs_and_terms_df = pd.DataFrame([reference_lists], index = terms)
#refs_and_terms_df.set_index(, drop=True, append=False, inplace=True, verify_integrity=True)

#Are the terms unique?
#print(len(set(terms)) == len(terms))

refs_and_terms_df = pd.DataFrame(zip(terms,reference_lists),columns=['term','refs'])
refs_and_terms_df = refs_and_terms_df.explode('refs')
refs_and_terms_df

['אֲבַגְתָא', 'אֵבֶה', 'אֵבוּס', 'אֲבַטִּיחַ', 'אֲבִי', 'אֲבִי', 'אֲבִי־עַלְבוֹן', 'אֲבִיאֵל', 'אֲבִיאֵל', 'אֲבִיאָסָף']


Unnamed: 0,term,refs
0,אֲבַגְתָא,EST 1:10
1,אֵבֶה,JOB 9:26
2,אֵבוּס,JOB 39:9
2,אֵבוּס,PRO 14:4
2,אֵבוּס,ISA 1:3
...,...,...
8643,Ωλαμος,1ES 9:30
8644,Ωνους,1ES 5:22
8645,Ωξ,JDT 8:1
8646,Ωουδας,1ES 9:23


In [17]:
# Add a column which indicates whether or not the term is found in the verse.
#refs_and_terms_df['term_found_in_verse'] = 
test_term = refs_and_terms_df.term[1]
test_ref  = refs_and_terms_df.refs[1]
print(test_term,test_ref)
if test_ref in hebrew_ot.index:
    print("The test_reference is in the hebrew_ot index.")
else:
    print("The test_reference isn't in the hebrew_ot index.")
    
# This is equivalent to ``df1.at['a','A']``
# df1.loc['a', 'A']
# Out[54]: 0.13200317033032932

print(hebrew_ot.loc[test_ref,'verse'])
# Hebrew doesn't match easily - the vowel marks are often different as in this case.
print(test_term in hebrew_ot.loc[test_ref,'verse'])


אֵבֶה JOB 9:26
The test_reference is in the hebrew_ot index.
חָ֭לְפוּ עִם־אֳנִיּ֣וֹת אֵבֶ֑ה כְּ֝נֶ֗שֶׁר יָט֥וּשׂ עֲלֵי־אֹֽכֶל׃
False


In [18]:
hebrew_vowels = 0591-U+05BD, U+05BF-U+05C2, and U+05C4-U+05C7
table = str.maketrans(dict.fromkeys('aeiouAEIOU'))
'אֵבֶה'.translate(table)

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (1281267943.py, line 1)

In [None]:
#refs_and_terms_df['ref_exists'] =  refs_and_terms_df.refs.isin(hebrew_ot.index)

print(hebrew_ot.loc['GEN 1:1'])
#hebrew_ot


#refs_and_terms_df['term_exists'] = refs_and_terms_df.refs.isin(hebrew_ot.index)
#refs_and_terms_df.refs.isin(hebrew_ot.index[refs_and_terms_df.refs])

#refs_and_terms_df.term  # in hebrew_ot.loc[refs_and_terms_df.refs,'verse']
#refs_and_terms_df.refs.isin(hebrew_ot.index)

In [None]:
#How many of the 8648 major terms are in the All terms data?
major_terms['all_terms_exact'] = major_terms['term'].map(all_terms['term'].value_counts())
major_terms['all_terms_exact'] = major_terms['all_terms_exact'].fillna(0)
major_terms.sort_values('all_terms_exact',ascending=False)
major_terms

In [None]:
col = major_terms['all_terms_exact'] 
count = col[col != 0].count()
print(f"There are {count} major-metadata terms that appear exactly in the All metadata file.")
print(f"There are {len(major_terms) - count} major-metadata terms that don't appear exactly in the All metadata file.")


In [None]:
def count_matches(reference_col, source_col):
    matches = source_col.map(reference_col.value_counts()).fillna(0).astype(int)
    # Return count of values greater than 0 
    return matches[matches > 0].count()

#This is very slow.
#def find_matches(reference_col, source_col):
#    matches = [source for source in source_col if source in reference_col.unique()]
#    return matches

# This is also slow.
def find_matches(reference_col, source_col):
    matches = source_col.map(reference_col.value_counts()).fillna(0).astype(int)
    # Return values greater than 0 
    return matches[matches > 0]

# This is almost instant.
def find_matches(reference_col, source_col):
    return set(source_col).intersection(set(reference_col))

def report_matches(ref_df, ref_name, ref_columns, search_dict):
        
    for name, col in search_dict.items():
        all_matches = set()
        match_count = 0
        unique_values = col.unique()
        print(f"Searching for {len(unique_values)} terms from: '{name}' in the {ref_name} data.")
        for ref_column in ref_columns:
            matches = find_matches(ref_df[ref_column],col)
            match_count += len(matches)
            print(f"There are {len(matches)} found in the '{ref_column}' column.")
            all_matches = all_matches.union(matches)

        print(f"{match_count} '{name}' matched of which {len(all_matches)} are unique.\n")

In [None]:
# Make a dataframe from terms_and_refs
#terms_and_refs = pd.DataFrame.from_dict(terms_and_refs_dict, orient='index', dtype=str, columns=['verse_refs'])

# This idea from https://stackoverflow.com/questions/33504424/pandas-dataframe-from-dictionary-with-lists
# Makes one row, with the key as column labels.
#terms_and_refs = pd.DataFrame([terms_and_refs_dict])

#terms_and_refs = pd.DataFrame(reference_lists, index=terms) 
#terms_and_refs

In [None]:
# Count how many terms from All, Major and JHU (eng) occur in the Macula data.
# Macula Index(['Original unicode', 'Greek lemma', 'Greek normalized', 'Greek gloss', 'English gloss', 'Mandarin gloss']

terms_macula         = macula['Source']
terms_macula_english = macula['English gloss']

terms_all     = all_terms['term']
terms_major   = major_terms['term']
terms_jhu_eng = jhu['eng']

print(f"There are {count_matches(terms_macula,terms_all)}   all_terms out of {len(terms_all)} found in the Macula {terms_macula.name} column.")
print(f"There are {count_matches(terms_macula,terms_major)} major_terms out of {len(terms_major)} found in the Macula {terms_macula.name} column.")
print(f"There are {count_matches(terms_macula_english,terms_jhu_eng)} jhu_eng terms out of {len(terms_jhu_eng)} found in the Macula {terms_macula.name} column.\n")

# How many of these Original language terms are found exactly as a key in other lists?
macula_search_columns = ['Source', 'Greek lemma', 'Greek normalized', 'Greek gloss of Hebrew']
search = {'All terms':terms_all, 'Major terms':terms_major}
report_matches(macula, "macula", macula_search_columns, search)

Macula columns:
Source                      10851
English gloss                3215
Hebrew source                9534
English gloss of Hebrew      2569
Greek gloss of Hebrew        3435
Mandarin gloss of Hebrew     2396
Greek source                 1319
Greek lemma                   534
Greek normalized              867
English gloss of Greek        760

In [None]:
macula_search_columns = ['English gloss']
search = {'JHU eng terms':terms_jhu_eng}
report_matches(macula, "macula",  macula_search_columns, search)

macula_search_columns = ['Source', 'Greek lemma', 'Greek normalized', 'Greek gloss of Hebrew']
search = {'JHU grc_accented_terms' : jhu['grc_accented'], 'JHU ell_helenic1 terms' : jhu['ell_hellenic1']}
report_matches(macula, "macula", macula_search_columns, search)

In [None]:
matches_es = set(jhu['spa_blph']).intersection(set(major_terms['es']))
print(f"There are {len(set(jhu['spa_blph']))} unique words in the JHU Spanish list.")
print(f"{len(matches_es)} of the words in the JHU Spanish list match those in the Major metadata.")

[word for i, word in enumerate(matches_es) if i <10]


In [None]:
matches = set(jhu['cmn_sf_ncv']).intersection(set(macula['Mandarin gloss of Hebrew']))

print(f"There are {len(set(jhu['cmn_sf_ncv']))} terms in the JHU CMN list and {len(set(macula['Mandarin gloss of Hebrew']))} Mandarin glosses in Macula.")
print(f"There are {len(matches)} that match.")
print(matches)


### Checking names in translations.
Given a list of names from the Major Terms data, and the verse references for the names check that the names appear in the extract in the expected verses.


In [None]:
merged = macula.reset_index().merge(jhu, how="left", left_on='English gloss' ,right_on='eng', indicator=True).set_index('ref')

# In the '_merge' column there are two values 'both' for those rows in jhu that matched on 'eng' and left_only for those that didn't.
# Replace 'both' with 'matched on eng and English gloss'
merged['_merge'] = merged['_merge'].replace('both','matched on eng and English gloss')
move_column(merged,'eng',3)
merged

In [None]:
# Mandarin column is 'cmn_sf_ncv'
# Greek column is 'grc_accented'
# Hebrew column is 'heb_2009'

search_dict  = {'eng':['English gloss'], 'heb_2009':['Hebrew source'], 'grc_accented': ['Greek lemma', 'Greek source','Greek normalized'], 'cmn_sf_ncv' :['Mandarin gloss of Hebrew']}
search_dict  = {'eng':['English gloss']}

# This requires too much memory (12.8 GiB) 
# For those rows for which a match hasn't been found, see if we can match on other column pairs.
#for search_for, search_in_cols in search_dict.items():
#    for search_col in search_in_cols:
#        print(search_for, search_col)
#        merged = macula.reset_index().merge(jhu, how="left", left_on=search_col ,right_on=search_for, indicator=True).set_index('ref')
#        # In the '_merge' column there are two values 'both' for those rows in jhu that matched on 'eng' and left_only for those that didn't.
#        # Replace 'both' with an indication of the two columns used in the match'
#        merged['_merge'] = merged['_merge'].replace('both',f'matched on {search_for} and {search_col}')

matches = macula
#print(macula.index.is_monotonic)

# https://stackoverflow.com/questions/70460010/map-index-of-one-dataframe-to-column-of-another-dataframe
# df1['payment'] = df1.index.map(dict(zip(df2.index,df2['paid value'])))

# Add a match column to macula for each possible match in JHU.
# Create lookup tables maybe.
    # Find the unique list of glosses from Macula and from JHU in a given language.
    # 
for search_for, search_in_cols in search_dict.items():
    for search_col in search_in_cols:
        print(search_for, search_col)
        #matches[f'match {search_for} in {search_col}']
        # 
        print(jhu.index[jhu[search_for] == matches[search_col]])
