# Match Biblical names from multiple versions and across languages.
The JHU trabina project has done this with a list of 1128 names across 531 languages.
It would also be good to 'read' through the projects or extracts we have and do a similar matching.
We have some hand crafted data to get started: The Macula dataset and also the All Biblical Terms and Major Biblical Terms lists from Paratext.

In [1]:
#!/usr/bin/env python3

import csv
from collections import Counter
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import numpy as np
import os
import pandas as pd
from pathlib import Path
import sys

In [2]:
def get_macula_df(file, sep="\t"):
    """Read the Macula names table into an all-string DataFrame.

    The original note on this loader says the data had already been filtered
    to names only (using XBase) before it was saved.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        sep: Field delimiter; a tab by default.

    Returns:
        DataFrame whose columns are all read as ``str``, with missing cells
        replaced by empty strings.
    """
    # Columns listed by the original author for this file:
    # ['ref', 'Original unicode', 'Hebrew Original', 'Aramaic Original',
    #  'Greek Original', 'Greek lemma', 'Greek normalized', 'Greek gloss',
    #  'English gloss', 'Mandarin gloss']
    names_table = pd.read_csv(file, sep=sep, dtype=str)
    return names_table.fillna('')

In [3]:
#macula_data_tsv = data_folder / "macula_names.tsv"
#macula_df = get_macula_df(macula_data_tsv)
#print(f"There are {len(macula_df['Original unicode'].unique())} unique 'Original Unicode' terms in the Macula dataset.")

#macula_source_terms = macula_df.iloc[:, [0,4,5,6]]
#print(macula_source_terms)

#unique_terms = set(macula_df['Original unicode']) | set(macula_df['Greek lemma']) | set(macula_df['Greek normalized']) | set(macula_df['Greek gloss'])
#print(f"There are {len(unique_terms)} unique source language terms in the macula data.")


In [12]:
# Locations of the JHU trabina data used by the cells below.
data_folder = Path("D:/GitHub/davidbaines/trabina/data")
by_lang_folder = data_folder / "by-lang"
jhu_filename = "eng"
# Presumably the Macula column to compare the JHU English names against.
compare_col = "English gloss"
# NOTE(review): a later cell builds this path from by_lang_folder rather than
# data_folder — confirm which folder actually holds the "eng" file.
english_names_file = data_folder / jhu_filename

In [13]:
def read_tsv(file,column_names):
    """Read a header-less tab-separated file and name its columns.

    Args:
        file: Path or file-like object accepted by ``pandas.read_table``.
        column_names: Mapping from column position to the desired column
            name; positions not in the mapping keep their integer label.

    Returns:
        DataFrame of strings with missing cells replaced by ''.
    """
    table = pd.read_table(file, sep='\t', header=None, dtype=str).fillna('')
    return table.rename(columns=column_names)

In [19]:
# Load the Hebrew name occurrences. read_tsv treats the file as header-less,
# so the columns are named by position here.
hebrew_refs_and_names = data_folder / "hebrew_refs_and_names.tsv"
hebrew_cols = {0: 'ref', 1: 'Hebrew source', 2: 'English gloss of Hebrew', 3:'Greek gloss of Hebrew', 4:'Mandarin gloss of Hebrew'}
hebrew_refs = read_tsv(hebrew_refs_and_names, hebrew_cols)
hebrew_refs

Unnamed: 0,ref,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew
0,GEN 2:4!8,יְהוָ֥ה,LORD,,耶和华
1,GEN 2:5!15,יְהוָ֤ה,LORD,,耶和华
2,GEN 2:7!2,יְהוָ֨ה,LORD,,耶和华
3,GEN 2:8!2,יְהוָ֧ה,LORD,κύριος,耶和华
4,GEN 2:8!5,עֵ֖דֶן,Eden,εδεμ,伊甸
...,...,...,...,...,...
34182,MAL 3:22!3,מֹשֶׁ֣ה,Moses,μωυσῆ,摩西
34183,MAL 3:22!8,חֹרֵב֙,Horeb,ξωρηβ,何烈山
34184,MAL 3:22!11,יִשְׂרָאֵ֔ל,Israel,ισραηλ,以色列
34185,MAL 3:23!6,אֵלִיָּ֣ה,Elijah,ηλιαν,以利亚


In [30]:
# Drop the verse reference so that rows with identical source and glosses
# become exact duplicates, then keep the first occurrence of each.
hebrew = hebrew_refs.drop(columns='ref')
hebrew.drop_duplicates(subset=None, keep='first', inplace=True)
hebrew

Unnamed: 0,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew
0,יְהוָ֥ה,LORD,,耶和华
1,יְהוָ֤ה,LORD,,耶和华
2,יְהוָ֨ה,LORD,,耶和华
3,יְהוָ֧ה,LORD,κύριος,耶和华
4,עֵ֖דֶן,Eden,εδεμ,伊甸
...,...,...,...,...
34119,מַלְאָכִֽי,Malachi,ἀγγέλου,玛拉基
34137,אֲדֹנ,lord’s,,主
34141,לֵוִ֔י,Levi,,利未
34183,חֹרֵב֙,Horeb,ξωρηβ,何烈山


In [32]:
# Number of distinct 'Hebrew source' strings; spellings that differ only in
# their accent marks are different strings and are counted separately.
len(hebrew['Hebrew source'].unique())

9533

In [34]:
# Show every distinct gloss combination recorded for one exact Hebrew spelling.
hebrew.loc[hebrew['Hebrew source'] == 'יְהוָ֧ה']

Unnamed: 0,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew
3,יְהוָ֧ה,LORD,κύριος,耶和华
25,יְהוָ֧ה,LORD,κυρίου,耶和华
3536,יְהוָ֧ה,Lord,κύριος,耶和华
5557,יְהוָ֧ה,LORD,θεὸς,耶和华
7284,יְהוָ֧ה,LORD,κύριον,耶和华
7550,יְהוָ֧ה,Lord,,耶和华
7871,יְהוָ֧ה,Lord,κύριον,耶和华
13991,יְהוָ֧ה,LORD,,耶和华
15819,יְהוָ֧ה,LORD’s,,耶和华
22060,יְהוָ֧ה,LORD’s,κυρίου,耶和华


In [52]:
# It might be useful to combine all the potential glosses of a word in a single cell.

# https://stackoverflow.com/questions/58435058/combine-column-values-based-on-the-other-column-values-in-pandas-data-frame
#array_agg = lambda x: '/'.join(x.astype(str))
#grp_df = df.groupby(['line', 'priority'], as_index=False).agg({'to_line': array_agg})

#df.groupby(['line','priority'])['to_line'].apply(lambda x: '/'.join(str(y) for y in x)).reset_index(name='to_line')

#hebrew_grouped = hebrew.groupby(['Hebrew source'], as_index=False).agg({'English gloss of Hebrew': array_agg})#, 'Greek gloss of Hebrew': array_agg,'Mandarin gloss of Hebrew': array_agg})
#for line in hebrew_grouped[0:100]:
#    print(line)

# Print, for each distinct Hebrew form, its distinct English glosses joined
# with '|'. The glosses are sorted because set iteration order is arbitrary:
# without sorting, the same pair of glosses can come out in either order
# (the recorded output had both "Amorite|Amorites" and "Amorites|Amorite").
for line in hebrew.groupby(['Hebrew source'])['English gloss of Hebrew'].apply(lambda x: '|'.join(sorted(str(y) for y in set(x)))):
    print(line)

Ulam
Ur
Uriah
On
Ur
Ara
Edom
Edomites|Edom
Edom
Edom
Edomites|Edom
Edom
Edom
Edomites|Edom
Edom
Edom
Edomites|Edom
Edom
Edom
Edom
Edomites|Edom
Edomites|Edom
Evi
Elul
Elizaphan
Eliphelehu
Elishama
Eliphelehu
Elizaphan
Elishama
Eliathah
Eliel
Eliel
Eliel
Eliel
Eliel
Eliel
Eliab
Eliab
Eliab
Eliab
Eliab
Eliab
Eliathah
Eliab
Eliab’s
Eliab
Eliab
Eliab
Elidad
Elihu
Elihu
Elihu
Elihu
Elihu
Elihu
Elihoreph
Elimelech
Elimelech
Elimelech
Elimelech
Elienai
Eliezer
Eliezer
Eliezer
Eliezer
Eliezer
Eliezer
Eliezer
Eliezer
Eliezer
Eliam
Eliam
Eliphelet
Eliphelet
Eliphelet
Eliphaz
Eliphaz
Eliphaz
Eliphaz
Eliphaz
Eliphaz
Eliphal
Eliphaz
Eliphaz
Eliphaz
Eliphelet
Elizur
Elizaphan
Elika
Elishua
Elisheba
Elisha
Elisha
Elishah
Elisha
Elisha
Elisha
Elisha
Elisha
Elisha
Elishah
Elisha
Elisha
Elisha
Elisha
Elisha
Elishama
Elishama
Elishama
Elishama
Elishama
Elisha
Elisha
Elishaphat
Amorites
Amorites
Amorite|Amorites
Amorites|Amorite
Amorites|Amorite
Amorites
Amorites
Amorites
Amorites
Amorite|Amorites
Amorite

In [23]:
# Load the Greek name occurrences. read_tsv treats the file as header-less,
# so the columns are named by position here.
greek_refs_and_names = data_folder / "greek_refs_and_names.tsv"
greek_cols = {0:'ref', 1:'Greek source', 2:'Greek lemma', 3:'Greek normalized', 4:'English gloss of greek'}
greek_refs = read_tsv(greek_refs_and_names, greek_cols)
greek_refs

Unnamed: 0,ref,Greek source,Greek lemma,Greek normalized,English gloss of greek
0,MAT 1:1!3,Ἰησοῦ,Ἰησοῦς,Ἰησοῦ,of Jesus
1,MAT 1:1!4,Χριστοῦ,Χριστός,Χριστοῦ,Christ
2,MAT 1:1!6,Δαυεὶδ,Δαυίδ,Δαυείδ,of David
3,MAT 1:1!8,Ἀβραάμ.,Ἀβραάμ,Ἀβραάμ,of Abraham
4,MAT 1:2!1,Ἀβραὰμ,Ἀβραάμ,Ἀβραάμ,Abraham
...,...,...,...,...,...
9257,REV 22:13!6,,Ὦ,Ὦ,
9258,REV 22:16!2,,Ἰησοῦς,Ἰησοῦς,
9259,REV 22:16!20,,Δαυίδ,Δαυείδ,
9260,REV 22:20!11,,Ἰησοῦς,Ἰησοῦ,


In [29]:
# Drop the verse reference so that rows with identical source, lemma and gloss
# become exact duplicates, then keep the first occurrence of each.
greek = greek_refs.drop(columns='ref')
greek.drop_duplicates(subset=None, keep='first', inplace=True)
greek

Unnamed: 0,Greek source,Greek lemma,Greek normalized,English gloss of greek
0,Ἰησοῦ,Ἰησοῦς,Ἰησοῦ,of Jesus
1,Χριστοῦ,Χριστός,Χριστοῦ,Christ
2,Δαυεὶδ,Δαυίδ,Δαυείδ,of David
3,Ἀβραάμ.,Ἀβραάμ,Ἀβραάμ,of Abraham
4,Ἀβραὰμ,Ἀβραάμ,Ἀβραάμ,Abraham
...,...,...,...,...
9233,,Εὐφράτης,Εὐφράτην,
9234,,Ἁρμαγεδών,Ἁρμαγεδών,
9236,,Βαβυλών,ΒΑΒΥΛΩΝ,
9248,,Γώγ,Γώγ,


In [5]:
# NOTE(review): the recorded output of this cell is
# "NameError: name 'macula_df' is not defined". macula_df is only created by a
# later cell that calls get_macula_df, so that cell has to be run first.
# Drop the reference and the per-language original-text columns.
simple_df = macula_df.drop(columns= ['ref','Hebrew Original', 'Aramaic Original', 'Greek Original'])
#Remove duplicate rows from the simple_df
simple_df.drop_duplicates(subset=None, keep='first', inplace=True)

print(f"\nThe simple macula dataframe has these columns:\n{simple_df.columns}\n")
#print(simple_df)

NameError: name 'macula_df' is not defined

In [None]:
# Collect every distinct cell value found in any column of simple_df.
all_unique = set()
for col in simple_df.columns:
    unique = set(simple_df[col].unique())
    #print(unique,type(unique))
    all_unique = all_unique | unique

print(f"There are a total of {len(all_unique)} unique terms we can match on across the five languages in the Macula dataset.\n")
# Per-column distinct counts, taken from the full macula_df rather than simple_df.
print(f"{macula_df.nunique()}")

In [None]:
# Paths for the JHU and Macula data, then load and simplify the Macula table.
# NOTE(review): data_folder here differs from the earlier configuration cell,
# which points at D:/GitHub/davidbaines/trabina/data — confirm which checkout
# is intended.
data_folder = Path("D:/GitHub/trabina/data") 
by_lang_folder = data_folder / "by-lang"
jhu_filename = "eng"
compare_col = "English gloss"
english_names_file = by_lang_folder / jhu_filename
updated_macula_data_tsv = data_folder / "updated_macula_names.tsv"

macula_data_tsv = data_folder / "macula_names.tsv"
macula_df = get_macula_df(macula_data_tsv)

# Drop the reference and the per-language original-text columns.
simple_df = macula_df.drop(columns= ['ref','Hebrew Original', 'Aramaic Original', 'Greek Original'])
#Remove duplicate rows from the simple_df
simple_df.drop_duplicates(subset=None, keep='first', inplace=True)

print(f"\nThe simple macula dataframe has these columns:\n{simple_df.columns}")
simple_df

In [None]:
def get_name_matrix(folder):
    """Read every name-list file in *folder* into a dict keyed by file name.

    Each file is read as UTF-8 with one name per line; the trailing newline
    is removed and the name is title-cased.

    Args:
        folder: Directory (``str`` or ``Path``) holding the name lists.

    Returns:
        dict mapping each file's name to its list of names, with files
        visited in sorted order. Sub-directories are skipped.
    """
    all_names = {}

    # glob('*') also yields sub-directories, which cannot be opened as text
    # files, so only regular files are read.
    for file in sorted(Path(folder).glob('*')):
        if not file.is_file():
            continue
        with open(file, 'r', encoding='utf-8') as fin:
            all_names[file.name] = [name.strip('\n').title() for name in fin]

    return all_names

In [None]:
#Get all the names and make a dataframe
# One entry per file in by_lang_folder, each becoming a column of jhu_df.
all_jhu_names = get_name_matrix(by_lang_folder)
jhu_df = pd.DataFrame.from_dict(all_jhu_names, dtype=str)

print(f"\nThe JHU dataframe has these columns:\n {jhu_df.columns}")  
jhu_df


In [None]:
#jhu_df = jhu_df.set_index(["eng"], drop=True, append=False, inplace=False, verify_integrity=False)
# Summary statistics for each column of the JHU table.
jhu_df.describe()

Not sure that merging like this is useful.

In [None]:
# Inner-join the two dataframes on exact matches between the Macula
# 'English gloss' column and the JHU 'eng' column. Both key columns are kept.
# NOTE(review): with how='inner' the indicator column should only ever hold
# 'both', so "matched_on_eng" may carry no information — confirm the intent.

merged_df = pd.merge(simple_df, jhu_df, how='inner', left_on='English gloss' ,right_on='eng', indicator=True)
merged_df.rename(columns={"_merge": "matched_on_eng"},inplace=True)
merged_df

In [None]:
# Read in the All terms data from silnlp
sil_assets_path = Path('D:/GitHub/davidbaines/trabina/silnlp/assets')
all_terms_file = sil_assets_path / 'All-metadata.txt'

# Only the first tab-separated column is read; squeeze turns the resulting
# one-column frame into a Series.
all_terms = pd.read_table(all_terms_file,header=None, usecols=[0]).squeeze("columns")
# Series.rename returns a new Series, so the result must be assigned for the
# name to take effect.
all_terms = all_terms.rename('terms')
print(f"There are {len(all_terms.unique())} unique terms in the All terms dataset.")


In [None]:
# Folder holding the Paratext key-term asset files read by read_assets_data.
assets_folder = Path('D:/GitHub/davidbaines/trabina/silnlp/assets')
# NOTE(review): patterns is not referenced by the visible cells, which pass
# the list names to read_assets_data literally.
patterns = ['All', 'Major', 'SilNt'] #'Pt6' doesn't have any glosses.

def read_assets_data(folder, pattern):
    """Read one set of Paratext key-term asset files into dataframes.

    For a term list ``pattern`` the folder is expected to hold
    ``{pattern}-metadata.txt``, ``{pattern}-vrefs.txt`` and zero or more
    ``{iso}-{pattern}-glosses.txt`` files. Gloss files are attached to the
    metadata by line position, one column per language code.

    Args:
        folder: Assets directory (``str`` or ``Path``).
        pattern: Term list to read: 'All', 'Major' or 'SilNt'.

    Returns:
        ``(terms, vrefs)``. ``terms`` is a DataFrame with a ``term`` column,
        list-specific extra columns (``AR``/``DC`` flags for 'All' and
        'Major'; ``domain``, ``category`` and ``sense`` for 'Major';
        ``category`` for 'SilNt') and one gloss column per language found.
        ``vrefs`` is a DataFrame with a single ``vrefs`` column holding the
        list of tab-separated references of each line.

    Raises:
        ValueError: If *pattern* is not one of the supported term lists.
    """
    if pattern not in ('All', 'Major', 'SilNt'):
        # Previously an unknown pattern only failed later with an
        # UnboundLocalError on `terms`.
        raise ValueError(f"Unsupported pattern: {pattern!r}")

    folder = Path(folder)
    metadata_file = folder / f"{pattern}-metadata.txt"
    # Sorted so that the gloss columns come out in a stable order.
    glosses_files = sorted(folder.glob(f"*-{pattern}-glosses.txt"))
    vrefs_file = folder / f"{pattern}-vrefs.txt"
    print(f"Reading in {pattern} files.")

    # Reading in vrefs is the same for all patterns: each line becomes one
    # list of references.
    # NOTE(review): blank lines are skipped by read_csv's defaults, so a term
    # with no references would shift the following rows — confirm the file
    # never contains empty lines.
    vrefs = pd.read_csv(vrefs_file, names=['vrefs'], converters={'vrefs': lambda x: x.split('\t')})

    if pattern == 'All':
        # This dataset doesn't include sense numbers. Only the first column
        # contains data; it may carry a trailing "(DC)" or "(AR)" marker,
        # which is split off into boolean flags.
        terms = pd.read_table(metadata_file, header=None, usecols=[0]).squeeze("columns")
        terms = terms.str.split(' ', expand=True)
        terms.rename({0: "term", 1: "note"}, axis="columns", inplace=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'], inplace=True)

    elif pattern == 'Major':
        terms = pd.read_table(metadata_file, header=None)
        terms.rename({0: "term", 1: "domain", 2: 'category'}, axis="columns", inplace=True)
        # `n` is passed by keyword: pandas 2.x only accepts `pat` positionally.
        terms[['term', 'note']] = terms['term'].str.split(' ', n=1, expand=True)
        # The part after the first '-' of the term is kept as its sense.
        terms[['term', 'sense']] = terms['term'].str.split('-', n=1, expand=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'], inplace=True)

    else:  # 'SilNt'
        terms = pd.read_table(metadata_file, header=None)
        terms.rename({0: "term", 1: "domain", 2: 'category'}, axis="columns", inplace=True)
        # The domain column is empty.
        terms.drop(columns=['domain'], inplace=True)

    for gloss_file in glosses_files:
        # The language code is the part of the file name before the first '-'.
        iso = gloss_file.name.partition("-")[0]
        # Blank lines are kept as missing values so that every gloss stays on
        # the same row as its term instead of shifting up.
        glosses = pd.read_table(
            gloss_file, header=None, usecols=[0], skip_blank_lines=False
        ).squeeze("columns")
        terms[iso] = glosses
        terms[iso] = terms[iso].fillna('')

    return terms, vrefs


# Load the three term lists. This rebinds all_terms to the DataFrame
# returned by read_assets_data.
all_terms, all_vrefs = read_assets_data(assets_folder, 'All')
major_terms, major_vrefs = read_assets_data(assets_folder, 'Major')
silnt_terms, silnt_vrefs = read_assets_data(assets_folder, 'SilNt')

# Distinct-value counts per column for each term list.
print(f"\nAll terms:\n{all_terms.nunique()}\n")
print(f"Major terms:\n{major_terms.nunique()}\n")
print(f"SilNt terms:\n{silnt_terms.nunique()}\n")


#print(f"{pattern} terms:\n{terms}\n")
print(f"Major vrefs:\n{major_vrefs}\n")
#print(f"There are {len(vrefs)} lists of verse references. ")
    

In [None]:
# Save the Major terms table as tab-separated text at a fixed absolute path.
major_terms.to_csv(r"D:\GitHub\davidbaines\trabina\data\major_terms.txt", sep = '\t')

In [None]:
#How many of the 8648 major terms are in the All terms data?
# For each Major term, look up how often that exact string occurs among the
# All terms; terms that never occur map to NaN and are then set to 0.
major_terms['all_terms_exact'] = major_terms['term'].map(all_terms['term'].value_counts())
major_terms['all_terms_exact'] = major_terms['all_terms_exact'].fillna(0)
# Displayed only: the sorted result is not assigned back to major_terms.
major_terms.sort_values('all_terms_exact',ascending=False)

In [None]:
# These major terms aren't found exactly in the All-metadata.
# Missing counts were already replaced by 0 when the column was built, so
# the filter compares with 0 rather than using isna().
# major_terms.loc[major_terms['all_terms_exact'].isna()]
major_terms.loc[major_terms['all_terms_exact'] == 0]

In [None]:
# Summarise how many Major terms have a non-zero exact-match count.
col = major_terms['all_terms_exact'] 
count = col[col != 0].count()
print(f"There are {count} major-metadata terms that appear exactly in the All metadata file.")
print(f"There are {len(major_terms) - count} major-metadata terms that don't appear exactly in the All metadata file.")


In [None]:
def count_matches(reference_col, source_col):
    """Count the entries of *source_col* that occur in *reference_col*.

    Repeated entries of *source_col* are counted once per occurrence.

    Args:
        reference_col: pandas Series of values to look entries up in.
        source_col: pandas Series of entries to look up.

    Returns:
        Number of entries of *source_col* found at least once in
        *reference_col*.
    """
    occurrences = reference_col.value_counts()
    hits_per_entry = source_col.map(occurrences).fillna(0).astype(int)
    return (hits_per_entry > 0).sum()

#This is very slow.
#def find_matches(reference_col, source_col):
#    matches = [source for source in source_col if source in reference_col.unique()]
#    return matches

# This is also slow.
def find_matches(reference_col, source_col):
    """Return, per matching entry of *source_col*, its count in *reference_col*.

    A later definition in this file reuses the name ``find_matches``; once
    that definition has run it replaces this one.

    Args:
        reference_col: pandas Series of values to look entries up in.
        source_col: pandas Series of entries to look up.

    Returns:
        Series indexed like *source_col*, restricted to entries found in
        *reference_col*, holding how often each occurs there.
    """
    hit_counts = source_col.map(reference_col.value_counts()).fillna(0).astype(int)
    return hit_counts.loc[hit_counts.gt(0)]

# This is almost instant.
def find_matches(reference_col, source_col):
    """Return the set of distinct values present in both columns.

    Args:
        reference_col: Iterable of hashable values.
        source_col: Iterable of hashable values.

    Returns:
        set of the values that occur in both iterables.
    """
    return set(source_col) & set(reference_col)

def report_matches(ref_df, ref_columns, search_dict):
    """Print how many search terms occur in each reference column.

    Args:
        ref_df: DataFrame holding the reference columns.
        ref_columns: Names of the columns of *ref_df* to search in.
        search_dict: Mapping from a display name to the pandas Series of
            terms to search for.

    Returns:
        None; the report is written with ``print``.
    """
    for name, col in search_dict.items():
        print(f"Searching for {len(col.unique())} terms from: '{name}'.")

        distinct_hits = set()
        total_hits = 0
        for ref_column in ref_columns:
            found = find_matches(ref_df[ref_column], col)
            total_hits += len(found)
            print(f"There are {len(found)} found in the '{ref_column}' column.")
            # A term found in several columns is only kept once here.
            distinct_hits.update(found)

        print(f"{total_hits} '{name}' matched of which {len(distinct_hits)} are unique.")
        #print(sorted(distinct_hits))
        print('\n')

In [None]:
# Count how many terms from All, Major and JHU (eng) occur in the Macula data.
# Macula Index(['Original unicode', 'Greek lemma', 'Greek normalized', 'Greek gloss', 'English gloss', 'Mandarin gloss']

# Columns of the de-duplicated Macula table used as lookup targets.
terms_macula         = simple_df['Original unicode']
terms_macula_english = simple_df['English gloss']

# Term columns of the lists that are searched for.
terms_all     = all_terms['term']
terms_major   = major_terms['term']
terms_jhu_eng = jhu_df['eng']

#print(f"There are {count_matches(terms_macula,terms_all)}   all_terms out of {len(terms_all)} found in the Macula 'Original unicode' column.")
#print(f"There are {count_matches(terms_macula,terms_major)} major_terms out of {len(terms_major)} found in the Macula 'Original unicode' column.")
#print(f"There are {count_matches(terms_macula_english,terms_jhu_eng)} jhu_eng terms out of {len(terms_jhu_eng)} found in the Macula 'Original unicode' column.")

# How many of these Original language terms are found exactly as a key in other lists?
# The search runs against the full macula_df, which still has the
# original-text columns that simple_df dropped.
macula_search_columns = ['Original unicode', 'Hebrew Original', 'Aramaic Original', 'Greek Original', 'Greek lemma', 'Greek normalized', 'Greek gloss']
search = {'All terms':terms_all, 'Major terms':terms_major}
report_matches(macula_df, macula_search_columns, search)


In [None]:
# Search the Macula English glosses for the JHU English names.
macula_search_columns = ['English gloss']
search = {'JHU eng terms':terms_jhu_eng}
report_matches(macula_df, macula_search_columns, search)

# Search the Macula Greek columns for the two JHU Greek name lists.
macula_search_columns = ['Original unicode', 'Greek Original', 'Greek lemma', 'Greek normalized', 'Greek gloss']
search = {'JHU grc_accented_terms' : jhu_df['grc_accented'], 'JHU ell_helenic1 terms' : jhu_df['ell_hellenic1']}
report_matches(macula_df, macula_search_columns, search)

In [None]:
def checker(wrong_options,correct_options):
    """Find the best match in *correct_options* for each of *wrong_options*.

    An option that occurs verbatim among the correct options is matched to
    itself with a score of 100; every other option is matched with
    ``process.extractOne`` using ``fuzz.token_set_ratio``.

    Args:
        wrong_options: Iterable of strings to look up.
        correct_options: Collection of candidate strings (list, pandas
            Series, ...). Its values must be hashable.

    Returns:
        ``(names, ratios)``: two lists parallel to *wrong_options* holding
        the matched candidate and its score. When there is no candidate at
        all the entry is ``None`` with a score of 0.
    """
    # Exact membership is tested against the values. `x in series` on a
    # pandas Series checks the index labels instead, so exact matches in a
    # Series were previously missed.
    exact_options = set(correct_options)

    names_array = []
    ratio_array = []
    for wrong_option in wrong_options:
        if wrong_option in exact_options:
            names_array.append(wrong_option)
            # An int, so that it is comparable with the fuzzy scores below.
            ratio_array.append(100)
            continue

        best = process.extractOne(wrong_option, correct_options, scorer=fuzz.token_set_ratio)
        if best is None:
            # extractOne found no candidate to score.
            names_array.append(None)
            ratio_array.append(0)
            continue
        names_array.append(best[0])
        ratio_array.append(best[1])
    return names_array, ratio_array

In [None]:
def count_matches(reference_col, source_col):
    """Count the entries of *source_col* that occur in *reference_col*.

    This replaces the earlier definition of the same name in this file and
    computes the same result. Repeated entries of *source_col* are counted
    once per occurrence.

    Args:
        reference_col: pandas Series of values to look entries up in.
        source_col: pandas Series of entries to look up.

    Returns:
        Number of entries of *source_col* found at least once in
        *reference_col*.
    """
    # The two list copies of the columns built here previously were never
    # used, so they have been removed.
    matches = source_col.map(reference_col.value_counts()).fillna(0).astype(int)
    # Return count of values greater than 0
    return matches[matches > 0].count()

In [None]:
# For each Major term, how often it occurs in the Macula 'Original unicode'
# column (0 when it never occurs).
major_matches = terms_major.map(terms_macula.value_counts()).fillna(0).astype(int)
# Return count of values greater than 0 
print(major_matches[major_matches > 0].count())
#print(major_matches)

#all_name_match,   all_ratio_match = checker(terms_macula,terms_all[0:100])
#major_name_match, major_ratio_match=checker(terms_macula,terms_major)

#terms_df['fuzzy_match']=pd.Series(all_name_match)
#terms_df['fuzzy_ratio']=pd.Series(all_ratio_match)
#print(all_name_match,all_ratio_match)

In [None]:
#print(len(all_name_match),len(all_ratio_match))
#all_fuzzy_matches = pd.DataFrame.from_dict({'Original unicode': terms_macula, 'all_metadata fuzzy match' : all_name_match, 'all_metadata fuzzy ratio' :all_ratio_match})
#all_fuzzy_matches

In [None]:
def get_terms_from_files(folder,filenames):
    """Read the named term files from *folder* into a dict keyed by file name.

    Each file is read as UTF-8 with one term per line; the trailing newline
    is removed and the term is title-cased.

    Args:
        folder: Directory (``str`` or ``Path``) containing the files.
        filenames: Names of the files to read, relative to *folder*.

    Returns:
        dict mapping each file's name to its list of terms, in the order
        the files were requested.
    """
    base = Path(folder)
    paths = [base / filename for filename in filenames]

    terms_by_file = {}
    for path in paths:
        with open(path, 'r', encoding='utf-8') as handle:
            terms_by_file[path.name] = [line.strip('\n').title() for line in handle]

    return terms_by_file

In [None]:
# Exact-string overlap between the JHU 'spa_blph' names and the 'es' gloss
# column of the Major terms.
matches_es = set(jhu_df['spa_blph']).intersection(set(major_terms['es']))
print(f"There are {len(set(jhu_df['spa_blph']))} unique words in the JHU Spanish list.")
print(f"{len(matches_es)} of the words in the JHU Spanish list match those in the Major metadata.")

# Show up to ten of the matches; a set has no defined order, so which ten
# are shown is arbitrary.
[word for i, word in enumerate(matches_es) if i <10]


In [None]:
# Exact-string overlap between the JHU 'cmn_sf_ncv' names and the Macula
# Mandarin glosses.
matches = set(jhu_df['cmn_sf_ncv']).intersection(set(simple_df['Mandarin gloss']))

print(f"There are {len(set(jhu_df['cmn_sf_ncv']))} terms in the JHU CMN list and {len(set(simple_df['Mandarin gloss']))} Mandarin glosses in Macula.")
print(f"{len(matches)} match.")
print(matches)


### Checking names in translations.
Given a list of names from the Major Terms data and the verse references for those names, check that the names appear in the extract in the expected verses.


In [None]:
# NOTE(review): major_metadata is not defined in any visible cell (the Major
# terms table loaded earlier is named major_terms) — confirm the intended name.
major_metadata