# Match Biblical names from multiple versions and across languages.
The JHU trabina project has done this with a list of 1128 names across 531 languages.
It would be good also to 'read' through the projects or extracts we have to do a similar matching.
We have some hand crafted data to get started: The Macula dataset and also the All Biblical Terms and Major Biblical Terms lists from Paratext.

In [1]:
#!/usr/bin/env python3

import csv
from collections import Counter
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import numpy as np
import os
import pandas as pd
from pathlib import Path
import sys

In [2]:
def get_macula_df(file, sep="\t"):

    # Load the macula data. Note that the data has already been filtered for names-only using XBase.
    macula_df = pd.read_csv(file, dtype=str, sep=sep)
    macula_df.fillna('', inplace=True)
    
    #print(macula_df)
    
    #Index(['ref', 'Original unicode', 'Hebrew Original', 'Aramaic Original', 'Greek Original', 'Greek lemma', 'Greek normalized', 'Greek gloss', 'English gloss', 'Mandarin gloss']
    
    return macula_df

In [3]:
data_folder = Path("D:/GitHub/trabina/data") 
by_lang_folder = data_folder / "by-lang"
jhu_filename = "eng"
compare_col = "English gloss"
english_names_file = by_lang_folder / jhu_filename
updated_macula_data_tsv = data_folder / "updated_macula_names.tsv"

macula_data_tsv = data_folder / "macula_names.tsv"
macula_df = get_macula_df(macula_data_tsv)
print(f"There are {len(macula_df['Original unicode'].unique())} unique 'Original Unicode' terms in the Macula dataset.")

#macula_source_terms = macula_df.iloc[:, [0,4,5,6]]
#print(macula_source_terms)

unique_terms = set(macula_df['Original unicode']) | set(macula_df['Greek lemma']) | set(macula_df['Greek normalized']) | set(macula_df['Greek gloss'])
print(f"There are {len(unique_terms)} unique source language terms in the macula data.")


There are 10851 unique 'Original Unicode' terms in the Macula dataset.
There are 14870 unique source language terms in the macula data.


In [4]:
simple_df = macula_df.drop(columns= ['ref','Hebrew Original', 'Aramaic Original', 'Greek Original'])
#Remove duplicate rows from the simple_df
simple_df.drop_duplicates(subset=None, keep='first', inplace=True)

print(f"\nThe simple macula dataframe has these columns:\n{simple_df.columns}\n")
#print(simple_df)


The simple macula dataframe has these columns:
Index(['Original unicode', 'Greek lemma', 'Greek normalized', 'Greek gloss',
       'English gloss', 'Mandarin gloss'],
      dtype='object')



In [5]:
all_unique = set()
for col in simple_df.columns:
    unique = set(simple_df[col].unique())
    #print(unique,type(unique))
    all_unique = all_unique | unique

print(f"There are a total of {len(all_unique)} unique terms we can match on across the five languages in the Macula dataset.\n")
print(f"{macula_df.nunique()}")

There are a total of 20479 unique terms we can match on across the five languages in the Macula dataset.

ref                 16629
Original unicode    10851
Hebrew Original      9390
Aramaic Original      181
Greek Original       1319
Greek lemma           534
Greek normalized      867
Greek gloss          3435
English gloss        3215
Mandarin gloss       2396
dtype: int64


In [6]:
data_folder = Path("D:/GitHub/trabina/data") 
by_lang_folder = data_folder / "by-lang"
jhu_filename = "eng"
compare_col = "English gloss"
english_names_file = by_lang_folder / jhu_filename
updated_macula_data_tsv = data_folder / "updated_macula_names.tsv"

macula_data_tsv = data_folder / "macula_names.tsv"
macula_df = get_macula_df(macula_data_tsv)

simple_df = macula_df.drop(columns= ['ref','Hebrew Original', 'Aramaic Original', 'Greek Original'])
#Remove duplicate rows from the simple_df
simple_df.drop_duplicates(subset=None, keep='first', inplace=True)

print(f"\nThe simple macula dataframe has these columns:\n{simple_df.columns}")
simple_df


The simple macula dataframe has these columns:
Index(['Original unicode', 'Greek lemma', 'Greek normalized', 'Greek gloss',
       'English gloss', 'Mandarin gloss'],
      dtype='object')


Unnamed: 0,Original unicode,Greek lemma,Greek normalized,Greek gloss,English gloss,Mandarin gloss
0,יְהוָ֥ה,,,,LORD,耶和华
1,יְהוָ֤ה,,,,LORD,耶和华
2,יְהוָ֨ה,,,,LORD,耶和华
3,יְהוָ֧ה,,,κύριος,LORD,耶和华
4,עֵ֖דֶן,,,εδεμ,Eden,伊甸
...,...,...,...,...,...,...
38793,ΒΑΒΥΛΩΝ,Βαβυλών,ΒΑΒΥΛΩΝ,,Babylon,
38801,Ἰησοῦ,Ἰησοῦς,Ἰησοῦ,,,
38802,Χριστοῦ,Χριστός,Χριστοῦ,,,
38805,Γὼγ,Γώγ,Γώγ,,Gog,


In [7]:
def get_name_matrix(folder):
    
    all_names = dict()
    
    folder = Path(folder)
    files = sorted(folder.glob(r'*'))
    #print([file.name[0:3] for file in files])
    
    for file in files:
        with open(file, 'r', encoding='utf-8') as fin:
            names = [name.strip('\n').title() for name in fin.readlines()]
            all_names[file.name] = names   
    
    return all_names

In [8]:
#Get all the names and make a dataframe
all_jhu_names = get_name_matrix(by_lang_folder)
jhu_df = pd.DataFrame.from_dict(all_jhu_names, dtype=str)

print(f"\nThe JHU dataframe has these columns:\n {jhu_df.columns}")  
jhu_df



The JHU dataframe has these columns:
 Index(['aai_aai', 'aak_aak', 'aau_aau', 'abt_maprik', 'aby_aby', 'acd_acd',
       'ace_ace', 'acf_acf', 'acn_acn', 'acr_cubulcu',
       ...
       'tpa_tpa', 'tpi_tpi', 'tpm_tpm', 'tsn_1908', 'tur_2009', 'ukr_1871',
       'urd_arabic', 'vie_1926compounds', 'xho_1996', 'zul_zul'],
      dtype='object', length=592)


Unnamed: 0,aai_aai,aak_aak,aau_aau,abt_maprik,aby_aby,acd_acd,ace_ace,acf_acf,acn_acn,acr_cubulcu,...,tpa_tpa,tpi_tpi,tpm_tpm,tsn_1908,tur_2009,ukr_1871,urd_arabic,vie_1926compounds,xho_1996,zul_zul
0,Aaron,Erono,Aron,Eron,Eroni,Aron,Harun,Éronn,Aron,Aaron,...,-,Aron,Aarɔn,Arone,Harun,Аарон,ہارون,A-Rôn,Uaron,Ku-Aroni
1,Abaddon,Abadonoyɨ,Abadon,Abadon,Abadoni,Abadon,Abadon,Abadonn,Abadon,Abadon,...,-,Abadon,Abadɔn,Abatone,Abadon,Авадон,ابدون,A-Ba-Đôn,Uapoliyon,Lingu-Abadoni
2,-,-,-,-,-,-,-,-,-,-,...,-,Abarim,Abarim,-,-,-,-,A-Ba-Rim,,-
3,Abba,Ápe,Abba,Wao,Ufane,Sɛi,Oe,Papa,Aba,Chawesaj,...,Ama'U,Aba,Kote,Aba,Abba,Авва,ابّا,A-Ba,Tata,Aba
4,-,-,-,-,-,-,-,-,-,-,...,-,Apdon,Abdɔn,-,-,-,-,Áp-Đôn,Uabdon,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,-,-,-,-,-,-,-,-,-,-,...,-,Soba,Zoba,-,-,-,-,Xô-Ba,Wasezobha,-
1125,-,-,-,-,-,-,-,-,-,-,...,-,Sora,Zora,-,-,-,-,Xô-Ra,Ezora,-
1126,Zerubbabel,Serababero,Serubabel,Serababel,Serubabeo,Serubabelɛ,Zerubabel,Zèròbabèl,Zerubabe,Zorobabel,...,-,Serubabel,Zɛrubabɛl,Serubabele,Zerubabel,Заровавель,زرُبابل,Xô-Rô-Ba-Bên,Uzerubhabheli,Uzorobabeli
1127,-,-,-,-,-,-,-,-,-,-,...,-,Suf,Zuf,-,-,-,-,Xu-Phơ,Kazufi,-


In [9]:
#jhu_df = jhu_df.set_index(["eng"], drop=True, append=False, inplace=False, verify_integrity=False)
jhu_df.describe()

Unnamed: 0,aai_aai,aak_aak,aau_aau,abt_maprik,aby_aby,acd_acd,ace_ace,acf_acf,acn_acn,acr_cubulcu,...,tpa_tpa,tpi_tpi,tpm_tpm,tsn_1908,tur_2009,ukr_1871,urd_arabic,vie_1926compounds,xho_1996,zul_zul
count,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129,...,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129
unique,483,472,481,465,456,478,470,491,471,482,...,96,974,953,488,425,492,480,988,1006,493
top,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
freq,556,573,549,576,583,558,568,553,551,555,...,1006,33,70,548,619,551,554,26,50,568


Not sure that merging like this is useful.

In [10]:
#Concatenate the two dataframes joining on exact matches of 
# 'English gloss' and jhu_eng columns. Retain both.

merged_df = pd.merge(simple_df, jhu_df, how='inner', left_on='English gloss' ,right_on='eng', indicator=True)
merged_df.rename(columns={"_merge": "matched_on_eng"},inplace=True)
merged_df

Unnamed: 0,Original unicode,Greek lemma,Greek normalized,Greek gloss,English gloss,Mandarin gloss,aai_aai,aak_aak,aau_aau,abt_maprik,...,tpi_tpi,tpm_tpm,tsn_1908,tur_2009,ukr_1871,urd_arabic,vie_1926compounds,xho_1996,zul_zul,matched_on_eng
0,עֵ֖דֶן,,,εδεμ,Eden,伊甸,-,-,-,-,...,Iden,Idin,-,Eden,-,-,Ê-Đen,Eden,-,both
1,עֵ֔דֶן,,,εδεμ,Eden,伊甸,-,-,-,-,...,Iden,Idin,-,Eden,-,-,Ê-Đen,Eden,-,both
2,עֵ֔דֶן,,,,Eden,伊甸,-,-,-,-,...,Iden,Idin,-,Eden,-,-,Ê-Đen,Eden,-,both
3,עֵ֑דֶן,,,τρυφῆς,Eden,伊甸,-,-,-,-,...,Iden,Idin,-,Eden,-,-,Ê-Đen,Eden,-,both
4,עֵ֜דֶן,,,τρυφῆς,Eden,伊甸,-,-,-,-,...,Iden,Idin,-,Eden,-,-,Ê-Đen,Eden,-,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9171,Σάρδεσιν,Σάρδεις,Σάρδεσιν,,Sardis,,Sardis,Sadisɨ,Sardis,Sadis,...,Sardis,Sadis,Saredise,Sart,Сарди,سردیس,Sạt-Đe,Esardes,Lasesardesi,both
9172,Φιλαδελφίαν,Φιλαδέλφεια,Φιλαδελφίαν,,Philadelphia,,Piladelfia,Piradepia,Filadelfia,Piladelpia,...,Filadelfia,Filadɛlfia,Filatelefia,-,Лаодикию,فلدلفیہ,Phi-La-Đen-Phi,Efiladelfiya,Lasefiladelfiya,both
9173,Φιλαδελφίᾳ,Φιλαδέλφεια,Φιλαδελφίᾳ,,Philadelphia,,Piladelfia,Piradepia,Filadelfia,Piladelpia,...,Filadelfia,Filadɛlfia,Filatelefia,-,Лаодикию,فلدلفیہ,Phi-La-Đen-Phi,Efiladelfiya,Lasefiladelfiya,both
9174,Σμύρνῃ,Σμύρνα,Σμύρνῃ,,Smyrna,,Simena,Sɨmena,Smerna,Smena,...,Smerna,Sumana,Semurena,Ýzmir,Ефес,سمرنہ,Si-Miệc-Nơ,Esmirna,Nasesmirna,both


In [11]:
# Read in the All terms data from silnlp
sil_assets_path = Path('D:/GitHub/davidbaines/trabina/silnlp/assets')
all_terms_file = sil_assets_path / 'All-metadata.txt'

all_terms = pd.read_table(all_terms_file,header=None, usecols=[0]).squeeze("columns")
all_terms.rename('terms')
print(f"There are {len(all_terms.unique())} unique terms in the All terms dataset.")


There are 20583 unique terms in the All terms dataset.


In [34]:
assets_folder = Path('D:/GitHub/davidbaines/trabina/silnlp/assets')
patterns = ['All', 'Major', 'SilNt'] #'Pt6' doesn't have any glosses.

def read_assets_data(folder, pattern):
    # Function to read in the various PT metadata files. 
    # Each are read in differently.
    
    metadata_file = folder / f"{pattern}-metadata.txt"
    glosses_files = folder.glob(f"*-{pattern}-glosses.txt")
    vrefs_file = folder / f"{pattern}-vrefs.txt"
    print(f"Reading in {pattern} files.")
    
    # The assest folder contains files with pattern from ['Major', 'All', SilNt', 'Pt6']
    # Different sets have different data. 
    
    # Glosses exist for certain languages in separate files.
    # Not all files exist for all patterns. The 'Major' files are as follows:
    # en-Major-glosses.txt, en-Pt6-glosses.txt , en-SilNt-glosses.txt es-Major-glosses.txt fr-Major-glosses.txt, id-Major-glosses.txt Major-metadata.txt , Major-vrefs.txt
    
    # Reading in vrefs is the same for all patterns:
    vrefs = pd.read_csv(vrefs_file,  names=['vrefs'], converters={'vrefs': lambda x: x.split('\t')})     
    #vrefs = pd.read_csv(vrefs_file,header=None).squeeze("columns")
    #vrefs.rename({0: "vrefs"}, axis="columns", inplace=True)
    #vrefs = [vref for vref in vrefs.str.split('\t', expand=True)
    
    if pattern == 'All':
        # This dataset doesn't include sense numbers. Only the first column contains data.
        # The column contains (DC) and (AR) which need to be split off.
        #print(metadata_file)
        
        terms = pd.read_table(metadata_file,header=None, usecols=[0]).squeeze("columns")
        terms.rename('terms')
        
        terms = terms.str.split(' ', expand=True)
        
        terms.rename({0: "term", 1: "note"}, axis="columns", inplace=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
    
    if pattern == 'Major':
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        terms[['term', 'note']] = terms['term'].str.split(' ', 1, expand=True)

        terms[['term', 'sense']] = terms['term'].str.split('-', 1, expand=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
        
    if pattern == 'SilNt':
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        # The domain column is empty.
        terms.drop(columns=['domain'],inplace=True)
        
        
    isos  = list()
    for gloss_file in glosses_files:
        iso = gloss_file.name[:gloss_file.name.find("-")]
        terms[iso] = pd.read_table(gloss_file,header=None, usecols=[0]).squeeze("columns")
        terms[iso] = terms[iso].fillna('')
    #print(f"The glosses are:\n{terms}")

    return terms, vrefs


all_terms, all_vrefs = read_assets_data(assets_folder, 'All')
major_terms, major_vrefs = read_assets_data(assets_folder, 'Major')
silnt_terms, silnt_vrefs = read_assets_data(assets_folder, 'SilNt')

print(f"\nAll terms:\n{all_terms.nunique()}\n")
print(f"Major terms:\n{major_terms.nunique()}\n")
print(f"SilNt terms:\n{silnt_terms.nunique()}\n")
    
       
#print(f"{pattern} terms:\n{terms}\n")
print(f"Major vrefs:\n{major_vrefs}\n")
#print(f"There are {len(vrefs)} lists of verse references. ")
    

Reading in All files.
Reading in Major files.
Reading in SilNt files.

All terms:
term    17350
AR          2
DC          2
en      13195
dtype: int64

Major terms:
term        6083
domain         8
category     565
sense         18
AR             2
DC             2
en          5344
es          4929
fr          5463
id          3411
dtype: int64

SilNt terms:
term        1597
category     678
en          1427
dtype: int64

Major vrefs:
                              vrefs
0                        [EST 1:10]
1                        [JOB 9:26]
2     [JOB 39:9, PRO 14:4, ISA 1:3]
3                        [NUM 11:5]
4                        [2KI 18:2]
...                             ...
8643           [1ES 5:12, 1ES 9:30]
8644                     [1ES 5:22]
8645                      [JDT 8:1]
8646                     [1ES 9:23]
8647                     [1ES 8:47]

[8648 rows x 1 columns]



In [35]:
major_terms.to_csv(r"D:\GitHub\davidbaines\trabina\data\major_terms.txt", sep = '\t')

In [36]:
#How many of the 8648 major terms are in the All terms data?
major_terms['all_terms_exact'] = major_terms['term'].map(all_terms['term'].value_counts())
major_terms['all_terms_exact'] = major_terms['all_terms_exact'].fillna(0)
major_terms.sort_values('all_terms_exact',ascending=False)

Unnamed: 0,term,domain,category,sense,AR,DC,en,es,fr,id,all_terms_exact
6540,σκύβαλον,RE,artifacts,,False,False,Scythian,bueno,Stachys,,2.0
5364,γῆ,RE,nature,2,False,False,soil,derecho,Damaris,,2.0
7049,ἀσπίς,RE,warfare,,False,True,Shelomith,variar,Apherra,,2.0
1651,חֵלֶם,PN,person,,False,False,Helem,suplicar por un favor,Hèleç,Helef,2.0
5351,Γάζα,PN,settlement,,False,False,wedding hall,Hermes,Gog,,2.0
...,...,...,...,...,...,...,...,...,...,...,...
5698,θυμιαστήριον,RE,sacrifices and offerings,,False,False,incense,compañero,Idumée,,0.0
4825,שֻׁפִּים,PN,person,2,False,False,viper,Abías,Sheshbaçar,Saron,0.0
4824,שֻׁפִּים,PN,person,1,False,False,Shuppim,Abías,six,Lasaron,0.0
546,אֶפֶס דַמִּים,PN,settlement,,False,False,Ephes-Dammim,Éser,Efès-Dammim,Efes-Damim,0.0


In [37]:
# These major terms aren't found exactly in the All-metadata.
# major_terms.loc[major_terms['all_terms_exact'].isna()]
major_terms.loc[major_terms['all_terms_exact'] == 0]

Unnamed: 0,term,domain,category,sense,AR,DC,en,es,fr,id,all_terms_exact
67,אֶבֶן בֹּהַן בֶּן־רְאוּבֵן,PN,locale,,False,False,Stone of Bohan the son of Reuben,Eben-ézer,Bohân,Batu Bohan,0.0
106,אֲדָמִי,PN,settlement,,False,False,Adami Nekeb,Admata,Adami-Nèqev,Adami-Nekeb,0.0
110,אֲדֹנָי,BE,supernatural beings and powers; titles,,False,False,Lord,Adonisédec,Seigneur,Tuhan,0.0
182,אֶזְבַּי,PN,person,,False,False,Ezbai,Azanías,Ezbaï,Esbai,0.0
185,אֻזֵן,PN,settlement,,False,False,Uzzen-Sheerah,chacal,Ouzên-Shééra,Uzen-Seera,0.0
...,...,...,...,...,...,...,...,...,...,...,...
7235,Γαβρια,PN,person,,False,True,Gaddi,aceite,graver,,0.0
7554,ἤδυσμα,RE,artifacts; perfumes and spices,,False,True,Teman,casco,cassolette,,0.0
7586,θυΐσκη,RE,?,,False,True,bag,Coná,Adaiah,,0.0
7742,καρρον,RE,?,,False,True,Kaserin,Moabita,potier,,0.0


In [38]:
col = major_terms['all_terms_exact'] 
count = col[col != 0].count()
print(f"There are {count} major-metadata terms that appear exactly in the All metadata file.")
print(f"There are {len(major_terms) - count} major-metadata terms that don't appear exactly in the All metadata file.")


There are 8065 major-metadata terms that appear exactly in the All metadata file.
There are 583 major-metadata terms that don't appear exactly in the All metadata file.


In [39]:
def count_matches(reference_col, source_col):
    matches = source_col.map(reference_col.value_counts()).fillna(0).astype(int)
    # Return count of values greater than 0 
    return matches[matches > 0].count()

#This is very slow.
#def find_matches(reference_col, source_col):
#    matches = [source for source in source_col if source in reference_col.unique()]
#    return matches

# This is also slow.
def find_matches(reference_col, source_col):
    matches = source_col.map(reference_col.value_counts()).fillna(0).astype(int)
    # Return values greater than 0 
    return matches[matches > 0]

# This is almost instant.
def find_matches(reference_col, source_col):
    return set(source_col).intersection(set(reference_col))

def report_matches(ref_df, ref_columns, search_dict):
        
    for name, col in search_dict.items():
        all_matches = set()
        match_count = 0
        unique_values = col.unique()
        print(f"Searching for {len(unique_values)} terms from: '{name}'.")
        for ref_column in ref_columns:
            matches = find_matches(ref_df[ref_column],col)
            match_count += len(matches)
            print(f"There are {len(matches)} found in the '{ref_column}' column.")
            all_matches = all_matches.union(matches)

        print(f"{match_count} '{name}' matched of which {len(all_matches)} are unique.")
        #print(sorted(all_matches))
        print('\n')

In [40]:
# Count how many terms from All, Major and JHU (eng) occur in the Macula data.
# Macula Index(['Original unicode', 'Greek lemma', 'Greek normalized', 'Greek gloss', 'English gloss', 'Mandarin gloss']

terms_macula         = simple_df['Original unicode']
terms_macula_english = simple_df['English gloss']

terms_all     = all_terms['term']
terms_major   = major_terms['term']
terms_jhu_eng = jhu_df['eng']

#print(f"There are {count_matches(terms_macula,terms_all)}   all_terms out of {len(terms_all)} found in the Macula 'Original unicode' column.")
#print(f"There are {count_matches(terms_macula,terms_major)} major_terms out of {len(terms_major)} found in the Macula 'Original unicode' column.")
#print(f"There are {count_matches(terms_macula_english,terms_jhu_eng)} jhu_eng terms out of {len(terms_jhu_eng)} found in the Macula 'Original unicode' column.")

# How many of these Original language terms are found exactly as a key in other lists?
macula_search_columns = ['Original unicode', 'Hebrew Original', 'Aramaic Original', 'Greek Original', 'Greek lemma', 'Greek normalized', 'Greek gloss']
search = {'All terms':terms_all, 'Major terms':terms_major}
report_matches(macula_df, macula_search_columns, search)


Searching for 17350 terms from: 'All terms'.
There are 82 found in the 'Original unicode' column.
There are 43 found in the 'Hebrew Original' column.
There are 1 found in the 'Aramaic Original' column.
There are 39 found in the 'Greek Original' column.
There are 98 found in the 'Greek lemma' column.
There are 52 found in the 'Greek normalized' column.
There are 60 found in the 'Greek gloss' column.
375 'All terms' matched of which 209 are unique.


Searching for 6083 terms from: 'Major terms'.
There are 80 found in the 'Original unicode' column.
There are 41 found in the 'Hebrew Original' column.
There are 1 found in the 'Aramaic Original' column.
There are 39 found in the 'Greek Original' column.
There are 94 found in the 'Greek lemma' column.
There are 52 found in the 'Greek normalized' column.
There are 19 found in the 'Greek gloss' column.
326 'Major terms' matched of which 161 are unique.




In [41]:
macula_search_columns = ['English gloss']
search = {'JHU eng terms':terms_jhu_eng}
report_matches(macula_df, macula_search_columns, search)

macula_search_columns = ['Original unicode', 'Greek Original', 'Greek lemma', 'Greek normalized', 'Greek gloss']
search = {'JHU grc_accented_terms' : jhu_df['grc_accented'], 'JHU ell_helenic1 terms' : jhu_df['ell_hellenic1']}
report_matches(macula_df, macula_search_columns, search)

Searching for 1129 terms from: 'JHU eng terms'.
There are 894 found in the 'English gloss' column.
894 'JHU eng terms' matched of which 894 are unique.


Searching for 652 terms from: 'JHU grc_accented_terms'.
There are 6 found in the 'Original unicode' column.
There are 7 found in the 'Greek Original' column.
There are 4 found in the 'Greek lemma' column.
There are 9 found in the 'Greek normalized' column.
There are 1 found in the 'Greek gloss' column.
27 'JHU grc_accented_terms' matched of which 12 are unique.


Searching for 501 terms from: 'JHU ell_helenic1 terms'.
There are 201 found in the 'Original unicode' column.
There are 202 found in the 'Greek Original' column.
There are 152 found in the 'Greek lemma' column.
There are 283 found in the 'Greek normalized' column.
There are 1 found in the 'Greek gloss' column.
839 'JHU ell_helenic1 terms' matched of which 330 are unique.




In [22]:
def checker(wrong_options,correct_options):
    names_array=[]
    ratio_array=[]    
    for wrong_option in wrong_options:
        if wrong_option in correct_options:
            names_array.append(wrong_option)
            ratio_array.append('100')
        else:   
            x=process.extractOne(wrong_option,correct_options,scorer=fuzz.token_set_ratio)
            names_array.append(x[0])
            ratio_array.append(x[1])
    return names_array,ratio_array

In [23]:
def count_matches(reference_col, source_col):
    str2Match = source_col.fillna('').tolist()
    strOptions = reference_col.fillna('').tolist()
    
    matches = source_col.map(reference_col.value_counts()).fillna(0).astype(int)
    # Return count of values greater than 0 
    return matches[matches > 0].count()

In [24]:
major_matches = terms_major.map(terms_macula.value_counts()).fillna(0).astype(int)
# Return count of values greater than 0 
print(major_matches[major_matches > 0].count())
#print(major_matches)

#all_name_match,   all_ratio_match = checker(terms_macula,terms_all[0:100])
#major_name_match, major_ratio_match=checker(terms_macula,terms_major)

#terms_df['fuzzy_match']=pd.Series(all_name_match)
#terms_df['fuzzy_ratio']=pd.Series(all_ratio_match)
#print(all_name_match,all_ratio_match)

174


In [None]:
#print(len(all_name_match),len(all_ratio_match))
#all_fuzzy_matches = pd.DataFrame.from_dict({'Original unicode': terms_macula, 'all_metadata fuzzy match' : all_name_match, 'all_metadata fuzzy ratio' :all_ratio_match})
#all_fuzzy_matches

In [25]:
def get_terms_from_files(folder,filenames):
    
    all_names = dict()
    
    folder = Path(folder)
    files = [folder / filename for filename in filenames]
    #print([file.name[0:3] for file in files])
    
    for file in files:
        with open(file, 'r', encoding='utf-8') as fin:
            names = [name.strip('\n').title() for name in fin.readlines()]
            all_names[file.name] = names   
    
    return all_names

In [47]:
matches_es = set(jhu_df['spa_blph']).intersection(set(major_terms['es']))
print(f"There are {len(set(jhu_df['spa_blph']))} unique words in the JHU Spanish list.")
print(f"{len(matches_es)} of the words in the JHU Spanish list match those in the Major metadata.")

[word for i, word in enumerate(matches_es) if i <10]


There are 495 unique words in the JHU Spanish list.
387 of the words in the JHU Spanish list match those in the Major metadata.


['Jerusalén',
 'Fares',
 'Jope',
 'Candace',
 'Lino',
 'Aser',
 'Siloé',
 'Sur',
 'Zacarías',
 'Marcos']

In [48]:
matches = set(jhu_df['cmn_sf_ncv']).intersection(set(simple_df['Mandarin gloss']))

print(f"There are {len(set(jhu_df['cmn_sf_ncv']))} terms in the JHU CMN list and {len(set(simple_df['Mandarin gloss']))} Mandarin glosses in Macula.")
print(f"{len(matches)} match.")
print(matches)


There are 920 terms in the JHU CMN list and 2396 Mandarin glosses in Macula.
14 match.
{'', '塞特', '玛土撒拉', '雅列', '亚拿突人', '主', '说', '以撒', '拉麦', '以利', '撒迦利亚', '底波拉', '扫罗', '闪'}


### Checking names in translations.
Given a list of names from the Major Terms data, and the verse references for the names check that the names appear in the extract in the expected verses.


In [None]:
major_metadata