# Read the macula names data from the greek and hebrew tsv files, add columns and write to a json file. 

In [1]:
#!/usr/bin/env python3
import csv
import json
import numpy as np
import pandas as pd
import re
from pathlib import Path


In [2]:
data_folder = Path("D:/GitHub/davidbaines/trabina/data")

hebrew_refs_and_names = data_folder / "hebrew_refs_and_names.tsv"
hebrew_cols = {0: 'ref', 1: 'Hebrew source', 2: 'English gloss of Hebrew', 3:'Greek gloss of Hebrew', 4:'Mandarin gloss of Hebrew'}

greek_refs_and_names = data_folder / "greek_refs_and_names.tsv"
greek_cols = {0:'ref', 1:'Greek source', 2:'Greek lemma', 3:'Greek normalized', 4:'English gloss of Greek'}

silnlp_assets_folder = Path("D:/GitHub/davidbaines/trabina/silnlp/assets")
silnlp_vref_file = silnlp_assets_folder / "vref.txt"

macula_as_json_output_file = data_folder / "macula.json"
macula_ot_json_output_file = data_folder / "macula_OT.json"
macula_nt_json_output_file = data_folder / "macula_NT.json"

In [3]:
def read_tsv(file,column_names):
    df = pd.read_table(file, header=None, dtype=str, sep='\t')
    df.fillna('', inplace=True)
    df.rename(column_names, axis="columns", inplace=True)
    return df

In [4]:
def move_column(df, column_name, column_index):
    col = df.pop(column_name)
    return df.insert(column_index, col.name, col)

In [5]:
def get_vrefs(silnlp_vref_file):    
    ''' Get the silnlp references to line numbers:'''
    with open(silnlp_vref_file, 'r', encoding='utf-8') as vrefs_file:
        vrefs_dict = {ref.strip('\n'): i+1 for i, ref in enumerate(vrefs_file.readlines())}
    
    vrefs = pd.DataFrame([vrefs_dict]).T
    vrefs.rename({0:'silnlp_line_number'}, axis='columns', inplace=True)

    # To convert reference to line number get the 1st (index 0) element of the vrefs for that reference. E.g.:
    #print(vrefs.loc['ENO 1:2']['silnlp_line_number'])
    return vrefs

In [6]:
def split_ref(df):
    df[['book', 'chapter_no', 'verse_no', 'word_no']] = df.ref.str.extract('^(?P<book>[A-Z0-9]{3}) (?P<chapter_no>[0-9]{1,3}):(?P<verse_no>[0-9]{1,3})!(?P<word_no>[0-9]{1,3}$)')
    
    return df

In [7]:
def add_silnlp_line_numbers(df, vrefs):
    df['ref_only'] = df.ref.str.split('!').str[0]
    df = pd.merge(df, vrefs, how='left', left_on='ref_only', right_index=True)
    return df.convert_dtypes()


In [8]:
def save_as_json(df,file):
    df.to_json(file, orient='records', indent=4)
    

In [9]:
vrefs = get_vrefs(silnlp_vref_file)
hebrew = read_tsv(hebrew_refs_and_names, hebrew_cols)
greek = read_tsv(greek_refs_and_names, greek_cols)
vrefs

Unnamed: 0,silnlp_line_number
GEN 1:1,1
GEN 1:2,2
GEN 1:3,3
GEN 1:4,4
GEN 1:5,5
...,...
ENO 42:12,41895
ENO 42:13,41896
ENO 42:14,41897
ENO 42:15,41898


In [10]:
hebrew = split_ref(hebrew)
hebrew = add_silnlp_line_numbers(hebrew,vrefs)
save_as_json(hebrew,macula_ot_json_output_file)
print(f"Saved Hebrew data to {macula_ot_json_output_file}")
hebrew[hebrew.book.str.match("\d")]

Saved Hebrew data to D:\GitHub\davidbaines\trabina\data\macula_OT.json


Unnamed: 0,ref,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew,book,chapter_no,verse_no,word_no,ref_only,silnlp_line_number
11530,1SA 1:1!6,צוֹפִ֖ים,Zuphite,σιφα,琐非,1SA,1,1,6,1SA 1:1,7215
11531,1SA 1:1!8,אֶפְרָ֑יִם,Ephraim,εφραιμ,以法莲,1SA,1,1,8,1SA 1:1,7215
11532,1SA 1:1!10,אֶ֠לְקָנָה,Elkanah,ελκανα,以利加拿,1SA,1,1,10,1SA 1:1,7215
11533,1SA 1:1!12,יְרֹחָ֧ם,Jeroham,ιερεμεηλ,耶罗罕,1SA,1,1,12,1SA 1:1,7215
11534,1SA 1:1!14,אֱלִיה֛וּא,Elihu,ηλιου,以利户,1SA,1,1,14,1SA 1:1,7215
...,...,...,...,...,...,...,...,...,...,...,...
23985,2CH 36:23!5,פָּרַ֗ס,Persia,περσῶν,波斯,2CH,36,23,5,2CH 36:23,12021
23986,2CH 36:23!11,יְהוָה֙,LORD,κύριος,耶和华,2CH,36,23,11,2CH 36:23,12021
23987,2CH 36:23!20,ירוּשָׁלִַ֖ם,Jerusalem,ιερουσαλημ,耶路撒冷,2CH,36,23,20,2CH 36:23,12021
23988,2CH 36:23!22,יהוּדָ֑ה,Judah,,犹大,2CH,36,23,22,2CH 36:23,12021


In [11]:
len(hebrew['ref'].unique())

34187

In [12]:
hebrew.loc[hebrew['Hebrew source'] == 'יְהוָ֧ה']
any(hebrew['ref'].duplicated()) 

False

In [13]:
greek = split_ref(greek)
greek = add_silnlp_line_numbers(greek, vrefs)
save_as_json(greek,macula_nt_json_output_file)
print(f"Saved Greek data to {macula_nt_json_output_file}")
greek[greek.book.str.match("\d")]

Saved Greek data to D:\GitHub\davidbaines\trabina\data\macula_NT.json


Unnamed: 0,ref,Greek source,Greek lemma,Greek normalized,English gloss of Greek,book,chapter_no,verse_no,word_no,ref_only,silnlp_line_number
3502,1CO 1:1!1,Παῦλος,Παῦλος,Παῦλος,Paul,1CO,1,1,1,1CO 1:1,28432
3503,1CO 1:1!4,Χριστοῦ,Χριστός,Χριστοῦ,of Christ,1CO,1,1,4,1CO 1:1,28432
3504,1CO 1:1!5,Ἰησοῦ,Ἰησοῦς,Ἰησοῦ,Jesus,1CO,1,1,5,1CO 1:1,28432
3505,1CO 1:1!10,Σωσθένης,Σωσθένης,Σωσθένης,Sosthenes,1CO,1,1,10,1CO 1:1,28432
3506,1CO 1:2!8,"Κορίνθῳ,",Κόρινθος,Κορίνθῳ,Corinth,1CO,1,2,8,1CO 1:2,28433
...,...,...,...,...,...,...,...,...,...,...,...
4501,2JN 1:7!12,Χριστὸν,Χριστός,Χριστόν,Christ,2JN,1,7,12,2JN 1:7,30719
4502,2JN 1:9!11,Χριστοῦ,Χριστός,Χριστοῦ,of Christ,2JN,1,9,11,2JN 1:9,30721
4503,3JN 1:1!3,Γαΐῳ,Γάϊος,Γαΐῳ,To Gaius,3JN,1,1,3,3JN 1:1,30726
4504,3JN 1:9!9,Διοτρεφὴς,Διοτρέφης,Διοτρεφής,Diotrephes,3JN,1,9,9,3JN 1:9,30734


In [14]:
# Not really interested in the Deutero Canon, drop those.
macula.drop(macula[macula.silnlp_line_number > 31170].index, inplace=True)

NameError: name 'macula' is not defined