# Read the Macula names data from the Greek and Hebrew TSV files, add columns, and write the result to a JSON file.

In [1]:
#!/usr/bin/env python3
import csv
import json
import numpy as np
import pandas as pd
import re
from pathlib import Path


In [2]:
# Folder holding the two tab-separated names tables.
data_folder = Path("D:/GitHub/davidbaines/trabina/data")

# Hebrew names table, plus the labels given to its positional columns.
hebrew_refs_and_names = data_folder.joinpath("hebrew_refs_and_names.tsv")
hebrew_cols = dict(enumerate([
    'ref',
    'Hebrew source',
    'English gloss of Hebrew',
    'Greek gloss of Hebrew',
    'Mandarin gloss of Hebrew',
]))

# Greek names table, plus the labels given to its positional columns.
greek_refs_and_names = data_folder.joinpath("greek_refs_and_names.tsv")
greek_cols = dict(enumerate([
    'ref',
    'Greek source',
    'Greek lemma',
    'Greek normalized',
    'English gloss of Greek',
]))

# Reference list whose line order defines the silnlp line numbers.
silnlp_assets_folder = Path("D:/GitHub/davidbaines/trabina/silnlp/assets")
silnlp_vref_file = silnlp_assets_folder.joinpath("vref.txt")

# Destination of the combined table.
macula_as_json_output_file = Path("D:/GitHub/davidbaines/trabina").joinpath("macula.json")

In [3]:
def read_tsv(file, column_names):
    """Load a header-less, tab-separated file as an all-string DataFrame.

    Parameters
    ----------
    file : path-like or file-like
        The TSV file to read. It must not contain a header row.
    column_names : dict
        Mapping from positional column index (0, 1, ...) to the label that
        column should receive.

    Returns
    -------
    pandas.DataFrame
        One row per input line, every column of dtype ``str``. Cells with no
        value are the empty string, never NaN.
    """
    df = pd.read_table(
        file,
        header=None,
        dtype=str,
        sep='\t',
        # Without this, pandas treats cell text such as "NA", "None", "null"
        # or "nan" as missing data, and the fillna('') below would then wipe
        # out what is really a legitimate name or gloss.
        keep_default_na=False,
    )
    # fillna is kept as a safety net for any cell pandas still reports as NaN.
    return df.fillna('').rename(columns=column_names)

In [4]:
def move_column(df, column_name, column_index):
    """Move one column of ``df`` to a new position, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reorder; it is mutated.
    column_name : label
        Name of the column to move.
    column_index : int
        Zero-based position the column should occupy afterwards.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so calls can be chained.
    """
    col = df.pop(column_name)
    # DataFrame.insert works in place and returns None, so returning its
    # result (as this function used to) always handed None to the caller.
    df.insert(column_index, col.name, col)
    return df

In [5]:
def get_vrefs(silnlp_vref_file):
    '''Build a lookup table from reference string to 1-based line number.

    Each line of the file is one reference. The returned DataFrame is indexed
    by the reference text and has a single column, 'silnlp_line_number'.
    If a reference occurs more than once, the last line number wins.
    '''
    with open(silnlp_vref_file, 'r', encoding='utf-8') as handle:
        line_numbers = {
            line.rstrip('\n'): number
            for number, line in enumerate(handle, start=1)
        }

    # A one-row frame keyed by reference, flipped so that references become the index.
    lookup = pd.DataFrame([line_numbers]).T

    # Look a reference up with, e.g.:
    #print(vrefs.loc['ENO 1:2']['silnlp_line_number'])
    return lookup.rename(columns={0: 'silnlp_line_number'})

In [6]:
# Load the two names tables, then the reference -> line-number lookup.
hebrew = read_tsv(file=hebrew_refs_and_names, column_names=hebrew_cols)
greek = read_tsv(file=greek_refs_and_names, column_names=greek_cols)
vrefs = get_vrefs(silnlp_vref_file=silnlp_vref_file)
#vrefs

In [7]:
#hebrew

In [8]:
#len(hebrew['ref'].unique())

In [9]:
#hebrew.loc[hebrew['Hebrew source'] == 'יְהוָ֧ה']
#any(hebrew['ref'].duplicated()) 

In [10]:
#greek

In [11]:
# Combine the Hebrew and Greek dataframes.
macula = pd.concat([hebrew,greek], sort=False)
macula['refs'] = macula['ref']
macula.set_index(['ref'], inplace = True)

macula = macula.fillna('')
macula['Source'] = macula['Hebrew source'] + macula['Greek source']
macula['English gloss'] = macula['English gloss of Hebrew'] + macula['English gloss of Greek']

move_column(macula,'Source',0)
move_column(macula,'English gloss',1)
move_column(macula,'refs',0)

macula[['book', 'chapter_no', 'verse_no', 'word_no']] = macula.refs.str.extract('^(?P<book>[A-Z]{3}) (?P<chapter_no>[0-9]{1,3}):(?P<verse_no>[0-9]{1,3})!(?P<word_no>[0-9]{1,3}$)')

In [12]:
# Reference without the word number: the text before the '!' separator.
macula['ref_only'] = macula['refs'].str.split('!', n=1).str.get(0)

# Left-join the line-number lookup, matching 'ref_only' against the vrefs index.
macula = macula.merge(vrefs, how='left', left_on='ref_only', right_index=True)

# References that are not in vrefs have no line number; record them as 0.
macula['silnlp_line_number'] = macula['silnlp_line_number'].fillna(0).astype(int)
macula


Unnamed: 0_level_0,refs,Source,English gloss,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew,Greek source,Greek lemma,Greek normalized,English gloss of Greek,book,chapter_no,verse_no,word_no,ref_only,silnlp_line_number
ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
GEN 2:4!8,GEN 2:4!8,יְהוָ֥ה,LORD,יְהוָ֥ה,LORD,,耶和华,,,,,GEN,2,4,8,GEN 2:4,35
GEN 2:5!15,GEN 2:5!15,יְהוָ֤ה,LORD,יְהוָ֤ה,LORD,,耶和华,,,,,GEN,2,5,15,GEN 2:5,36
GEN 2:7!2,GEN 2:7!2,יְהוָ֨ה,LORD,יְהוָ֨ה,LORD,,耶和华,,,,,GEN,2,7,2,GEN 2:7,38
GEN 2:8!2,GEN 2:8!2,יְהוָ֧ה,LORD,יְהוָ֧ה,LORD,κύριος,耶和华,,,,,GEN,2,8,2,GEN 2:8,39
GEN 2:8!5,GEN 2:8!5,עֵ֖דֶן,Eden,עֵ֖דֶן,Eden,εδεμ,伊甸,,,,,GEN,2,8,5,GEN 2:8,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
REV 22:13!6,REV 22:13!6,"Ὦ,",Omega,,,,,"Ὦ,",Ὦ,Ὦ,Omega,REV,22,13,6,REV 22:13,31162
REV 22:16!2,REV 22:16!2,Ἰησοῦς,Jesus,,,,,Ἰησοῦς,Ἰησοῦς,Ἰησοῦς,Jesus,REV,22,16,2,REV 22:16,31165
REV 22:16!20,REV 22:16!20,"Δαυείδ,",of David,,,,,"Δαυείδ,",Δαυίδ,Δαυείδ,of David,REV,22,16,20,REV 22:16,31165
REV 22:20!11,REV 22:20!11,Ἰησοῦ.,Jesus,,,,,Ἰησοῦ.,Ἰησοῦς,Ἰησοῦ,Jesus,REV,22,20,11,REV 22:20,31169


In [21]:
# Number of rows whose combined English gloss is empty.
macula.loc[macula['English gloss'].eq(''), 'English gloss'].count()


198

In [22]:
# Show the rows whose combined English gloss is empty.
macula.loc[macula['English gloss'].eq('')]

Unnamed: 0_level_0,refs,Source,English gloss,Hebrew source,English gloss of Hebrew,Greek gloss of Hebrew,Mandarin gloss of Hebrew,Greek source,Greek lemma,Greek normalized,English gloss of Greek,book,chapter_no,verse_no,word_no,ref_only,silnlp_line_number
ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
GEN 21:14!20,GEN 21:14!20,בְּאֵ֥ר שָֽׁבַע,,בְּאֵ֥ר שָֽׁבַע,,,,,,,,GEN,21,14,20,GEN 21:14,528
GEN 21:31!6,GEN 21:31!6,בְּאֵ֣ר שָׁ֑בַע,,בְּאֵ֣ר שָׁ֑בַע,,,,,,,,GEN,21,31,6,GEN 21:31,545
GEN 21:32!3,GEN 21:32!3,בְאֵ֣ר שָׁ֑בַע,,בְאֵ֣ר שָׁ֑בַע,,,,,,,,GEN,21,32,3,GEN 21:32,546
GEN 21:33!3,GEN 21:33!3,בְאֵ֣ר שָׁ֑בַע,,בְאֵ֣ר שָׁ֑בַע,,,,,,,,GEN,21,33,3,GEN 21:33,547
GEN 22:19!9,GEN 22:19!9,בְּאֵ֣ר שָׁ֑בַע,,בְּאֵ֣ר שָׁ֑בַע,,,,,,,,GEN,22,19,9,GEN 22:19,567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MRK 16:99!8,MRK 16:99!8,Πέτρον,,,,,,Πέτρον,Πέτρος,Πέτρον,,MRK,16,99,8,MRK 16:99,0
MRK 16:99!17,MRK 16:99!17,Ἰησοῦς,,,,,,Ἰησοῦς,Ἰησοῦς,Ἰησοῦς,,MRK,16,99,17,MRK 16:99,0
GAL 2:19!10,GAL 2:19!10,Χριστῷ,,,,,,Χριστῷ,Χριστός,Χριστῷ,,GAL,2,19,10,GAL 2:19,29167
REV 20:4!20,REV 20:4!20,Ἰησοῦ,,,,,,Ἰησοῦ,Ἰησοῦς,Ἰησοῦ,,REV,20,4,20,REV 20:4,31111


In [23]:
# Remove the rows that have no English gloss.
# Filter by row rather than dropping by index label: DataFrame.drop removes
# every row carrying a given label, so if two rows ever share a 'ref' it
# would also delete the one that does have a gloss.
macula = macula.loc[macula['English gloss'] != ''].copy()

In [13]:
# Write the combined table as a JSON array with one object per row.
macula.to_json(path_or_buf=macula_as_json_output_file, orient='records')