# Read the macula names data from the Greek and Hebrew TSV files, add columns, and write the result to a JSON file.

In [1]:
#!/usr/bin/env python3
import csv
import json
import numpy as np
import pandas as pd
import re
from pathlib import Path


In [2]:
# Folder holding the TSV inputs; the JSON output is written here as well.
data_folder = Path("D:/GitHub/davidbaines/trabina/data")

# Hebrew names table and the labels for its columns (column position -> label).
hebrew_refs_and_names = data_folder.joinpath("hebrew_refs_and_names.tsv")
hebrew_cols = {
    0: 'ref',
    1: 'Hebrew source',
    2: 'English gloss of Hebrew',
    3: 'Greek gloss of Hebrew',
    4: 'Mandarin gloss of Hebrew',
}

# Greek names table and the labels for its columns (column position -> label).
greek_refs_and_names = data_folder.joinpath("greek_refs_and_names.tsv")
greek_cols = {
    0: 'ref',
    1: 'Greek source',
    2: 'Greek lemma',
    3: 'Greek normalized',
    4: 'English gloss of Greek',
}

# silnlp verse-reference list; a reference's line number is its position in this file.
silnlp_assets_folder = Path("D:/GitHub/davidbaines/trabina/silnlp/assets")
silnlp_vref_file = silnlp_assets_folder.joinpath("vref.txt")

# Destination of the combined table.
macula_as_json_output_file = data_folder.joinpath("macula.json")

In [3]:
def read_tsv(file, column_names):
    """Load a header-less, tab-separated file as a DataFrame of strings.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``pandas.read_table``.
    column_names : dict
        Mapping of positional column number to the label it should get;
        passed to ``DataFrame.rename`` along the column axis.

    Returns
    -------
    pandas.DataFrame
        Every cell is read as text. Cell text is kept verbatim, and any
        value that is still missing after parsing is replaced by ''.
    """
    df = pd.read_table(
        file,
        header=None,
        dtype=str,
        sep='\t',
        # The cells are names and glosses, not NA markers. With pandas'
        # default NA list, literal text such as "NA", "null" or "None" is
        # parsed as NaN and would then be blanked by the fillna below.
        keep_default_na=False,
    )
    # Anything pandas still reports as missing becomes an empty string so the
    # later string concatenation of columns works.
    df.fillna('', inplace=True)
    df.rename(column_names, axis="columns", inplace=True)
    return df

In [4]:
def move_column(df, column_name, column_index):
    """Move a column of ``df`` to position ``column_index``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated, not copied.
    column_name : hashable
        Label of the column to move.
    column_index : int
        Target position, counted after the column has been removed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be chained or assigned.
        (``DataFrame.insert`` itself returns None, so returning its result
        handed callers None.)
    """
    col = df.pop(column_name)
    df.insert(column_index, col.name, col)
    return df

In [5]:
def get_vrefs(silnlp_vref_file):
    """Map each line of the silnlp vref file to its 1-based line number.

    Returns a DataFrame indexed by the text of each line (newline removed)
    with a single column, 'silnlp_line_number'. Example lookup:

        vrefs.loc['ENO 1:2']['silnlp_line_number']
    """
    line_numbers = {}
    with open(silnlp_vref_file, 'r', encoding='utf-8') as handle:
        for position, line in enumerate(handle, start=1):
            line_numbers[line.strip('\n')] = position

    # Build one wide row (a column per reference) and transpose it, so the
    # references end up as the index and the numbers in column 0.
    table = pd.DataFrame([line_numbers]).T
    return table.rename({0: 'silnlp_line_number'}, axis='columns')

In [6]:
# Load the reference -> line-number table and the two names tables.
vrefs = get_vrefs(silnlp_vref_file=silnlp_vref_file)
hebrew = read_tsv(file=hebrew_refs_and_names, column_names=hebrew_cols)
greek = read_tsv(file=greek_refs_and_names, column_names=greek_cols)
# Bare expression: the notebook renders the table as the cell output.
vrefs

Unnamed: 0,silnlp_line_number
GEN 1:1,1
GEN 1:2,2
GEN 1:3,3
GEN 1:4,4
GEN 1:5,5
...,...
ENO 42:12,41895
ENO 42:13,41896
ENO 42:14,41897
ENO 42:15,41898


In [7]:
#hebrew

In [8]:
#len(hebrew['ref'].unique())

In [9]:
#hebrew.loc[hebrew['Hebrew source'] == 'יְהוָ֧ה']
#any(hebrew['ref'].duplicated()) 

In [10]:
#greek

In [11]:
# Stack the Hebrew rows and the Greek rows into a single table.
macula = pd.concat([hebrew, greek], sort=False)

# Keep the full reference as an ordinary column ('refs') and also use it as the index ('ref').
macula['refs'] = macula['ref']
macula = macula.set_index('ref')

# Columns that exist for only one language are NaN on the other language's rows; blank them.
macula.fillna('', inplace=True)

# A row has text in only one language's columns (the other side is ''),
# so concatenating the pair yields whichever value is present.
for merged, hebrew_col, greek_col in (
    ('Source', 'Hebrew source', 'Greek source'),
    ('English gloss', 'English gloss of Hebrew', 'English gloss of Greek'),
):
    macula[merged] = macula[hebrew_col] + macula[greek_col]

# Bring the key columns to the front; applied in this order the table starts
# with 'refs', 'Source', 'English gloss'.
for label, position in (('Source', 0), ('English gloss', 1), ('refs', 0)):
    move_column(macula, label, position)

# Split references of the form 'BBB c:v!w' into book, chapter, verse and word number.
ref_parts = macula['refs'].str.extract('^(?P<book>[A-Z]{3}) (?P<chapter_no>[0-9]{1,3}):(?P<verse_no>[0-9]{1,3})!(?P<word_no>[0-9]{1,3}$)')
macula[['book', 'chapter_no', 'verse_no', 'word_no']] = ref_parts

In [12]:
# Verse-level reference: everything before the '!' that introduces the word number.
macula['ref_only'] = macula['refs'].str.split('!').str.get(0)

# Look up each verse reference in the vrefs index to attach 'silnlp_line_number'
# (left join, so rows without a match are kept), then let pandas pick the best dtypes.
#print(macula.dtypes)
macula = macula.merge(
    vrefs,
    how='left',
    left_on='ref_only',
    right_index=True,
).convert_dtypes()
#print(macula.dtypes)


In [13]:
# This indicates the number of rows that are missing an English gloss.
#macula[macula['English gloss']=='']['English gloss'].count()

# To see the rows:
#macula[macula['English gloss']=='']

# Remove these rows.
# Filter with a boolean mask instead of drop(<index labels>): the index holds
# the 'ref' labels, and dropping by label removes every row that shares a
# label with an empty-gloss row, including rows that do have a gloss.
# A missing comparison result (NA) counts as "has a gloss", as it did before.
has_no_gloss = (macula['English gloss'] == '').fillna(False)
# .copy() makes the result an independent frame for the in-place edits that follow.
macula = macula[~has_no_gloss].copy()

In [14]:
# The Deutero Canon is out of scope: discard every row whose silnlp line
# number is above 31170. Rows without a line number are not selected and stay.
deutero_rows = macula.loc[macula['silnlp_line_number'] > 31170]
macula.drop(index=deutero_rows.index, inplace=True)

In [15]:
# Write the table to the output file as JSON, one record (object) per row.
macula.to_json(path_or_buf=macula_as_json_output_file, orient='records')