In [1]:
#!/usr/bin/env python3

from argparse import ArgumentParser
import csv
from collections import Counter
import json
import numpy as np
import os
import pandas as pd
import re
from pathlib import Path
import sys
#import urllib.request

In [2]:
def get_vrefs(silnlp_vref_file):    
    
    # Get the silnlp references to line numbers:
    with open(silnlp_vref_file, 'r', encoding='utf-8') as vrefs_file:
        vrefs_dict = {ref.strip('\n'): i+1 for i, ref in enumerate(vrefs_file.readlines())}
    
    vrefs = pd.DataFrame([vrefs_dict]).T
    vrefs.rename({0:'silnlp_line_number'}, axis='columns', inplace=True)
    return vrefs

In [3]:
assets_folder = Path("D:/GitHub/davidbaines/trabina/silnlp/assets")
silnlp_vref_file = assets_folder / "vref.txt"
silnlp_vrefs = get_vrefs(silnlp_vref_file)
silnlp_vrefs


Unnamed: 0,silnlp_line_number
GEN 1:1,1
GEN 1:2,2
GEN 1:3,3
GEN 1:4,4
GEN 1:5,5
...,...
ENO 42:12,41895
ENO 42:13,41896
ENO 42:14,41897
ENO 42:15,41898


In [4]:
def read_assets_data(folder, pattern):
    # Function to read in the various PT metadata files. 
    # Each are read in differently.
    
    metadata_file = folder / f"{pattern}-metadata.txt"
    glosses_files = folder.glob(f"*-{pattern}-glosses.txt")
    vrefs_file = folder / f"{pattern}-vrefs.txt"
    json_file = Path(f"D:/GitHub/davidbaines/trabina/data/{pattern}_terms.json")
    tsv_file  = Path(f"D:/GitHub/davidbaines/trabina/data/{pattern}_terms.tsv")
    assets_folder = Path("D:/GitHub/davidbaines/trabina/silnlp/assets")
    silnlp_vref_file = assets_folder / "vref.txt"
    silnlp_vrefs = get_vrefs(silnlp_vref_file)
    
    # The assest folder contains files with pattern from ['Major', 'All', SilNt', 'Pt6']
    # Different sets have different data. 
    
    # Glosses exist for certain languages in separate files.
    # Not all files exist for all patterns. The 'Major' files are as follows:
    # en-Major-glosses.txt, en-Pt6-glosses.txt , en-SilNt-glosses.txt es-Major-glosses.txt fr-Major-glosses.txt, id-Major-glosses.txt Major-metadata.txt , Major-vrefs.txt
    
    # Reading in vrefs is the same for all patterns:
    vrefs = pd.read_csv(vrefs_file,  names=['vrefs'], converters={'vrefs': lambda x: x.split('\t')})     
    
    if pattern == 'All':
        # This dataset doesn't include sense numbers. Only the first column contains data.
        # The column contains (DC) and (AR) which need to be split off.
                
        terms = pd.read_table(metadata_file,header=None, usecols=[0]).squeeze("columns")
        terms.rename('terms')
        
        terms = terms.str.split(' ', expand=True)
        
        terms.rename({0: "term", 1: "note"}, axis="columns", inplace=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
        
    
    if pattern == 'Major':
        # In this dataset all columns contain data.
        # The first column contains (DC) and (AR) and sense numbers which need to be split off
        
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        terms[['term', 'note']] = terms['term'].str.split(' ', 1, expand=True)

        terms[['term', 'sense']] = terms['term'].str.split('-', 1, expand=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
        
    if pattern == 'SilNt':
        # In this dataset the domain column is empty.
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        terms.drop(columns=['domain'],inplace=True)
        
    isos  = list()
    for gloss_file in glosses_files:
        iso = gloss_file.name[:gloss_file.name.find("-")]
        terms[iso] = pd.read_table(gloss_file,header=None, usecols=[0]).squeeze("columns")
        terms[iso] = terms[iso].fillna('')
    
    # Don't remove the one Major term missing an English gloss here.
    # Incase this is messing up the indexing.
    #if pattern == 'Major':
    #    terms.drop(terms[terms.en == ''].index, inplace=True)
    print(terms[terms.en == ''])

    # Add the verse references
    terms['vrefs'] = vrefs
    
    explode = False
    if explode:
        # Explode the dataset by vref  #To save filespace this could be done after reading instead.
        # Also it might be useful for scanning through the data not to explode.
        terms = terms.explode('vrefs',ignore_index=True)

        # Add the silnlp_line_numbers for each vref
        terms = pd.merge(terms, silnlp_vrefs, how='left', left_on='vrefs', right_index=True)
        terms.silnlp_line_number = terms.silnlp_line_number.fillna(0).astype(int)        

    terms.to_json(json_file, orient='records')
    #terms.to_csv(tsv_file, sep = '\t')
    
    return terms

In [5]:
major_terms = read_assets_data(assets_folder, 'Major')
major_terms

        term domain category sense     AR    DC en es fr id
8647  Ὡσαίας     PN   person  None  False  True            


Unnamed: 0,term,domain,category,sense,AR,DC,en,es,fr,id,vrefs
0,אֲבַגְתָא,PN,person,,False,False,Abagtha,Abagtá,Avagta,Abagta,[EST 1:10]
1,אֵבֶה,FL,grasses,,False,False,papyrus,papiro,papyrus,pandan,[JOB 9:26]
2,אֵבוּס,RE,containers; animal husbandry,,False,False,manger,pesebre,mangeoire,palungan,"[JOB 39:9, PRO 14:4, ISA 1:3]"
3,אֲבַטִּיחַ,FL,fruits,,False,False,melon,melón,melon,semangka,[NUM 11:5]
4,אֲבִי,PN,person,,False,False,Abi,Abí,Avi,Abi,[2KI 18:2]
...,...,...,...,...,...,...,...,...,...,...,...
8643,Ωλαμος,PN,person,,False,True,Ono,,,,"[1ES 5:12, 1ES 9:30]"
8644,Ωνους,PN,person,,False,True,Ox,,,,[1ES 5:22]
8645,Ωξ,PN,person,,False,True,Judah,,,,[JDT 8:1]
8646,Ωουδας,PN,person,,False,True,Jeshaiah,,,,[1ES 9:23]


In [6]:
all_terms  = read_assets_data(assets_folder, 'All')
all_terms

            term     AR    DC en
20475   χορτώδης  False  True   
20476  Χοσαμαιος  False  True   
20477       Χους  False  True   
20478       χοῦς  False  True   
20479      χόλος  False  True   
...          ...    ...   ... ..
20578        ὥρα  False  True   
20579         ὥς  False  True   
20580        ὧδε  False  True   
20581       ὦμος  False  True   
20582        ᾠδή  False  True   

[108 rows x 4 columns]


Unnamed: 0,term,AR,DC,en,vrefs
0,אֵב,False,False,bud,"[JOB 8:12, SNG 6:11]"
1,אֲבַגְתָא,False,False,Abagtha,[EST 1:10]
2,אבד,False,False,perish,"[EXO 10:7, LEV 23:30, LEV 26:38, NUM 16:33, NU..."
3,אֹבֵד,False,False,destruction,"[NUM 24:20, NUM 24:24]"
4,אֲבַדֹּה,False,False,destruction,[PRO 27:20]
...,...,...,...,...,...
20578,ὥρα,False,True,,"[JDT 13:4, WIS 10:7, SIR 11:22, SIR 11:27, SIR..."
20579,ὥς,False,True,,[3MA 1:12]
20580,ὧδε,False,True,,"[TOB 5:5, 2MA 1:6]"
20581,ὦμος,False,True,,"[SIR 6:25, BAR 2:21, LJE 1:3, LJE 1:25, 2MA 12..."


In [7]:
silnt_terms = read_assets_data(assets_folder, 'SilNt')
silnt_terms

Empty DataFrame
Columns: [term, category, en]
Index: []


Unnamed: 0,term,category,en,vrefs
0,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,"[ACT 7:22, ACT 7:24, ACT 7:28, ACT 21:38, HEB ..."
1,Αἰθίοψ,9.7.2.1 - Names of countries,Ethiopian,"[ACT 8:27, ACT 8:27]"
2,Αἰνέας,9.7.1.1 - Personal names,Aeneas,"[ACT 9:33, ACT 9:34]"
3,Αἰνών,9.7.2.2 - Names of regions,Aenon,[JHN 3:23]
4,Αἴγυπτος,9.7.2.1 - Names of countries,Egypt,"[MAT 2:13, MAT 2:14, MAT 2:15, MAT 2:19, ACT 2..."
...,...,...,...,...
1592,Ῥοῦφος,9.7.1.1 - Personal names,Rufus,"[MRK 15:21, ROM 16:13]"
1593,Ῥωμαῖος,9.7.2.3 - Names of cities,Roman,"[JHN 11:48, ACT 2:10, ACT 16:21, ACT 16:37, AC..."
1594,Ῥόδη,9.7.1.1 - Personal names,Rhoda,[ACT 12:13]
1595,Ῥόδος,9.7.2.2 - Names of regions,Rhodes,[ACT 21:1]


In [8]:
print(len(all_terms.en.unique() ))
print(len(major_terms.en.unique() ))

major_in_all = major_terms[major_terms.en.isin(all_terms.en)].en.unique()

major_in_all
print(f"{len(major_in_all)} of the {len(major_terms.en.unique())} Major Terms English names also occur in the {len(all_terms.en.unique())} All Terms data")


13195
5344
3839 of the 5344 Major Terms English names also occur in the 13195 All Terms data
