In [1]:
#!/usr/bin/env python3

from argparse import ArgumentParser
import csv
from collections import Counter
import json
import numpy as np
import os
import pandas as pd
import re
from pathlib import Path
import sys
#import urllib.request

In [2]:
def get_vrefs(silnlp_vref_file):
    """Map every verse reference in the silnlp vref file to its line number.

    Parameters
    ----------
    silnlp_vref_file : str or path-like
        UTF-8 text file whose lines are treated as verse references.

    Returns
    -------
    pandas.DataFrame
        Indexed by verse reference, with one integer column
        ``silnlp_line_number`` holding the 1-based line number.  When the
        same reference appears on several lines, the last line wins.
    """
    # Iterate over the file object directly instead of materialising every
    # line with readlines().
    with open(silnlp_vref_file, 'r', encoding='utf-8') as vrefs_file:
        line_numbers = {
            ref.strip('\n'): number
            for number, ref in enumerate(vrefs_file, start=1)
        }

    # Build the single column straight from the mapping.  The previous
    # approach created a one-row frame with one column per reference and
    # transposed it, which is needlessly expensive for long files.
    # The explicit dtype keeps the column integer even for an empty file.
    return pd.Series(
        line_numbers, name='silnlp_line_number', dtype='int64'
    ).to_frame()

In [3]:
def _split_once(values, sep):
    """Split each string at the first ``sep`` into exactly two columns, 0 and 1."""
    # ``n`` must be passed by keyword (it is keyword-only in pandas 2.x).
    # reindex guarantees that column 1 exists (all missing) even when no
    # value contains ``sep`` at all.
    return values.str.split(sep, n=1, expand=True).reindex(columns=[0, 1])


def read_assets_data(folder, pattern, output_folder=None):
    """Read one set of PT key-term asset files into a single DataFrame.

    The asset folder holds line-aligned text files per dataset:
    ``{pattern}-metadata.txt``, ``{pattern}-vrefs.txt`` and one
    ``{iso}-{pattern}-glosses.txt`` per gloss language.  Line N of every
    file is combined into row N, so the files are read with
    ``skip_blank_lines=False``: pandas drops blank lines by default, which
    would shift every later row out of alignment with the other files.

    Parameters
    ----------
    folder : pathlib.Path
        Folder containing the asset files.
    pattern : str
        Dataset name.  'All', 'Major' and 'SilNt' are supported; each has
        a different metadata layout.
    output_folder : path-like, optional
        Folder that receives ``{pattern}_terms.json``.  Defaults to the
        location this function has always written to.

    Returns
    -------
    pandas.DataFrame
        One row per (term, verse reference) pair.  Columns are the parsed
        metadata columns, one column per gloss language, and ``vrefs``.

    Raises
    ------
    ValueError
        If ``pattern`` is not one of the supported dataset names.

    Side effects
    ------------
    Writes the returned frame as JSON records to
    ``output_folder / f"{pattern}_terms.json"``.
    """
    if output_folder is None:
        output_folder = Path("D:/GitHub/davidbaines/trabina/data")

    metadata_file = folder / f"{pattern}-metadata.txt"
    # Sorted so the gloss columns always come out in the same order.
    glosses_files = sorted(folder.glob(f"*-{pattern}-glosses.txt"))
    vrefs_file = folder / f"{pattern}-vrefs.txt"
    json_file = Path(output_folder) / f"{pattern}_terms.json"

    # The assets folder contains files with pattern from ['Major', 'All', 'SilNt', 'Pt6'].
    # Different sets have different data.

    # Glosses exist for certain languages in separate files.
    # Not all files exist for all patterns. The 'Major' files are as follows:
    # en-Major-glosses.txt, es-Major-glosses.txt, fr-Major-glosses.txt,
    # id-Major-glosses.txt, Major-metadata.txt, Major-vrefs.txt

    # Reading in vrefs is the same for all patterns: each line is a
    # tab-separated list of verse references for the term on that line.
    vrefs = pd.read_csv(
        vrefs_file,
        names=['vrefs'],
        converters={'vrefs': lambda x: x.split('\t')},
        skip_blank_lines=False,
    )

    if pattern == 'All':
        # This dataset doesn't include sense numbers. Only the first column contains data.
        # The column contains (DC) and (AR) which need to be split off.
        first_column = pd.read_table(
            metadata_file, header=None, usecols=[0], skip_blank_lines=False
        )[0]
        term_and_note = _split_once(first_column, ' ')
        terms = pd.DataFrame({'term': term_and_note[0], 'note': term_and_note[1]})
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms = terms.drop(columns=['note'])

    elif pattern == 'Major':
        # In this dataset all columns contain data.
        # The first column contains (DC) and (AR) and sense numbers which need to be split off.
        terms = pd.read_table(metadata_file, header=None, skip_blank_lines=False)
        terms = terms.rename(columns={0: "term", 1: "domain", 2: 'category'})

        term_and_note = _split_once(terms['term'], ' ')
        terms['term'] = term_and_note[0]
        terms['note'] = term_and_note[1]

        term_and_sense = _split_once(terms['term'], '-')
        terms['term'] = term_and_sense[0]
        terms['sense'] = term_and_sense[1]

        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms = terms.drop(columns=['note'])

    elif pattern == 'SilNt':
        # In this dataset the domain column is empty.
        terms = pd.read_table(metadata_file, header=None, skip_blank_lines=False)
        terms = terms.rename(columns={0: "term", 1: "domain", 2: 'category'})
        terms = terms.drop(columns=['domain'])

    else:
        raise ValueError(
            f"Unsupported pattern {pattern!r}: expected 'All', 'Major' or 'SilNt'."
        )

    for gloss_file in glosses_files:
        # File names look like '<iso>-<pattern>-glosses.txt'.
        iso = gloss_file.name.split('-', 1)[0]
        glosses = pd.read_table(
            gloss_file, header=None, usecols=[0], skip_blank_lines=False
        )[0]
        # Align to the terms first, then blank out missing glosses, so that
        # terms beyond the end of a shorter gloss file also get ''.
        terms[iso] = glosses.reindex(terms.index).fillna('')

    # Remove Major terms that have no English gloss.
    if pattern == 'Major':
        terms = terms.drop(index=terms.index[terms.en == ''])

    # Add the verse references (aligned on the original line index, so rows
    # dropped above do not shift the references).
    terms['vrefs'] = vrefs['vrefs']

    # Explode the dataset by vref  #To save filespace this could be done after reading instead.
    terms = terms.explode('vrefs', ignore_index=True)
    terms.to_json(json_file, orient='records')

    return terms

In [5]:
# Folder holding the metadata, glosses and vrefs asset files.
assets_folder = Path("D:/GitHub/davidbaines/trabina/silnlp/assets")

# Load the three datasets in order; each call also writes its JSON export.
all_terms, major_terms, silnt_terms = (
    read_assets_data(assets_folder, dataset_name)
    for dataset_name in ('All', 'Major', 'SilNt')
)

#print(f"\nAll terms:\n{all_terms.nunique()}\n")
#print(f"Major terms:\n{major_terms.nunique()}\n")
#print(f"SilNt terms:\n{silnt_terms.nunique()}\n")   
all_terms

Unnamed: 0,term,AR,DC,en,vrefs
0,אֵב,False,False,bud,JOB 8:12
1,אֵב,False,False,bud,SNG 6:11
2,אֲבַגְתָא,False,False,Abagtha,EST 1:10
3,אבד,False,False,perish,EXO 10:7
4,אבד,False,False,perish,LEV 23:30
...,...,...,...,...,...
281034,ᾠδή,False,True,,1MA 4:54
281035,ᾠδή,False,True,,1MA 13:51
281036,ᾠδή,False,True,,2MA 7:6
281037,ᾠδή,False,True,,3MA 6:32


In [6]:
# Display the 'Major' terms frame returned by read_assets_data (one row per term and verse reference).
major_terms


Unnamed: 0,term,domain,category,sense,AR,DC,en,es,fr,id,vrefs
0,אֲבַגְתָא,PN,person,,False,False,Abagtha,Abagtá,Avagta,Abagta,EST 1:10
1,אֵבֶה,FL,grasses,,False,False,papyrus,papiro,papyrus,pandan,JOB 9:26
2,אֵבוּס,RE,containers; animal husbandry,,False,False,manger,pesebre,mangeoire,palungan,JOB 39:9
3,אֵבוּס,RE,containers; animal husbandry,,False,False,manger,pesebre,mangeoire,palungan,PRO 14:4
4,אֵבוּס,RE,containers; animal husbandry,,False,False,manger,pesebre,mangeoire,palungan,ISA 1:3
...,...,...,...,...,...,...,...,...,...,...,...
104226,Ωλαμος,PN,person,,False,True,Ono,,,,1ES 5:12
104227,Ωλαμος,PN,person,,False,True,Ono,,,,1ES 9:30
104228,Ωνους,PN,person,,False,True,Ox,,,,1ES 5:22
104229,Ωξ,PN,person,,False,True,Judah,,,,JDT 8:1


In [8]:
# Display the 'SilNt' terms frame returned by read_assets_data (one row per term and verse reference).
silnt_terms

Unnamed: 0,term,category,en,vrefs
0,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,ACT 7:22
1,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,ACT 7:24
2,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,ACT 7:28
3,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,ACT 21:38
4,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,HEB 11:29
...,...,...,...,...
23928,Ῥώμη,9.7.2.3 - Names of cities,Rome,ACT 28:14
23929,Ῥώμη,9.7.2.3 - Names of cities,Rome,ACT 28:16
23930,Ῥώμη,9.7.2.3 - Names of cities,Rome,ROM 1:7
23931,Ῥώμη,9.7.2.3 - Names of cities,Rome,ROM 1:15


In [16]:
# Number of distinct English glosses in the All and Major datasets, one per line.
print(*(len(dataset.en.unique()) for dataset in (all_terms, major_terms)), sep='\n')

# Distinct English glosses of Major terms that also appear among the All terms.
shared_mask = major_terms.en.isin(all_terms.en)
major_in_all = major_terms.loc[shared_mask, 'en'].unique()

print(f"{len(major_in_all)} of the {len(major_terms.en.unique())} English names also occur in the {len(all_terms.en.unique())} All Terms data")


13195
5343
3838 of the 5343 English names also occur in the All Terms data
