In [1]:
#!/usr/bin/env python3

from argparse import ArgumentParser
import csv
from collections import Counter
import json
import numpy as np
import os
import pandas as pd
import re
from pathlib import Path
import sys
#import urllib.request

In [2]:
def get_vrefs(silnlp_vref_file):    
    
    # Get the silnlp references to line numbers:
    with open(silnlp_vref_file, 'r', encoding='utf-8') as vrefs_file:
        vrefs_dict = {ref.strip('\n'): i+1 for i, ref in enumerate(vrefs_file.readlines())}
    
    vrefs = pd.DataFrame([vrefs_dict]).T
    vrefs.rename({0:'silnlp_line_number'}, axis='columns', inplace=True)
    return vrefs

In [3]:
assets_folder = Path("D:/GitHub/davidbaines/trabina/silnlp/assets")
silnlp_vref_file = assets_folder / "vref.txt"
silnlp_vrefs = get_vrefs(silnlp_vref_file)
silnlp_vrefs


Unnamed: 0,silnlp_line_number
GEN 1:1,1
GEN 1:2,2
GEN 1:3,3
GEN 1:4,4
GEN 1:5,5
...,...
ENO 42:12,41895
ENO 42:13,41896
ENO 42:14,41897
ENO 42:15,41898


In [4]:
def read_assets_data(folder, pattern):
    # Function to read in the various PT metadata files. 
    # Each are read in differently.
    
    metadata_file = folder / f"{pattern}-metadata.txt"
    glosses_files = folder.glob(f"*-{pattern}-glosses.txt")
    vrefs_file = folder / f"{pattern}-vrefs.txt"
    json_file = Path(f"D:/GitHub/davidbaines/trabina/data/{pattern}_terms.json")
    tsv_file  = Path(f"D:/GitHub/davidbaines/trabina/data/{pattern}_terms.tsv")
    assets_folder = Path("D:/GitHub/davidbaines/trabina/silnlp/assets")
    silnlp_vref_file = assets_folder / "vref.txt"
    silnlp_vrefs = get_vrefs(silnlp_vref_file)
    
    # The assest folder contains files with pattern from ['Major', 'All', SilNt', 'Pt6']
    # Different sets have different data. 
    
    # Glosses exist for certain languages in separate files.
    # Not all files exist for all patterns. The 'Major' files are as follows:
    # en-Major-glosses.txt, en-Pt6-glosses.txt , en-SilNt-glosses.txt es-Major-glosses.txt fr-Major-glosses.txt, id-Major-glosses.txt Major-metadata.txt , Major-vrefs.txt
    
    # Reading in vrefs is the same for all patterns:
    vrefs = pd.read_csv(vrefs_file,  names=['vrefs'], converters={'vrefs': lambda x: x.split('\t')})     
    
    if pattern == 'All':
        # This dataset doesn't include sense numbers. Only the first column contains data.
        # The column contains (DC) and (AR) which need to be split off.
                
        terms = pd.read_table(metadata_file,header=None, usecols=[0]).squeeze("columns")
        terms.rename('terms')
        
        terms = terms.str.split(' ', expand=True)
        
        terms.rename({0: "term", 1: "note"}, axis="columns", inplace=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
        
    
    if pattern == 'Major':
        # In this dataset all columns contain data.
        # The first column contains (DC) and (AR) and sense numbers which need to be split off
        
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        terms[['term', 'note']] = terms['term'].str.split(' ', 1, expand=True)

        terms[['term', 'sense']] = terms['term'].str.split('-', 1, expand=True)
        terms['AR'] = terms['note'] == '(AR)'
        terms['DC'] = terms['note'] == '(DC)'
        terms.drop(columns=['note'],inplace=True)
        
    if pattern == 'SilNt':
        # In this dataset the domain column is empty.
        terms = pd.read_table(metadata_file,header=None)
        terms.rename({0: "term", 1: "domain", 2:'category'}, axis="columns", inplace=True)
        terms.drop(columns=['domain'],inplace=True)
        
    isos  = list()
    for gloss_file in glosses_files:
        iso = gloss_file.name[:gloss_file.name.find("-")]
        terms[iso] = pd.read_table(gloss_file,header=None, usecols=[0]).squeeze("columns")
        terms[iso] = terms[iso].fillna('')
    
    # Don't remove the one Major term missing an English gloss here.
    # Incase this is messing up the indexing.
    #if pattern == 'Major':
    #    terms.drop(terms[terms.en == ''].index, inplace=True)
    #print(terms[terms.en == ''])

    # Add the verse references
    terms['vrefs'] = vrefs
    
    explode = False
    explode = True
    if explode:
        # Explode the dataset by vref  #To save filespace this could be done after reading instead.
        # Also it might be useful for scanning through the data not to explode.
        terms = terms.explode('vrefs',ignore_index=True)

        # Add the silnlp_line_numbers for each vref
        terms = pd.merge(terms, silnlp_vrefs, how='left', left_on='vrefs', right_index=True)

        #Don't want to set missing silnlp_line_number rows to zero - need to remove them.
        #terms.silnlp_line_number = terms.silnlp_line_number.fillna(0).astype(int)        
        terms.dropna(subset=['silnlp_line_number'], inplace=True)
        
        # Not really interested in the Deutero Canon, drop those too.
        terms.drop(terms[terms.silnlp_line_number > 31170].index, inplace=True)
        
        
    terms.to_json(json_file, orient='records')
    #terms.to_csv(tsv_file, sep = '\t')
    
    return terms

In [5]:
major_terms = read_assets_data(assets_folder, 'Major')
major_terms

Unnamed: 0,term,domain,category,sense,AR,DC,en,es,fr,id,vrefs,silnlp_line_number
0,אֲבַגְתָא,PN,person,,False,False,Abagtha,Abagtá,Avagta,Abagta,EST 1:10,12716.0
1,אֵבֶה,FL,grasses,,False,False,papyrus,papiro,papyrus,pandan,JOB 9:26,13081.0
2,אֵבוּס,RE,containers; animal husbandry,,False,False,manger,pesebre,mangeoire,palungan,JOB 39:9,13847.0
3,אֵבוּס,RE,containers; animal husbandry,,False,False,manger,pesebre,mangeoire,palungan,PRO 14:4,16846.0
4,אֵבוּס,RE,containers; animal husbandry,,False,False,manger,pesebre,mangeoire,palungan,ISA 1:3,17727.0
...,...,...,...,...,...,...,...,...,...,...,...,...
88425,ὡσαννά,MI,"help, protect, save; worship",,False,False,Hosea,Hacufá,sainteté,,MAT 21:15,23910.0
88426,ὡσαννά,MI,"help, protect, save; worship",,False,False,Hosea,Hacufá,sainteté,,MRK 11:9,24718.0
88427,ὡσαννά,MI,"help, protect, save; worship",,False,False,Hosea,Hacufá,sainteté,,MRK 11:10,24719.0
88428,ὡσαννά,MI,"help, protect, save; worship",,False,False,Hosea,Hacufá,sainteté,,JHN 12:13,26662.0


In [6]:
print("These major terms have silnlp_line_number == 0")
#major_terms.query('(silnlp_line_number == 0)').sample(n=15)
major_terms.query('(silnlp_line_number == 0)')

These major terms have silnlp_line_number == 0


Unnamed: 0,term,domain,category,sense,AR,DC,en,es,fr,id,vrefs,silnlp_line_number


In [7]:
all_terms  = read_assets_data(assets_folder, 'All')
all_terms

Unnamed: 0,term,AR,DC,en,vrefs,silnlp_line_number
0,אֵב,False,False,bud,JOB 8:12,13045.0
1,אֵב,False,False,bud,SNG 6:11,17695.0
2,אֲבַגְתָא,False,False,Abagtha,EST 1:10,12716.0
3,אבד,False,False,perish,EXO 10:7,1785.0
4,אבד,False,False,perish,LEV 23:30,3433.0
...,...,...,...,...,...,...
228227,ᾠδή,False,False,ill repute,REV 14:3,30998.0
228228,ᾠδή,False,False,ill repute,REV 14:3,30998.0
228229,ᾠδή,False,False,ill repute,REV 15:3,31018.0
228230,ᾠδή,False,False,ill repute,REV 15:3,31018.0


In [8]:
silnt_terms = read_assets_data(assets_folder, 'SilNt')
silnt_terms

Unnamed: 0,term,category,en,vrefs,silnlp_line_number
0,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,ACT 7:22,27207
1,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,ACT 7:24,27209
2,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,ACT 7:28,27213
3,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,ACT 21:38,27770
4,Αἰγύπτιος,9.7.2.1 - Names of countries,Egyptian,HEB 11:29,30268
...,...,...,...,...,...
23928,Ῥώμη,9.7.2.3 - Names of cities,Rome,ACT 28:14,27981
23929,Ῥώμη,9.7.2.3 - Names of cities,Rome,ACT 28:16,27983
23930,Ῥώμη,9.7.2.3 - Names of cities,Rome,ROM 1:7,28005
23931,Ῥώμη,9.7.2.3 - Names of cities,Rome,ROM 1:15,28013


In [9]:
print(len(all_terms.en.unique() ))
print(len(major_terms.en.unique() ))

major_in_all = major_terms[major_terms.en.isin(all_terms.en)].en.unique()

major_in_all
print(f"{len(major_in_all)} of the {len(major_terms.en.unique())} Major Terms English names also occur in the {len(all_terms.en.unique())} All Terms data")


9119
4551
3048 of the 4551 Major Terms English names also occur in the 9119 All Terms data
