# Exploration and Prep of SDBH

In [1]:
import collections, Levenshtein, re
import xml.etree.ElementTree as ET
from tf.fabric import Fabric

In [2]:
TF = Fabric(locations='~/github/etcbc/bhsa/tf/c')
api = TF.load('''
              book chapter verse
              lex qere 
              voc_lex_utf8
              lex_utf8 pdp
              ''')
api.makeAvailableIn(globals())

This is Text-Fabric 4.3.5
Api reference : https://dans-labs.github.io/text-fabric/Api/General/
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

114 features found and 0 ignored
  0.00s loading features ...
   |     0.01s B book                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B chapter              from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B verse                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.16s B lex_utf8             from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s B qere                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.12s B lex                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B voc_lex_utf8         from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.12s B pdp                  from /Users/cody/github/etcbc/bhsa/tf/c
  4.65s All features loaded/computed - for details use loadLog()


In [3]:
sdbh_resource = '/Users/cody/github/marble-lexicon/SDBH/SDBH.XML'

In [4]:
sdbh_tree = ET.parse(sdbh_resource)

In [5]:
root = sdbh_tree.getroot()

## Conversion To TF

### Map SDBH Domains to Domain Codes

In [6]:
domains = '/Users/cody/github/marble-lexicon/SDBH/SDBH.DM1'
domains2 = '/Users/cody/github/marble-lexicon/SDBH/SDBH.DM2'

domain2code = {}

with open(domains, 'r') as infile:
    domains = [dm.split('\\') + ['version 1'] for dm in infile.read().split('\n\n') 
                   if ''.join(dm.split('\\'))] # <- avoid null lines

with open(domains2, 'r') as infile2:
    domains.extend([dm.split('\\') + ['version 2'] for dm in infile2.read().split('\n\n') 
                       if ''.join(dm.split('\\'))] 
                  )
    
# temporary fix: avoid overwriting Kinship
# Kinship is a .DM1 domain, but it has the same
# name in .DM2. Avoid overwrite here:
domains = [d for d in domains if d and d[4] != 'code 089\n']
    
for i, dom in enumerate(domains):
    dom_data = dict((data.split(' ', 1)[0], data.split(' ', 1)[1]) for data in dom
                        if data.split())
    
    if 'label' in dom_data and 'code' in dom_data:          
        domain2code[dom_data['label'].strip()] = dom_data['version'] + '.' + dom_data['code'].strip()

In [8]:
domain2code['Human']

'2.075'

### Map Lexical Domains to Verse and Word References

### Note of Caution

In this section I find a number of domains without matches in the SDBH.DM1 and SDBH.DM2 files. However, these codes are often formed with categories from those domain files. Some examples are:

> 'Kinship > Officials'<br>
> 'Parts: Plants'

Another one that I have found, however, does not seem to have any direct correspondence:

> חוּץ - 'Referents of Location'

I am not sure why this item does not have a corresponding code.

For this items, I have the option to split the codes into their individual parts and match them to the corresponding codes from the two domain files. For domains like "Kinship > Officials," this appears to work quite well. For some others, though, such as חוץ, caution should be exercised.

I have chosen not to split the codes for now, due to complications and uncertainties that result from doing so.

In [9]:
ref2domains = collections.defaultdict(dict)
cautions = set()
unknowns = set()
split_unknowns = False

for entry in root.findall('Lexicon_Entry'):
    
    this_lex = entry.attrib['Lemma']
    
    for meaning in entry.findall('BaseForms/BaseForm/LEXMeanings/'):
        
        domains = [mean.text for mean in meaning.findall('LEXDomains/LEXDomain')]
        domains = [word for word in domains 
                      if word in domain2code]
        
        domains = '|'.join(domains)
        
        if not domains and split_unknowns: # try again
            domains = [mean.text for mean in meaning.findall('LEXDomains/LEXDomain')]
            
            if domains: # track unmatched domains
                cautions |= set(domains)
            
            domains = [word for domstring in domains 
                          for word in re.findall('|'.join(domain2code.keys()), domstring)]
            domains = '|'.join(domains)
            
            if not domains: # give up
                continue
            
        elif not domains and not split_unknowns:
            unknowns |= set(mean.text for mean in meaning.findall('LEXDomains/LEXDomain'))
            
            
        for ref in meaning.findall('LEXReferences/LEXReference'):

            ref2domains[ref.text[:14]][this_lex] = domains
            
print(len(cautions), 'cautions registered')
print(len(unknowns), 'unknowns registered')

0 cautions registered
1462 unknowns registered


In [10]:
ref2domains['00100202300042'] # test

{'אִישׁ': 'People'}

In [11]:
domain2code['People']

'1.001001002003'

In [12]:
list(cautions)[:10]

[]

In [14]:
list(unknowns)[:30]

['Parts: Creatures > Good',
 'Large > Open',
 'Thin > False',
 'High > Shine',
 'Substances.Smoke',
 'People > Referents',
 'Diligent > Possess',
 'Names of Trees',
 'Chastise > Urge',
 'Clean > Great',
 'Move > Confident',
 'Dissociate > Different',
 'Joy > People',
 'Parts: Ceatures > Sounds > Time',
 'Weak > Afraid',
 'Hold',
 'Deities > Great',
 'Marker',
 'Stance > Non-Happen',
 'Parts: Creatures > Dimension > Time',
 'Attach > Close',
 'adverb',
 'Signs > Soldiers',
 'Love > People',
 'Detach > Reject',
 'Search > See',
 'Ingest > Non-Space',
 'Low >  Great',
 'Space > Non-Possess',
 'Exchange > Strong']

### Prepare Conversion Functions

In [15]:
books = '''Genesis
Exodus
Leviticus
Numbers
Deuteronomy
Joshua
Judges
Ruth
1_Samuel
2_Samuel
1_Kings
2_Kings
1_Chronicles
2_Chronicles
Ezra
Nehemiah
Esther
Job
Psalms
Proverbs
Ecclesiastes
Song_of_songs
Isaiah
Jeremiah
Lamentations
Ezekiel
Daniel
Hosea
Joel
Amos
Obadiah
Jonah
Micah
Nahum
Habakkuk
Zephaniah
Haggai
Zechariah
Malachi'''.split('\n')

books = dict((i+1, book) for i, book in enumerate(books))
consonants = set(letter for w in F.otype.s('word')
                 for letter in F.lex_utf8.v(w))
consonants = list(consonants)
consonants.remove('ׁ')
consonants.remove('ׂ')
#consonants.append()

finals = {'ם': 'מ',
          'ן' : 'נ',
          'ך' : 'כ',
          'ף' : 'פ',
          'ץ' : 'צ'}

In [16]:
def strip(word_string):
    '''
    strips all accentuations
    '''
    word_string = word_string.replace('־', ' ')
    for final in finals:
        word_string = word_string.replace(final, finals[final])
    return ''.join(w for w in word_string if w in consonants)

def with_qere_words(verse, option=1):
    
    '''
    Returns a list of word nodes
    where words are repeated in the
    case of a qere reading.
    '''
    
    words = L.d(verse, 'word')
    qeres = [w for w in words if F.qere.v(w)]
    
    qeres_count = [(qeres[i+1] - w if i+1 < len(qeres) else 0) for i, w in enumerate(qeres)
                      ]
    
    if option == 1:
        for qe, ct in zip(qeres, qeres_count):
            if ct != 1:
                index = words.index(qe) + 1
                words.insert(index, qe)
                
    elif option == 2:
        for qe in qeres:
            index = words.index(qe) + 1
            words.insert(index, qe)
                    
    return words
    
def not_qere(wordnode):
    '''
    Check's whether a wordnode's
    enclosing verse has a qere reading
    or not.
    '''
    
    verse = L.u(wordnode, 'verse')[0]
    qeres = [w for w in L.d(verse, 'word') if F.qere.v(w)]
    
    if not qeres:
        return True
    else:
        return False
    
def look_around(word_node, target_lex, window=4):
    '''
    A last ditch option for lex matching.
    Looks ahead and behind n words.
    '''
    
    verse_words = L.d(L.u(word_node, 'verse')[0], 'word')
    nodes = [word_node+i for i in range(-window, window)
                if word_node+i in verse_words
                and Levenshtein.ratio(strip(F.voc_lex_utf8.v(L.u(word_node+i, 'lex')[0])), target_lex) > 0.7]
    
    if nodes:
        return nodes[0]
    else:
        return None
    
    
def get_node(ref_string, qere_option=1):
    
    '''
    Uses an SDBH reference ID to
    find the corresponding Text-Fabric 
    word node.
    '''

    book = books[round(int(ref_string[:3]))]
    chapt = round(int(ref_string[3:6]))
    verse = round(int(ref_string[6:9]))
    word = int(round(int(ref_string[-3:])) / 2) - 1
    verse_node = T.nodeFromSection((book, chapt, verse))
    verse_words = with_qere_words(verse_node, option=qere_option)
    word_node = verse_words[word]
    
#     if ref_string == '00902000200052':
#         print(f'looking at pos {word}')
#         print(T.text([word_node]))
#         for i, w in enumerate(verse_words):
#             print(i, w, T.text([w]))

    return word_node

### Map Domains to TF Word Nodes

In [17]:
word2domain = {}
exceptions = []

for ref, data in ref2domains.items():
    
    for lex, domains in data.items():    
            
        if not domains:
            continue
            
        lex = strip(lex)
            
        try:
            wordnode = get_node(ref)
            etcbc_lex = strip(F.lex_utf8.v(wordnode))

            if Levenshtein.ratio(etcbc_lex, lex) > 0.7 or etcbc_lex in lex or lex in etcbc_lex:
                word2domain[wordnode] = domains
                continue

            # try a second time with alternative qere disambig
            wordnode = get_node(ref, qere_option=2)
            etcbc_lex = strip(F.lex_utf8.v(wordnode))
            if Levenshtein.ratio(etcbc_lex, lex) > 0.7 or etcbc_lex in lex or lex in etcbc_lex:
                word2domain[wordnode] = domains
                
            elif look_around(wordnode, lex): # third try with a window search
                word2domain[look_around(wordnode, lex)] = domains
                
            elif not_quere(wordnode): # on fourth attempt, if no qere in verse, take the node
                word2domain[wordnode] = domains
                
            else:
                exceptions.append((f'{ref}: unmatched lex: SBDH {lex} ≠ ETCBC {etcbc_lex}'))

        except:
            
            try:
                wordnode = get_node(ref, qere_option=2)
                etcbc_lex = strip(F.lex_utf8.v(wordnode))
                if Levenshtein.ratio(etcbc_lex, lex) > 0.7 or etcbc_lex in lex or lex in etcbc_lex:
                    word2domain[wordnode] = domains
                
            except Exception as e:
                exceptions.append((f'{ref}: {e}; SBDH lex {lex}'))
    
print('exceptions:', len(exceptions))
print('good matches', len(word2domain))

exceptions: 100
good matches 204454


In [18]:
exceptions[:10]

['00600400100042: list index out of range; SBDH lex אמר',
 '01101501800098: list index out of range; SBDH lex אמר',
 '01500400700054: list index out of range; SBDH lex ארמית',
 '01300601100020: list index out of range; SBDH lex בנ',
 '01300601100022: list index out of range; SBDH lex בנ',
 '01401101800036: list index out of range; SBDH lex בנ',
 '00900500600046: list index out of range; SBDH lex גבול',
 '00902401900044: list index out of range; SBDH lex הרג',
 '01300601301040: list index out of range; SBDH lex ושני',
 '00902000200070: list index out of range; SBDH lex זה']

#### Troubleshooting Zone

In [19]:
problem = '00100900500040'
pbook = books[round(int(problem[:3]))]
pchapt = round(int(problem[3:6]))
pverse = round(int(problem[6:9]))
pword = int(round(int(problem[-3:])) / 2) - 1

test = T.nodeFromSection((pbook, pchapt, pverse))

test_words = with_qere_words(test, option=2)

# print(f'problem at {pbook} {pchapt}:{pverse}, {test}')
# print(f'seeking word at pos {pword}\n')

# for i, w in enumerate(test_words):
#     lex = L.u(w, 'lex')[0]
#     print(i, w, F.lex_utf8.v(w))

### Map domains to codes to words

In [20]:
word2code = {}

for w, domains in word2domain.items():
    
    if '>' in domains:
        print(domains)
        break
    
    domains = [word for word in domains.split('|')]

    codes = '|'.join(domain2code.get(dom, '') for dom in domains)
    
    if codes:
        word2code[w] = codes
        
len(word2code)

204454

In [21]:
word2domain[1136]

'People'

### Export Good Matches to TF Resource

In [22]:
meta = {'': {'created_by': 'Renier de Blois (UBS)',
         'coreData': 'BHSA',
         'coreVersion': 'c'
        },
        
    'sem_domain_code' : {'source': 'Exported from the SDBH.XML',
                    'valueType': 'str'},
        
    'sem_domain': {'source': 'Exported from the SDBH.XML',
              'valueType': 'str'}
   }

newFeatures = {'sem_domain_code': word2code,
               'sem_domain': word2domain
              }

save_TF = Fabric(locations='~/github/verb_semantics/project_code/sdbh', silent=True)
api = save_TF.load('', silent=True)

save_TF.save(nodeFeatures=newFeatures, edgeFeatures={}, metaData=meta)
print('EXPORT DONE!')

  0.00s Feature "otype" not available in
/Users/cody/github/verb_semantics/project_code/sdbh/
  0.00s Not all features could be loaded/computed


   |     0.35s T sem_domain           to /Users/cody/github/verb_semantics/project_code/sdbh
   |     0.40s T sem_domain_code      to /Users/cody/github/verb_semantics/project_code/sdbh
EXPORT DONE!


### Testing

In [23]:
TF = Fabric(locations=['~/github/etcbc/bhsa/tf/c', '~/github/verb_semantics/project_code/sdbh'])
api = TF.load('''
              book chapter verse
              lex qere 
              voc_lex_utf8
              lex_utf8 pdp
              sem_domain 
              sem_domain_code 
              gloss
              ''')
api.makeAvailableIn(globals())

This is Text-Fabric 4.3.5
Api reference : https://dans-labs.github.io/text-fabric/Api/General/
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

116 features found and 0 ignored
  0.00s loading features ...
   |     0.01s B book                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B chapter              from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B verse                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.19s B lex_utf8             from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s B qere                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.15s B lex                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B voc_lex_utf8         from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.14s B pdp                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.66s T sem_domain           from /Users/cody/github/verb_

In [24]:
for w in L.d(T.nodeFromSection(('Genesis', 1)), 'word')[:30]:
    
    if not F.sem_domain.v(w):
        continue
    
    print(T.text([w]), F.sem_domain.v(w))

בָּרָ֣א  Exist
אֱלֹהִ֑ים  Deities
אָֽרֶץ׃  Land
אָ֗רֶץ  Land
הָיְתָ֥ה  Exist
תֹ֨הוּ֙  Non-Exist
בֹ֔הוּ  Non-Exist
חֹ֖שֶׁךְ  Dark
תְהֹ֑ום  Waterbodies
מְרַחֶ֖פֶת  Move


In [25]:
# find uncovered nouns

uncovereds = []
covereds = []

for word in F.otype.s('word'):
    
    if F.pdp.v(word) in {'nmpr', 'subs'}:
        
        if not F.sem_domain.v(word):
            uncovereds.append(word)
            
        else:
            covereds.append(word)
            
len(uncovereds)

33438

In [26]:
uncovered_lexs = collections.Counter(F.lex.v(w) for w in uncovereds)
covered_lexs = collections.Counter(F.lex.v(w) for w in covereds)

len(uncovered_lexs)

2109

In [27]:
uncovered_lexs.most_common(10)

[('PNH/', 2127),
 ('JD/', 1611),
 ('>JC/', 1298),
 ('<JN/', 823),
 ('BJT/', 803),
 ('NPC/', 754),
 ('CNH/', 643),
 ('R>C/', 613),
 ('LB/', 584),
 ('BN/', 568)]

In [28]:
covered_lexs.most_common(10)

[('JHWH/', 6626),
 ('KL/', 5276),
 ('BN/', 4369),
 ('MLK/', 2521),
 ('>LHJM/', 2509),
 ('JFR>L/', 2499),
 ('>RY/', 2459),
 ('JWM/', 2233),
 ('<M/', 1613),
 ('DBR/', 1439)]