# Exploration and Prep of SDBH

In [523]:
import collections, Levenshtein, re
import xml.etree.ElementTree as ET
from tf.fabric import Fabric

In [462]:
TF = Fabric(locations='~/github/etcbc/bhsa/tf/c')
api = TF.load('''
              book chapter verse
              lex qere 
              voc_lex_utf8
              lex_utf8
              ''')
api.makeAvailableIn(globals())

This is Text-Fabric 4.1.2
Api reference : https://dans-labs.github.io/text-fabric/Api/General/
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

114 features found and 0 ignored
  0.00s loading features ...
   |     0.01s B book                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B chapter              from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B verse                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.23s B lex_utf8             from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s B qere                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.12s B lex                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B voc_lex_utf8         from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s Feature overview: 109 for nodes; 4 for edges; 1 configs; 7 computed
  5.88s All features loaded/computed - for details use loadLog

In [56]:
sdbh_resource = '/Users/cody/github/marble-lexicon/SDBH/SDBH.XML'

In [57]:
sdbh_tree = ET.parse(sdbh_resource)

In [58]:
root = sdbh_tree.getroot()

## Conversion To TF

### Map SDBH Domains to Domain Codes

In [606]:
domains = '/Users/cody/github/marble-lexicon/SDBH/SDBH.DM1'
domain2code = {}

with open(domains, 'r') as infile:
    domains = [dm.split('\\') for dm in infile.read().split('\n\n') 
                   if ''.join(dm.split('\\'))]
    
for i, dom in enumerate(domains):
    
    dom_data = dict((data.split()[0], data.split()[1]) for data in dom
                        if data.split())
    
    
    if 'label' in dom_data and 'code' in dom_data:          
        domain2code[dom_data['label']] = dom_data['code']

### Map Lexical Domains to Verse and Word References

In [625]:
ref2domains = collections.defaultdict(dict)

for entry in root.findall('Lexicon_Entry'):
    this_lex = entry.attrib['Lemma']
    for meaning in entry.findall('BaseForms/BaseForm/LEXMeanings/'):
        domains = [mean.text for mean in meaning.findall('LEXDomains/LEXDomain')]
        domains = [word for domstring in domains 
                      for word in domstring.split()
                      if word in domain2code]
        domains = '|'.join(domains)
        
        for ref in meaning.findall('LEXReferences/LEXReference'):        
            ref2domains[ref.text[:14]][this_lex] = domains

In [626]:
ref2domains['00901201500040'] # test

{'אָב': 'Kinship'}

### Prepare Conversion Functions

In [627]:
books = '''Genesis
Exodus
Leviticus
Numbers
Deuteronomy
Joshua
Judges
Ruth
1_Samuel
2_Samuel
1_Kings
2_Kings
1_Chronicles
2_Chronicles
Ezra
Nehemiah
Esther
Job
Psalms
Proverbs
Ecclesiastes
Song_of_songs
Isaiah
Jeremiah
Lamentations
Ezekiel
Daniel
Hosea
Joel
Amos
Obadiah
Jonah
Micah
Nahum
Habakkuk
Zephaniah
Haggai
Zechariah
Malachi'''.split('\n')

books = dict((i+1, book) for i, book in enumerate(books))
consonants = set(letter for w in F.otype.s('word')
                 for letter in F.lex_utf8.v(w))
consonants = list(consonants)
consonants.remove('ׁ')
consonants.remove('ׂ')
#consonants.append()

finals = {'ם': 'מ',
          'ן' : 'נ',
          'ך' : 'כ',
          'ף' : 'פ',
          'ץ' : 'צ'}

In [628]:
def strip(word_string):
    '''
    strips all accentuations
    '''
    word_string = word_string.replace('־', ' ')
    for final in finals:
        word_string = word_string.replace(final, finals[final])
    return ''.join(w for w in word_string if w in consonants)

def with_qere_words(verse, option=1):
    
    '''
    Returns a list of word nodes
    where words are repeated in the
    case of a qere reading.
    '''
    
    words = L.d(verse, 'word')
    qeres = [w for w in words if F.qere.v(w)]
    
    qeres_count = [(qeres[i+1] - w if i+1 < len(qeres) else 0) for i, w in enumerate(qeres)
                      ]
    
    if option == 1:
        for qe, ct in zip(qeres, qeres_count):
            if ct != 1:
                index = words.index(qe) + 1
                words.insert(index, qe)
                
    elif option == 2:
        for qe in qeres:
            index = words.index(qe) + 1
            words.insert(index, qe)
                    
    return words
    
def look_around(word_node, target_lex, window=4):
    '''
    A last ditch option for lex matching.
    Looks ahead and behind n words.
    '''
    
    verse_words = L.d(L.u(word_node, 'verse')[0], 'word')
    nodes = [word_node+i for i in range(-window, window)
                if word_node+i in verse_words
                and Levenshtein.ratio(strip(F.voc_lex_utf8.v(L.u(word_node+i, 'lex')[0])), target_lex) > 0.7]
    
    if nodes:
        return nodes[0]
    else:
        return None
    
    
def get_node(ref_string, qere_option=1):
    
    '''
    Uses an SDBH reference ID to
    find the corresponding Text-Fabric 
    word node.
    '''

    book = books[round(int(ref_string[:3]))]
    chapt = round(int(ref_string[3:6]))
    verse = round(int(ref_string[6:9]))
    word = int(round(int(ref_string[-3:])) / 2) - 1
    verse_node = T.nodeFromSection((book, chapt, verse))
    verse_words = with_qere_words(verse_node, option=qere_option)
    word_node = verse_words[word]
    
#     if ref_string == '00902000200052':
#         print(f'looking at pos {word}')
#         print(T.text([word_node]))
#         for i, w in enumerate(verse_words):
#             print(i, w, T.text([w]))

    return word_node

### Map & Export Domains and Domain Codes to TF Word Nodes

In [638]:
word2domain = {}
exceptions = []

for ref, data in ref2domains.items():
    
    for lex, domains in data.items():    
            
        if not domains:
            continue
            
        lex = strip(lex)
            
        try:
            wordnode = get_node(ref)
            etcbc_lex = strip(F.lex_utf8.v(wordnode))

            if Levenshtein.ratio(etcbc_lex, lex) > 0.7 or etcbc_lex in lex or lex in etcbc_lex:
                word2domain[wordnode] = domains
                continue

            # try a second time with alternative qere disambig
            wordnode = get_node(ref, qere_option=2)
            etcbc_lex = strip(F.lex_utf8.v(wordnode))
            if Levenshtein.ratio(etcbc_lex, lex) > 0.7 or etcbc_lex in lex or lex in etcbc_lex:
                word2domain[wordnode] = domains
                
            elif look_around(wordnode, lex):
                word2domain[look_around(wordnode, lex)] = domains
                
            else:
                exceptions.append((f'{ref}: unmatched lex: SBDH {lex} ≠ ETCBC {etcbc_lex}'))

        except:
            
            try:
                wordnode = get_node(ref, qere_option=2)
                etcbc_lex = strip(F.lex_utf8.v(wordnode))
                if Levenshtein.ratio(etcbc_lex, lex) > 0.7 or etcbc_lex in lex or lex in etcbc_lex:
                    word2domain[wordnode] = domains
                
            except Exception as e:
                exceptions.append((f'{ref}: {e}; SBDH lex {lex}'))
    
print('exceptions:', len(exceptions))
print('good matches', len(word2domain))

exceptions: 1868
good matches 260366


In [639]:
exceptions[:10]

['02002702000010: unmatched lex: SBDH אבדונ ≠ ETCBC אבדה',
 '02801301400016: unmatched lex: SBDH איה ≠ ETCBC אהי',
 '02801301400022: unmatched lex: SBDH איה ≠ ETCBC אהי',
 '02003100400030: unmatched lex: SBDH או ≠ ETCBC אי',
 '02003100400032: unmatched lex: SBDH אוה ≠ ETCBC אי',
 '00502601400008: unmatched lex: SBDH אוני ≠ ETCBC אנה',
 '02800900400026: unmatched lex: SBDH אוני ≠ ETCBC אנה',
 '02302601900024: unmatched lex: SBDH אורה ≠ ETCBC ארת',
 '01200403900018: unmatched lex: SBDH אורה ≠ ETCBC ארת',
 '01403202800042: unmatched lex: SBDH אורה ≠ ETCBC אורות']

In [640]:
problem = '02604004400032'
pbook = books[round(int(problem[:3]))]
pchapt = round(int(problem[3:6]))
pverse = round(int(problem[6:9]))
pword = int(round(int(problem[-3:])) / 2) - 1

test = T.nodeFromSection((pbook, pchapt, pverse))

test_words = with_qere_words(test, option=2)

print(f'problem at {pbook} {pchapt}:{pverse}, {test}')
print(f'seeking word at pos {pword}\n')

for i, w in enumerate(test_words):
    lex = L.u(w, 'lex')[0]
    print(i, w, F.lex_utf8.v(w))

problem at Ezekiel 40:44, 1428257
seeking word at pos 15

0 285925 ו
1 285926 מן
2 285927 חוץ
3 285928 ל
4 285929 ה
5 285930 שׁער
6 285931 ה
7 285932 פנימי
8 285933 לשׁכה
9 285934 שׁיר
10 285935 ב
11 285936 ה
12 285937 חצר
13 285938 ה
14 285939 פנימי
15 285940 אשׁר
16 285941 אל
17 285942 כתף
18 285943 שׁער
19 285944 ה
20 285945 צפון
21 285946 ו
22 285947 פנה
23 285948 דרך
24 285949 ה
25 285950 דרום
26 285951 אחד
27 285952 אל
28 285953 כתף
29 285954 שׁער
30 285955 ה
31 285956 קדים
32 285957 פנה
33 285958 דרך
34 285959 ה
35 285960 צפון


### Map domains to codes to words

In [641]:
word2code = {}

for w, domains in word2domain.items():

    domains = [word.strip() for word in re.split('\||>', domains)]
    codes = '|'.join(domain2code.get(dom, '') for dom in domains)
    
    if codes:
        word2code[w] = codes
        
len(word2code)

260366

### Export Good Matches to TF Resource

In [644]:
meta = {'': {'created_by': 'Renier de Blois (UBS)',
         'coreData': 'BHSA',
         'coreVersion': 'c'
        },
        
    'sem_domain_code' : {'source': 'Exported from the SDBL.XML',
                    'valueType': 'str'},
        
    'sem_domain': {'source': 'Exported from the SDBL.XML',
              'valueType': 'str'}
   }

newFeatures = {'sem_domain_code': word2code,
               'sem_domain': word2domain
              }

save_TF = Fabric(locations='~/github/semantics/project_code/sdbh', silent=True)
api = save_TF.load('', silent=True)

save_TF.save(nodeFeatures=newFeatures, edgeFeatures={}, metaData=meta)

  0.00s Feature "otype" not available in
/Users/cody/github/semantics/project_code/sdbh/
  0.00s Not all features could be loaded/computed


   |     0.46s T sem_domain           to /Users/cody/github/semantics/project_code/sdbh
   |     0.43s T sem_domain_code      to /Users/cody/github/semantics/project_code/sdbh


### Testing

In [645]:
TF = Fabric(locations=['~/github/etcbc/bhsa/tf/c', '~/github/semantics/project_code/sdbh'])
api = TF.load('''
              book chapter verse
              lex
              sem_domain 
              sem_domain_code 
              gloss
              ''')
api.makeAvailableIn(globals())

This is Text-Fabric 4.1.2
Api reference : https://dans-labs.github.io/text-fabric/Api/General/
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

116 features found and 0 ignored
  0.00s loading features ...
   |     0.01s B book                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s B chapter              from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s B verse                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.11s B lex                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.83s T sem_domain           from /Users/cody/github/semantics/project_code/sdbh
   |     0.82s T sem_domain_code      from /Users/cody/github/semantics/project_code/sdbh
   |     0.00s B gloss                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s Feature overview: 111 for nodes; 4 for edges; 1 configs; 7 computed
  5.81s All features loaded/computed -

In [646]:
for w in L.d(T.nodeFromSection(('Genesis', 1)), 'word'):
    
    if not F.sem_domain.v(w):
        continue
    
    print(T.text([w]), F.sem_domain.v(w))

בָּרָ֣א  Exist
אֱלֹהִ֑ים  Deities
אֵ֥ת  Identifiers
אֵ֥ת  Identifiers
הָיְתָ֥ה  Exist
תֹ֨הוּ֙  Non-Exist
בֹ֔הוּ  Non-Exist
חֹ֖שֶׁךְ  Dark
עַל־ Location
תְהֹ֑ום  Waterbodies
ר֣וּחַ  Spirit|Deities
אֱלֹהִ֔ים  Deities|Intense
מְרַחֶ֖פֶת  Move
עַל־ Location
מָּֽיִם׃  Liquids
יֹּ֥אמֶר  Speak
אֱלֹהִ֖ים  Deities
יְהִ֣י  Exist
אֹ֑ור  Shine
יְהִי־ Exist
אֹֽור׃  Shine
יַּ֧רְא  See
אֱלֹהִ֛ים  Deities
אֶת־ Identifiers
אֹ֖ור  Shine
כִּי־ Perception
יַּבְדֵּ֣ל  Divide
אֱלֹהִ֔ים  Deities
בֵּ֥ין  Occurrence
אֹ֖ור  Shine
בֵ֥ין  Occurrence
חֹֽשֶׁךְ׃  Dark
יִּקְרָ֨א  Speak
אֱלֹהִ֤ים׀  Deities
אֹור֙  Shine
יֹ֔ום  Shine|Universe|Time
חֹ֖שֶׁךְ  Dark
קָ֣רָא  Speak
לָ֑יְלָה  Time
יְהִי־ Happen
עֶ֥רֶב  Time
יְהִי־ Happen
בֹ֖קֶר  Time
יֹ֥ום  Time
אֶחָֽד׃ פ  Quantity|Frequency
יֹּ֣אמֶר  Speak
אֱלֹהִ֔ים  Deities
יְהִ֥י  Exist
רָקִ֖יעַ  Universe
בְּ Location
תֹ֣וךְ  Orientation
מָּ֑יִם  Liquids
יהִ֣י  Events
מַבְדִּ֔יל  Divide
בֵּ֥ין  Location
מַ֖יִם  Liquids
מָֽיִם׃  Liquids
יַּ֣עַשׂ  Exist
אֱלֹהִים֮  Deities
אֶת