In [14]:
from nltk import toolbox as tb
from xml.etree import ElementTree as ET

# Parse the Toolbox database into an XML element tree with NLTK.
td = tb.ToolboxData()
td.open('tobelo.db')
tob = td.parse()
td.close()


# The -1 discounts the first child of the tree, which the rest of the
# notebook also skips (via tob[1:]) when it walks the entries.
print('Main entries (lx): ' , len(tob)-1)

# Total number of \se (subentry) fields found directly under each child.
n = sum(len(entry.findall('se')) for entry in tob)
print('Subentries (se): ' , n)
    


Main entries (lx):  3268
Subentries (se):  0


In [15]:
# Preview the first ten children of the tree as [lx, ps, [ge, ...]].
for entry in tob[:10]:
    lx, ps = entry.findtext('lx'), entry.findtext('ps')
    defn = [ge.text for ge in entry.findall('ge')]   # every \ge gloss, in order
    print([lx,ps,defn])

[None, None, []]
['abari', 'o', ['news']]
['abo', 'ma', ['foam']]
['ade-ade', 'ho', ['to narrate', 'conversation', 'story']]
['adono', 'ho', ['come up to', 'reach', 'to arrive somewhere', 'pertaining to time', 'sufficient']]
['aerani', 'ma', ['wonderful', 'strange']]
['aere', 'i na', ['ant trails']]
['aere', 'o', ['high *tide']]
['aga', 'ho', ['dumb', 'stupid']]
['agomo', 'ma', ['thick']]


In [16]:
# Export one CSV row per entry: running number, \lx, first \ps,
# definition and semantic domain.
import csv

entries = []
n = 0   # stays 0 when there are no entries to number
for n, entry in enumerate(tob[1:], start=1):
    # Prefer \de; when the entry has no \de field, join all \ge fields.
    de = entry.findtext('de')
    if de is None:
        de = ', '.join(ge.text for ge in entry.findall('ge'))
    entries.append([
        n,
        entry.findtext('lx'),
        entry.findtext('ps'),        # findtext yields the first \ps only
        de,
        entry.findtext('sd', ''),    # '' when there is no \sd field
    ])


with open("tob.csv", "w", newline="") as f:
    csv.writer(f).writerows(entries)
    

In [None]:
# how many entries have multiple senses
# (i.e., more than one part-of-speech field \ps)
n = sum(1 for entry in tob if len(entry.findall('ps')) > 1)
print(n, 'entries with more than one \\ps')

In [None]:
# Flatten the db: every additional sense (2nd or later \ps within an
# entry) becomes a headword of its own by repeating the entry's \lx line.
#
sf = tb.StandardFormat()
sf.open('tobelo.db')
lines = [(x, y) for (x, y) in sf.fields()]   # (marker, value) pairs
sf.close()

lx = ''          # headword of the entry currently being read
ps_flag = False  # True once the current entry has produced a \ps
tob_sf = []      # flattened [marker, value] pairs
for x, y in lines:
    if x == 'lx':
        lx, ps_flag = y, False
    elif x == 'ps':
        if ps_flag:
            tob_sf.append(['lx', lx])   # 2nd or later \ps: repeat the headword
        ps_flag = True
    tob_sf.append([x, y])

# Write the fields back out, one "\marker value" line per field.
with open('tobelo-flat.db','w',newline="") as f:
    for marker, value in tob_sf:
        f.write('\\' + marker + ' ' + value + '\n')


In [None]:
# Collect the distinct \ps values, in order of first appearance.
pos = []
for entry in tob:
    for label in (ps.text for ps in entry.findall('ps')):
        if label in pos:
            continue
        pos.append(label)
print(pos)

In [None]:
# Collect the distinct parts of speech (in order of first appearance)
# and write them to a csv file for TLex, each value repeated across
# three columns.
pos = []
all_ps = (ps.text for entry in tob for ps in entry.findall('ps'))
for label in all_ps:
    if label not in pos:
        pos.append(label)
pos3 = [[label] * 3 for label in pos]

with open('pos.csv','w',newline="") as f:
    csv.writer(f).writerows(pos3)

In [None]:
# Collect the distinct semantic domains (in order of first appearance)
# and write them to a csv file for TLex, each value repeated across
# three columns.
sem = []
all_sd = (sd.text for entry in tob for sd in entry.findall('sd'))
for domain in all_sd:
    if domain not in sem:
        sem.append(domain)
sem3 = [[domain] * 3 for domain in sem]

print(sem)

with open('sem.csv','w',newline="") as f:
    csv.writer(f).writerows(sem3)

In [18]:
# Parse the flattened db and export it as csv, one row per entry:
# running number, \lx, first \ps, definition, \sd, \xv, \xe.
#
import csv

td = tb.ToolboxData()
td.open('tobelo-flat.db')
tob_flat = td.parse()
td.close()

entries = []
n = 0   # stays 0 when there are no entries to number
for n, entry in enumerate(tob_flat[1:], start=1):
    # Prefer \de; when the entry has no \de field, join all \ge fields.
    de = entry.findtext('de')
    if de is None:
        de = ', '.join(ge.text for ge in entry.findall('ge'))
    row = [n, entry.findtext('lx'), entry.findtext('ps'), de]
    # Optional fields default to '' when the entry lacks them.
    row.extend(entry.findtext(tag, '') for tag in ('sd', 'xv', 'xe'))
    entries.append(row)


with open("tob-flat.csv", "w", newline="") as f:
    csv.writer(f).writerows(entries)
    

In [None]:
# write to XML for import to Tshwanelex.
# No need to "flatten" file since Tshwanelex supports
# multiple senses.
#
# Every field value is XML-escaped before it is placed inside an
# attribute, so text containing &, <, > or " no longer breaks the markup.
from xml.sax.saxutils import escape


def _attr(text):
    """Return text escaped for use inside a double-quoted XML attribute.

    None is treated as the empty string.
    """
    return escape(text or '', {'"': '&quot;'})


def _sense_translations(defn, ges):
    """Return the <TE> lines of one sense: the de text if any, else one per ge."""
    if defn:
        return ['<TE TE="' + _attr(defn) + '"/>\n']
    return ['<TE TE="' + _attr(ge) + '"/>\n' for ge in ges]


parts = []   # output fragments, joined once at the end
for entry in tob[1:]:
    lx = entry.findtext('lx')

    # all \sd fields of the entry, comma-separated
    sem_doms = ','.join(sd.text or '' for sd in entry.findall('sd'))

    parts.append('<Lemma LemmaSign="' + _attr(lx) + '" SemDomain="' + _attr(sem_doms) + '">\n')

    # ps tag should trigger new sense, but need to deal with entries which lack ps tags
    # set flag for entries with no ps
    NoPS = entry.findtext('ps') is None

    sn = 0 # sense number
    defn = '' # \de field
    ges = [] # \ge fields
    for line in entry:
        if line.tag == 'ps' or NoPS: # new sense (or no sense)
            sn += 1
            if sn > 1: # close previous sense
                parts.extend(_sense_translations(defn, ges))
                parts.append('</Sense>\n')
                defn = '' # reset
                ges = []  # reset
            # start new sense
            if NoPS:
                # entry without \ps: open a single sense on the first field,
                # then clear the flag so later fields are read normally
                parts.append('<Sense SenseNumber="' + str(sn) + '">\n')
                NoPS = False
            else:
                parts.append('<Sense SenseNumber="' + str(sn) + '" PartOfSpeech="' + _attr(line.text) + '">\n')
        elif line.tag == 'de':
            defn = line.text
        elif line.tag == 'ge':
            ges.append(line.text)
        elif line.tag == 'sc':
            parts.append('<Definition SciName="' + _attr(line.text) + '"/>\n')
        # NOTE(review): \xv opens the <Example> tag and \xe closes it, so an
        # \xv without a directly following \xe (or the reverse) still yields
        # malformed markup -- confirm the data always pairs them.
        elif line.tag == 'xv':
            parts.append('<Example Example="' + _attr(line.text) + '" ')
        elif line.tag == 'xe':
            parts.append('Translation="' + _attr(line.text) + '"/>\n')

    # close last sense
    parts.extend(_sense_translations(defn, ges))
    parts.append('</Sense>\n')

    parts.append('</Lemma>\n\n')

s = ''.join(parts)

with open('tobelo-out.xml','w',newline="") as f:
    f.write(s)