# Generate morphology JSON files from CATSS dataset

In [1]:
import sys
import regex 
import collections
from greekutils import beta2unicode
from pathlib import Path
# Directory that holds the CATSS morphology source files; it is globbed
# for '*.mlxx' files when the words are read in.
data = Path('../source/patched')

In [2]:
# Map each source filename to the normalized filename of the book it belongs
# to. Entries are grouped by target because some books are split across
# several source files; source filenames were copied from a directory listing.
book_norms = {
    source: target
    for target, sources in [
        ('01.GEN.mlxx', ['01.Gen.1.mlxx', '02.Gen.2.mlxx']),
        ('02.EXO.mlxx', ['03.Exod.mlxx']),
        ('03.LEV.mlxx', ['04.Lev.mlxx']),
        ('04.NUM.mlxx', ['05.Num.mlxx']),
        ('05.DEU.mlxx', ['06.Deut.mlxx']),
        ('06.JOS_B.mlxx', ['07.JoshB.mlxx']),
        ('07.JOS_A.mlxx', ['08.JoshA.mlxx']),
        ('08.JDG_B.mlxx', ['09.JudgesB.mlxx']),
        ('09.JDG_A.mlxx', ['10.JudgesA.mlxx']),
        ('10.RUT.mlxx', ['11.Ruth.mlxx']),
        ('11.1SA.mlxx', ['12.1Sam.mlxx']),
        ('12.2SA.mlxx', ['13.2Sam.mlxx']),
        ('13.1KI.mlxx', ['14.1Kings.mlxx']),
        ('14.2KI.mlxx', ['15.2Kings.mlxx']),
        ('15.1CH.mlxx', ['16.1Chron.mlxx']),
        ('16.2CH.mlxx', ['17.2Chron.mlxx']),
        ('17.1ES.mlxx', ['18.1Esdras.mlxx']),
        ('18.2ES.mlxx', ['19.2Esdras.mlxx']),
        ('19.ESG.mlxx', ['20.Esther.mlxx']),
        ('20.JDT.mlxx', ['21.Judith.mlxx']),
        ('21.TOB_BA.mlxx', ['22.TobitBA.mlxx']),
        ('22.TOB_S.mlxx', ['23.TobitS.mlxx']),
        ('23.1MA.mlxx', ['24.1Macc.mlxx']),
        ('24.2MA.mlxx', ['25.2Macc.mlxx']),
        ('25.3MA.mlxx', ['26.3Macc.mlxx']),
        ('26.4MA.mlxx', ['27.4Macc.mlxx']),
        ('27.PSA.mlxx', ['28.Psalms1.mlxx', '29.Psalms2.mlxx']),
        ('28.ODA.mlxx', ['30.Odes.mlxx']),
        ('29.PRO.mlxx', ['31.Proverbs.mlxx']),
        ('30.ECC.mlxx', ['32.Qoheleth.mlxx']),
        ('31.SNG.mlxx', ['33.Canticles.mlxx']),
        ('32.JOB.mlxx', ['34.Job.mlxx']),
        ('33.WIS.mlxx', ['35.Wisdom.mlxx']),
        ('34.SIR.mlxx', ['36.Sirach.mlxx']),
        ('35.PSS.mlxx', ['37.PsSol.mlxx']),
        ('36.HOS.mlxx', ['38.Hosea.mlxx']),
        ('37.MIC.mlxx', ['39.Micah.mlxx']),
        ('38.AMO.mlxx', ['40.Amos.mlxx']),
        ('39.JOL.mlxx', ['41.Joel.mlxx']),
        ('40.JON.mlxx', ['42.Jonah.mlxx']),
        ('41.OBA.mlxx', ['43.Obadiah.mlxx']),
        ('42.NAM.mlxx', ['44.Nahum.mlxx']),
        ('43.HAB.mlxx', ['45.Habakkuk.mlxx']),
        ('44.ZEP.mlxx', ['46.Zeph.mlxx']),
        ('45.HAG.mlxx', ['47.Haggai.mlxx']),
        ('46.ZEC.mlxx', ['48.Zech.mlxx']),
        ('47.MAL.mlxx', ['49.Malachi.mlxx']),
        ('48.ISA.mlxx', ['50.Isaiah1.mlxx', '51.Isaiah2.mlxx']),
        ('49.JER.mlxx', ['52.Jer1.mlxx', '53.Jer2.mlxx']),
        ('50.BAR.mlxx', ['54.Baruch.mlxx']),
        ('51.LJE.mlxx', ['55.EpJer.mlxx']),
        ('52.LAM.mlxx', ['56.Lam.mlxx']),
        ('53.EZE.mlxx', ['57.Ezek1.mlxx', '58.Ezek2.mlxx']),
        ('54.BEL_OG.mlxx', ['59.BelOG.mlxx']),
        ('55.BEL_TH.mlxx', ['60.BelTh.mlxx']),
        ('56.DAG.mlxx', ['61.DanielOG.mlxx']),
        ('57.DAG_TH.mlxx', ['62.DanielTh.mlxx']),
        ('58.SUS_OG.mlxx', ['63.SusOG.mlxx']),
        ('59.SUS_TH.mlxx', ['64.SusTh.mlxx']),
    ]
    for source in sources
}


# Pattern template: the formatted-in letter matches only when it is followed
# by whitespace or sits at the end of the input (i.e. it is word-final).
final_letter = r'{}(?=\s|$)'

# (compiled pattern, replacement) pairs applied to converted Greek text:
# a word-final medial sigma is replaced by the final-sigma form.
final_grk = [
    (regex.compile(final_letter.format(medial)), final)
    for medial, final in [('σ', 'ς')]
]

def sub_final(string, re_set):
    """Apply every (pattern, replacement) pair in re_set to string, in order.

    Each pattern must expose a ``sub(repl, string)`` method (a compiled
    regular expression). Later pairs see the output of earlier ones.
    Returns the fully substituted string.
    """
    result = string
    for pair in re_set:
        pattern, replacement = pair
        result = pattern.sub(replacement, result)
    return result

def utf8_greek(string):
    """Convert a transcribed Greek string to Unicode Greek.

    The string is passed through ``beta2unicode.convert`` and the result is
    then run through the ``final_grk`` substitutions (word-final sigma).
    """
    return sub_final(beta2unicode.convert(string), final_grk)

In [3]:
errors = []

# normalized book filename -> reference string -> list of (utf8, morph, trans)
morph_data = collections.defaultdict(lambda: collections.defaultdict(list))

for file in sorted(data.glob('*.mlxx')):

    new_file = book_norms[file.name]
    book_name = new_file.split('.')[1]

    lines = file.read_text().split('\n')

    print(f'processing words for {file.name}...')

    for line in lines:

        # split() with no arguments drops surrounding whitespace and returns
        # an empty list for a blank line
        line_data = line.split()

        # blank line: nothing to record
        if not line_data:
            continue

        # a lone token is a section marker with no chapter:verse label
        # (some superscriptions or in-doubt texts)
        if len(line_data) == 1:
            line_data.append('0:0')  # place-holder chapter:verse

        # two fields: a reference line; it applies to the word lines after it
        if len(line_data) == 2:
            ref_str = f'{book_name} {line_data[1]}'

        # more than two fields: a slot (word) line
        else:
            # NOTE(review): a word line that precedes every reference line
            # reuses ref_str from the previous file (or raises NameError in
            # the very first file) -- confirm each file opens with a
            # reference line.
            trans, *morph_fields = line_data
            # morpho data into dot-separated string, disambiguate later
            morph = '.'.join(morph_fields)
            utf8 = utf8_greek(trans)
            morph_data[new_file][ref_str].append((utf8, morph, trans))

processing words for 01.Gen.1.mlxx...
processing words for 02.Gen.2.mlxx...
processing words for 03.Exod.mlxx...
processing words for 04.Lev.mlxx...
processing words for 05.Num.mlxx...
processing words for 06.Deut.mlxx...
processing words for 07.JoshB.mlxx...
processing words for 08.JoshA.mlxx...
processing words for 09.JudgesB.mlxx...
processing words for 10.JudgesA.mlxx...
processing words for 11.Ruth.mlxx...
processing words for 12.1Sam.mlxx...
processing words for 13.2Sam.mlxx...
processing words for 14.1Kings.mlxx...
processing words for 15.2Kings.mlxx...
processing words for 16.1Chron.mlxx...
processing words for 17.2Chron.mlxx...
processing words for 18.1Esdras.mlxx...
processing words for 19.2Esdras.mlxx...
processing words for 20.Esther.mlxx...
processing words for 21.Judith.mlxx...
processing words for 22.TobitBA.mlxx...
processing words for 23.TobitS.mlxx...
processing words for 24.1Macc.mlxx...
processing words for 25.2Macc.mlxx...
processing words for 26.3Macc.mlxx...
proc

In [4]:
# Spot-check: display the (utf8, morph, trans) tuples stored for Genesis 1:1.
morph_data['01.GEN.mlxx']['GEN 1:1']

[('ἐν', 'P.E)N', 'E)N'),
 ('ἀρχῇ', 'N1.DSF.A)RXH/', 'A)RXH=|'),
 ('ἐποίησεν', 'VAI.AAI3S.POIE/W', 'E)POI/HSEN'),
 ('ὁ', 'RA.NSM.O(', 'O('),
 ('θεὸς', 'N2.NSM.QEO/S', 'QEO\\S'),
 ('τὸν', 'RA.ASM.O(', 'TO\\N'),
 ('οὐρανὸν', 'N2.ASM.OU)RANO/S', 'OU)RANO\\N'),
 ('καὶ', 'C.KAI/', 'KAI\\'),
 ('τὴν', 'RA.ASF.O(', 'TH\\N'),
 ('γῆν', 'N1.ASF.GH=', 'GH=N')]

In [5]:
# Notes to myself
# prototypical counts per type:
# 3 - adjv, noun, verb
# 2 - advb, conj, intj, part, prep, inum
# 3 - inum, pron, propn
# 4 - propn (N.N.M.MESRAIM), verb (participle)

# those with overloaded lexemes:
# verb(>3, not participle), verb(>4, participle)

# store new features here: feature_name to node to feature
features = collections.defaultdict(dict)

# conversion dicts: first character of the type code -> word type
typs = {'N': 'noun',
        'V': 'verb',
        'A': 'adjv',
        'R': 'pron',
        'C': 'conj',
        'X': 'part',
        'I': 'intj',
        'M': 'inum',
        'P': 'prep',
        'D': 'advb'}
       #'N': 'propn' proper noun, added in parse_morpho with a special rule

# nominals
# [case][number][gender]
cases = {'N': 'nom',
         'G': 'gen',
         'D': 'dat',
         'A': 'acc',
         'V': 'voc'}
numbers = {'S': 'sg',
           'D': 'du',
           'P': 'pl'}
genders = {'M': 'm',
           'F': 'f',
           'N': 'n'}
degrees = {'C': 'comparative',
           'S': 'superlative'}

# verbs
# [tense][voice][mood][person][number] [case][number][gender]

tenses = {'P': 'present',
          'I': 'imperfect',
          'F': 'future',
          'A': 'aorist',
          'X': 'perfect',
          'Y': 'pluperfect'}
voices = {'A': 'active',
          'M': 'middle',
          'P': 'passive'}
moods = {'I': 'indc',
         'D': 'impv',
         'S': 'subj',
         'O': 'optv',
         'N': 'infv',
         'P': 'ptcp'}


def parse_morpho(morph_str):
    """Parse a dot-separated LXX morphology string into a feature dict.

    The string has the form ``<type code>[.<parsing code>].<lexeme>``, e.g.
    ``'N2.NSM.QEO/S'`` or ``'C.KAI/'``. A lexeme that itself contains dots is
    kept intact.

    Args:
        morph_str: the dot-joined morphology fields of one word.

    Returns:
        A dict with the keys ``typ``, ``styp``, ``lexeme``, ``morph_code``
        and whichever of ``case``, ``number``, ``gender``, ``degree``,
        ``tense``, ``voice``, ``mood``, ``person`` apply. Features whose
        value is empty are left out.

    Raises:
        KeyError: if the type code or a tense/mood/participle letter is
            not in the conversion tables.
        ValueError: if the voice letter of a verb is missing or unknown;
            the offending ``morph_str`` is the exception argument.
    """
    split_morph = morph_str.split('.')

    # everything except the last field
    morph_code = '.'.join(split_morph[:-1])

    # 1. assign subtypes and types

    styp = split_morph[0]  # subtype

    # a bare 'N' (no subtype) marks a proper noun; otherwise the type is
    # given by the first character of the code
    if styp == 'N':
        typ = 'propn'
    else:
        typ = typs[styp[0]]

    # 2. assign parsing data; every feature starts out empty and only the
    #    applicable ones are filled in below
    case = number = gender = degree = ''
    tense = voice = mood = person = ''

    is_nominal = typ in {'adjv', 'noun', 'inum', 'pron', 'propn'}
    is_verb = typ == 'verb'

    # indeclinable words: type code + lexeme only, adverbs/conjunctions, and
    # any remaining type (part, intj, prep) even when its lexeme spans
    # several fields
    if (len(split_morph) <= 2 or typ in {'advb', 'conj'}
            or not (is_nominal or is_verb)):
        lexeme = '.'.join(split_morph[1:])

    # nominal words with case/number/gender
    elif is_nominal:

        parsing_data = split_morph[1]

        # some parsing codes have < 3 values, so walk the characters
        for i, char in enumerate(parsing_data):

            # dative/dual disambiguation: 'D' is a case only in first position
            if char == 'D':
                if i == 0:
                    case = 'dat'
                else:
                    number = 'du'

            # 'S'/'C' is a degree only in a 4-character adjective code, and
            # never in the second position
            elif (char in degrees and len(parsing_data) == 4
                  and typ == 'adjv' and i != 1):
                degree = degrees[char]

            # every other character fills exactly ONE feature: the first
            # still-empty one (case, number, gender) that accepts it. This
            # keeps a leading 'N' (nominative) from also being read as
            # neuter gender.
            elif not case and char in cases:
                case = cases[char]
            elif not number and char in numbers:
                number = numbers[char]
            elif not gender and char in genders:
                gender = genders[char]

        lexeme = '.'.join(split_morph[2:])

    # verbs
    else:

        parsing_data = split_morph[1]
        tense = tenses[parsing_data[0]]
        try:
            voice = voices[parsing_data[1]]
        except (KeyError, IndexError) as err:
            raise ValueError(morph_str) from err
        mood = moods[parsing_data[2]]

        # participles: only they have more than 5 characters
        # [tense][voice][mood][case][number][gender]
        if len(parsing_data) > 5:
            case = cases[parsing_data[3]]
            number = numbers[parsing_data[4]]
            gender = genders[parsing_data[5]]

        # all normal (finite) verbs: [tense][voice][mood][person][number]
        elif len(parsing_data) > 4:
            person = parsing_data[3]
            number = numbers[parsing_data[4]]

        # anything shorter (infinitives) has no further features

        lexeme = '.'.join(split_morph[2:])

    parsed = {
        'typ': typ,
        'styp': styp,
        'lexeme': lexeme,
        'morph_code': morph_code,
        'case': case,
        'number': number,
        'gender': gender,
        'degree': degree,
        'tense': tense,
        'voice': voice,
        'mood': mood,
        'person': person,
    }

    # filter empty features
    return {key: value for key, value in parsed.items() if value}


# reassemble data here into a list of lists, one entry per book:
# [book, [verse_ref, word_dict, word_dict, ...], [verse_ref, ...], ...]
morph_data_plus = []

for book, verses in morph_data.items():
    book_data = [book]
    for verse_ref, lines in verses.items():
        # each word: its utf8 and transcribed forms plus the parsed features
        words = [
            {'utf8': utf8, 'trans': trans, **parse_morpho(morpho)}
            for utf8, morpho, trans in lines
        ]
        book_data.append([verse_ref, *words])
    morph_data_plus.append(book_data)

In [6]:
# Number of books in the reassembled dataset (one list entry per book).
len(morph_data_plus)

59

In [7]:
import json

# export prototype dataset: one JSON file per book, named after the
# normalized book filename with '.json' appended
out_dir = Path('../JSON/morphology')
# parents=True so the export also works when '../JSON' does not exist yet
out_dir.mkdir(parents=True, exist_ok=True)
for book_data in morph_data_plus:
    # book_data[0] is the book filename; the rest are the verse lists
    file_name = out_dir.joinpath(Path(book_data[0] + '.json'))
    file_data = book_data[1:]
    with open(file_name, 'w', encoding='UTF8') as outfile:
        # ensure_ascii=False keeps the Greek text readable in the output
        json.dump(file_data, outfile, ensure_ascii=False, indent=2)