# Generate Parallel Dataset

The clean dataset is a modernization of the CATSS Database, presented with minimal changes, in UTF8, exported as JSON.

In [1]:
import sys
import regex 
import collections
import pandas as pd
from pathlib import Path
from greekutils import beta2unicode # do: pip install greek-utils==0.2
from pprint import pprint

sys.path.append('../')
import regex_patterns as repatts

data = Path('../source/patched')

In [2]:
# CCAT transcription to UTF8
# Greek to be handled by greekutils

# Hebrew
# lookup table from one CCAT transcription character to its Hebrew UTF8 string;
# utf8_hebrew drops every character that has no entry here
trans2utf8 = {
    ')': 'א',
    'B': 'ב',
    'G': 'ג',
    'D': 'ד',
    'H': 'ה',
    'W': 'ו',
    'Z': 'ז',
    'X': 'ח',
    '+': 'ט',
    'Y': 'י',
    'K': 'כ',
    'L': 'ל',
    'M': 'מ',
    'N': 'נ',
    'S': 'ס',
    '(': 'ע',
    'P': 'פ',
    'C': 'צ',
    'Q': 'ק',
    'R': 'ר',
    '&': 'שׂ',  # sin
    '$': 'שׁ',  # shin
    'T': 'ת',
    '-': '־',  # hyphen becomes the Hebrew word joiner
    '\\': '',  # backslash is removed from the output
    ' ': ' ',
}

# a letter takes its final form at the end of a word: before whitespace,
# before a maqaf (the joiner that `-` is converted to) or at end of string
hfletter = r'{}(?=[\s־]|$)'
final_heb = (
    (hfletter.format('\u05DE'), 'ם'),  # mem
    (hfletter.format('\u05DB'), 'ך'),  # kaf
    (hfletter.format('\u05E0'), 'ן'),  # nun
    (hfletter.format('\u05E4'), 'ף'),  # pe
    (hfletter.format('\u05E6'), 'ץ'),  # tsade
)
# compile once; sub_final_hb applies each (pattern, replacement) pair in turn
final_heb = [(regex.compile(patt), repl) for patt, repl in final_heb]

def sub_final_hb(string):
    """Return the string with word-final Hebrew letters in their final form.

    Every (pattern, replacement) pair of `final_heb` is applied in order.
    """
    result = string
    for pattern, final_form in final_heb:
        result = pattern.sub(final_form, result)
    return result

def utf8_hebrew(string):
    """Convert transcribed Hebrew to UTF8.

    Each character is looked up in `trans2utf8`; characters without an
    entry are dropped. Final letter forms (e.g. ם) are then substituted
    by `sub_final_hb`.
    """
    converted = ''.join(trans2utf8.get(char, '') for char in string)
    return sub_final_hb(converted)

In [3]:
# sanity check: convert a transcribed Greek word with greekutils
beta2unicode.convert('SUNAGWGH\\N')

'συναγωγὴν'

In [4]:
# sanity check of the Hebrew conversion; the raw string keeps the literal
# backslash without relying on an invalid `\B` escape sequence
utf8_hebrew(r'HYTH $M B\BYT-LXMM')

'היתה שׁם בבית־לחמם'

## Process the files

We process the CATSS database files into JSONs. 

The datastructure is illustrated below:

**NB: hypothetical data for illustration purposes**

```
[ # list of verses
    
    [ # list data for a verse
    
        'Gen 1:9',     # verse reference
        
        [ # three-list of text columns
            
            [ # Hebrew column A
                ('מקום', {'retcon'}), # text entry + text critical notes
            ],
            
            [ # Hebrew column B
                ('מקוה', {'?'}), # text entry + text critical notes
            ], 
            
            [ # Greek column
                ('συναγωγὴν', {''}), # text entry + text critical notes
            ],   
        ],
    ],
]
```

Note that some captured contexts may be empty. In this case, the notes are added to empty strings.

Each list consists of a single verse, headed by a reference string. The 
second element in the list is a three-list of columns, each of which is
itself a list of two-tuples.

Each two-tuple represents one text entry within a column of the database,
and consists of `(text, text-critical notes)`. Hebrew column B contains
retroverted readings and is frequently empty.

Each column can contain multiple text entries, for cases where there are 
separate notes per column. For instance, the database might contain 
words wrapped in curly brackets `{}` with notations that are separate
from another word that is not contained in them. Thus, each column
can contain more than 1 entry.

## Parsing Strategy

Each line contains 2-3 columns of data. Those columns contain original language text and markup.

We will draw a strong distinction between text and markup, and seek to separate the two. 

To do this, we first run a pattern search for markup patterns. The markup patterns divide into
several subsets, depending on their behavior:

* substitutions - represents text elements that have been transported elsewhere (contains no text elements)
* context-based - markup that applies within a context of a stretch of text
* capturers - markup that captures text within its vicinity based on capture groups

If all markup patterns fail to match a string, the algorithm will then run the regex patterns for matching 
original language text to recognize it as text.

A "context" is an important concept for the parser. The highest level context is a given column
(for instance, Hebrew column A). Contexts can then be split into smaller pieces based on markup
capture groups. For instance, the following markup: `{...MQVH <1:23>}` captures everything in
between the brackets. Everything in the brackets then becomes a new, smaller context. The pattern
matches are run again, recursively, against this context to recognize the text and the additional
markup. That markup, if it is of the context-based type, will then only be applied to this word.

In [5]:
# compile the patterns for matching

# original language text
hchars = repatts.hchars
gchars = repatts.gchars

# text is any run of the language's characters and spaces
hb_patt = regex.compile(f'[{hchars} ]+')
grk_patt = regex.compile(f'[{gchars} ]+')

# markup text
def comp_patt(pattern):
    """Return the pattern tuple with its first item compiled into a regex."""
    compiled = regex.compile(pattern[0])
    return (compiled,) + pattern[1:]

# the common patterns come first in both language-specific lists
common_tc_patts = list(map(comp_patt, repatts.common_tc))
hb_tc_patts = common_tc_patts + list(map(comp_patt, repatts.heb_tc))
gk_tc_patts = common_tc_patts + list(map(comp_patt, repatts.greek_tc))

In [7]:
# Regex Testing Cell

# test_re = regex.compile(r"=?\{d\}\??\s*([^\s]*)")
# test_str = "K/DMWT/NW {d} {...MLK %p}"
# position = 10
# match = test_re.match(test_str, position)
# match

In [44]:
def parse_context(context, markup_patts, text_patt, position=0, column_list=None, markups=None, ident='', debug=None):
    """Parse a context of text and markup into (text, markup tags) pairs.

    At every position the markup patterns are tried in order and the first
    one that matches is processed according to its kind:

    * 'con' - the tag is added to the tags of the whole context
    * 'cap' - the captured group `indices['txt']` is parsed recursively as
      a sub-context that starts out with the tag
    * 'sub' - an empty text entry carrying the tag is stored

    If no markup pattern matches, `text_patt` is tried and its match is
    stored as a text entry. When parsing is done, the tags of the context
    are added to every entry found in it.

    Args:
        context: the string to parse (an empty or None context yields no entries).
        markup_patts: iterable of `(pattern, kind, tag, desc, indices)`
            tuples; `pattern` is compiled, `tag` may contain format fields
            and `indices` maps field names to capture group indexes.
        text_patt: compiled pattern that matches original language text.
        position: index in `context` at which parsing starts.
        column_list: list the entries are appended to; a new list is made
            when omitted.
        markups: set of tags applied to every entry of this context; tags
            of 'con' matches are added to it. A new set is made when omitted.
        ident: prefix for the debug messages of this context.
        debug: list that collects debug messages; a new list is made
            when omitted.

    Returns:
        `column_list`, extended with `(text, set of tags)` tuples.

    Raises:
        Exception: for a pattern with an unknown kind, for a markup match
            that consumes no characters, and for text that neither a
            markup pattern nor `text_patt` matches.
    """

    # build fresh containers per call: default argument values are created
    # only once, so mutable defaults would leak entries, tags and messages
    # from one call into the next
    if column_list is None:
        column_list = []
    if markups is None:
        markups = set()
    if debug is None:
        debug = []

    elements = []

    def report(*messages):
        debug.extend(ident+m for m in messages)

    report(f'analyzing context: {context}')

    while context and (position < len(context)):

        matched = False # track matches in the loop

        for patt, kind, tag, desc, indices in markup_patts:

            match = patt.match(context, position)
            if not match:
                continue

            # a match that consumes nothing would never advance the position
            # and the loop would not terminate
            if match.end() == position:
                raise Exception(f'PATTERN ERROR for {patt}: ZERO-WIDTH MATCH AT POSITION {position}: {context}')

            groups = match.groups()

            # run any optional formats on tag
            tag = tag.format(**{k: (groups[i] or '') for k, i in indices.items()})

            # tag markup within the context
            if kind == 'con':
                report(f'  markup pattern match: {patt.pattern}', f'    match: {match.group(0)}')
                markups.add(tag)

            # run parser recursively for captured sub-contexts
            elif kind == 'cap':
                report(f'  markup pattern match: {patt.pattern}')
                elements.extend(
                    parse_context(
                        groups[indices['txt']], markup_patts, text_patt,
                        markups={tag}, column_list=[],
                        ident=ident+'    ', debug=debug,
                    )
                )

            # deal with substitution markups: they hold no text of their own
            elif kind == 'sub':
                elements.append(('', {tag}))

            else:
                raise Exception(f'PATTERN ERROR for {patt}: NO KIND')

            # advance the position
            position = match.end()
            matched = True
            break

        if not matched:

            # process original language text
            text_match = text_patt.match(context, position)
            if text_match and text_match.end() > position:
                elements.append((text_match.group(), set()))
                position = text_match.end()
                report(f'\ttext match: {text_match.group(0)}')

            # no match found, raise a syntax error
            else:
                error = f'SYNTAX ERROR AT POSITION {position} `{context[position]}` i.e. `{context[position-1:position+2]}`: {context}'
                report(error)
                raise Exception(error)

    # we're done
    # apply contextual markup to all elements
    # and return the goods
    for element, markup_set in elements:
        markup_set |= markups
        column_list.append((element, markup_set))

    return column_list

In [45]:
# run the parser on a sample context with the Hebrew patterns;
# `debug` collects the trace messages of the run
test_context = "K/DMWT/NW {d} {...MLK %p}"
debug = []
parse_context(test_context, hb_tc_patts, hb_patt, column_list=[], debug=debug)

[('K/DMWT/NW', {'<doub'}), ('MLK ', {'>doub', 'prep', 'trans'})]

In [46]:
# show the collected parser trace, one message per line
print('\n'.join(debug))

analyzing context: K/DMWT/NW {d} {...MLK %p}
  markup pattern match: ({.*?}|[^\s]+)\s*\??(?=\{d\})
    analyzing context: K/DMWT/NW
    	text match: K/DMWT/NW
  markup pattern match: =?\{d\}\??\s*({.*?}|[^\s]*)
    analyzing context: {...MLK %p}
      markup pattern match: {\.\.\.(.+?)}
        analyzing context: MLK %p
        	text match: MLK 
          markup pattern match: =?%p([-+])?
            match: %p


<hr>

In [5]:
# verse lists are appended here by the file-processing loop
para_data = []

# -- regex patterns --
# both patterns are applied with .match(), i.e. anchored at the start of the string
continued_column = regex.compile(r'[^\s]+.*#\s*$') # '#' at end of col preceded by some non-space char
content = regex.compile(r'.*[^\s].*') # string has some non-space char (content)

def line_is_continued(col1, col2):
    """Tell whether either column ends in a continuation marker (#)."""
    return any(continued_column.match(col) is not None for col in (col1, col2))

def is_dataline(line):
    """Tell whether a line holds data: it has content and is no verse reference."""
    has_content = content.match(line)
    is_reference = repatts.ref_string.match(line)
    return bool(has_content) and not is_reference

def get_continued_columns(lines, counter):
    """Yield the columns of the lines that continue the data-line at `counter`.

    A data-line is continued on the next line when one of its columns is
    marked with a continuation marker (#). The next line is then split
    into its columns and yielded, and is in turn checked for continuation
    markers. This goes on until a line without continuation marker (or a
    line that is not a data-line) is reached.

    Args:
        lines: list of the lines of the document.
        counter: index of the line to start from; the line itself is not
            yielded.

    Yields:
        The list of column strings of each continuation line, i.e. the
        line split at tabs.
    """
    # iterate rather than recurse, so that long runs of continued lines
    # cannot exhaust the recursion limit
    while is_dataline(lines[counter]):
        heb_col, grk_col = lines[counter].split('\t')
        if not line_is_continued(heb_col, grk_col):
            return
        counter += 1
        # a continuation marker on the last line: there is nothing to retrieve
        if counter >= len(lines):
            return
        yield lines[counter].split('\t')

def transcribe_hebrew(string):
    """Transcribe a string of Hebrew column text from the parallel database.

    Placeholder: the body is not implemented yet and the function returns None.
    """
    pass

def transcribe_greek(string):
    """Transcribe a string of Greek column text from the parallel database.

    Placeholder: the body is not implemented yet and the function returns None.
    """
    pass

def clean_text(string, lang='hebrew'):
    """Clean string of text from the parallel database.

    Placeholder: the body is not implemented yet, `lang` is not used and
    the function returns None.
    """
    pass

# analysis data
test = []
errors = []

# tallies of sigla per key, and the example lines collected for each of them
tc_inventory = collections.defaultdict(collections.Counter)
tc_refs = collections.defaultdict(lambda: collections.defaultdict(list))

# sigla holding a number inside <...> or [...]
cross_refs = regex.compile(r'\<.*\d+.*?\>|\[.*\d+.*?\]')
# runs of digits
num_repl = regex.compile(r'\d+')

# process files
# Walk every .par file line by line: a verse reference string opens a new
# verse, any other non-empty line is a data-line of two tab-separated
# columns (Hebrew, Greek) whose sigla are tallied in the inventories.
for file in sorted(data.glob('*.par')):

    # read the file
    lines = file.read_text().split('\n')

    verse_data = []
    position = 0

    while position < len(lines):

        line = lines[position]

        # detect a new verse at verse reference string
        if repatts.ref_string.match(line):

            # store last verse, make space for new one, store new one
            if verse_data:
                para_data.append(verse_data)
                verse_data = []
            verse_data.append(line)

        elif line:

            # for debugging
            show_tuple = (file.name, position, verse_data[0], line)

            # extract the two columns
            heb_col, grk_col = line.split('\t')

            # NB: that for Sirach the Hebrew columns can sometimes
            # be split several ways since there are numerous Hebrew
            # sources, deriving from various manuscripts
            # the sources are indicated by a following number;
            # thus, it may be possible to split along stand-alone integers
            # to divide up the text

            # separate heb col a and b (optional)
            if '=' in heb_col:
                # TODO: **VERY IMPORTANT**:
                # column B can have multiple elements with `=` prepended
                # so instead of a split we should index and split to preserve
                # the first `=` sign; this should allow all discrete symbols
                # to be parsed alongside the `=` sign
                heb_colA, heb_colB = heb_col.split('=', 1)
            else:
                heb_colA = heb_col
                heb_colB = ''

            # compile inventory of TC values
            # NOTE(review): `hebrew_tc` and `greek_tc` are not defined anywhere in
            # this notebook, so this loop raises a NameError; they have to be
            # compiled sigla patterns (supporting `.findall`) - to be restored
            for lang, lang_tc, col in [('hb', hebrew_tc, heb_col), ('gk', greek_tc, grk_col)]:

                for sigla in lang_tc.findall(col):

                    # replace numbers by a literal `\d`, so that sigla that
                    # only differ in their numbers are counted together
                    no_num_sigla = num_repl.sub(r'\d', sigla)

                    # count the sigla; cross references get an inventory of their own
                    inventory = 'crossrefs' if cross_refs.findall(sigla) else lang
                    tc_inventory[inventory][no_num_sigla] += 1
                    tc_refs[inventory][no_num_sigla].append(show_tuple)

            # collect parts of the columns continued on next line(s) in doc
            # all of the continuation lines are retrieved
            cont_cols = list(get_continued_columns(lines, position))

            for hb_cc, gk_cc in cont_cols:
                position += 1 # continuation lines are consumed here
                heb_col += hb_cc
                grk_col += gk_cc

        # it's an empty line; move on
        else:
            pass

        position += 1

    # the last verse of a file is not followed by a reference string,
    # so it has to be stored once the file is exhausted
    if verse_data:
        para_data.append(verse_data)

NameError: name 'hebrew_tc' is not defined

In [44]:
# report the number of distinct notes held by each inventory
for lang, values in tc_inventory.items():
    print(f'{lang} has', len(values), 'different tc notes')

gk has 114 different tc notes
hb has 584 different tc notes
crossrefs has 281 different tc notes


In [49]:
# export spreadsheets to investigate all sigla

# one sheet per language: sigla, frequency, first example and further examples
for lang in ('hb', 'gk'):
    inv_data = []
    for sigla, count in tc_inventory[lang].items():
        examples = tc_refs[lang][sigla]
        first = examples[0]
        # the first example is shown as file.position followed by the line
        first_shown = f'{first[0]}.{first[1]} {first[-1]}'
        # of the next 11 examples only item 2 of the stored tuple is listed
        others_shown = '; '.join(example[2] for example in examples[1:12])
        # equal signs are prefixed with a quote in the exported sigla
        inv_data.append((sigla.replace('=', "'="), count, first_shown, others_shown))
    inv_df = pd.DataFrame(inv_data, columns=['sigla', 'freq', 'ex', 'other exs'])
    inv_df.to_csv(f'{lang}_tc_inventory.tsv', sep='\t')

In [41]:
# inspect the examples stored for one sigla of the Hebrew inventory
show = tc_refs['hb']['I']
#show = test

# print at most the first 100 examples, separated by blank lines
for case in show[:100]:
    pprint(case)
    print()

('05.Deut.par', 11172, 'Deut 24:20', "--+ '' =;KI <24.22>\tO(/TI")

('17.1Esdras.par', 6514, '1Esdr 9:32', ')L(ZR =:)LIW(NY\tE)LIWNA=S [e10.31]')



In [42]:
# Hebrew sigla counts as a one-column frame, most frequent first
# NOTE(review): writes the same path as the sigla spreadsheet export loop
hb_counts = pd.DataFrame.from_dict(tc_inventory['hb'], orient='index')
hb_inventory = hb_counts.sort_values(by=0, ascending=False)

hb_inventory.to_csv('hb_tc_inventory.tsv', sep='\t')

In [43]:
# Greek sigla counts as a one-column frame, most frequent first
# NOTE(review): writes the same path as the sigla spreadsheet export loop
gk_counts = pd.DataFrame.from_dict(tc_inventory['gk'], orient='index')
gk_inventory = gk_counts.sort_values(by=0, ascending=False)

gk_inventory.to_csv('gk_tc_inventory.tsv', sep='\t')

In [16]:
# the 150 most frequent sigla counted for the Hebrew column
tc_inventory['hb'].most_common(150)

[('--+', 20102),
 (',,a', 10635),
 ('\\d', 9372),
 ("''", 8722),
 ('=', 8675),
 ('{...}', 6263),
 ('[..]', 4531),
 ('=;', 3740),
 ('^', 3719),
 ('*', 2903),
 ('}', 2175),
 ('=:', 1825),
 ('=@', 1525),
 ('**', 1493),
 ('=v', 1298),
 ('{...', 1235),
 ('=?', 935),
 ('<sp>', 893),
 ('^^^', 868),
 ('=%p', 776),
 ('=%vap', 495),
 ('{\\d}', 487),
 ('=??', 485),
 ('{x}', 464),
 ('=%p+', 419),
 ('{d}', 395),
 (']', 389),
 ('=%p-', 377),
 ('{..r', 366),
 ('{!}p', 344),
 ('.m', 333),
 ('=%vpa', 328),
 ('{!}nd', 327),
 ('{**}', 290),
 ('{...?', 283),
 ('.', 280),
 ('[', 273),
 ('.dr', 270),
 ('.rd', 234),
 ('=@?', 219),
 ('=+', 211),
 ('{..^', 207),
 ('?', 195),
 ('#', 179),
 ('{*}', 174),
 (',', 172),
 ('.yw', 147),
 ('.wy', 135),
 ('=vs', 128),
 ('<<', 124),
 ('>>', 115),
 ('={d}', 113),
 ('a', 103),
 ('>\\d', 92),
 ('=?@', 78),
 ('.j', 78),
 ('.s', 75),
 ('*[', 75),
 ('<sp^>', 68),
 ('{!}-', 66),
 ('---', 65),
 ('.nm', 61),
 ('=r', 61),
 ("''=", 61),
 ('=:?', 56),
 ('.mn', 55),
 ('<', 53),
 ('-

In [17]:
# the 125 most frequent sigla counted for the Greek column
tc_inventory['gk'].most_common(125)

[('---', 17561),
 ("''", 7784),
 ('}', 7349),
 ("'", 4272),
 ('^', 3717),
 ('{..^', 2809),
 ('{...', 2060),
 ('{..p', 1584),
 ('{---%}', 1346),
 ('{d}', 1028),
 ('{t}', 935),
 ('^^^', 867),
 ('{...}', 744),
 ('{..d', 588),
 ('{x}', 562),
 ('?', 477),
 ('#', 358),
 ('{p}', 289),
 ('{t?}', 288),
 ('{', 189),
 ('{d?}', 165),
 ('{...?', 137),
 ("'}", 71),
 ('{c', 54),
 ('{s}', 47),
 ('?}', 28),
 ('{..?', 28),
 ('[\\d.\\d', 22),
 ('--', 20),
 ('*', 19),
 ('{..d?', 15),
 ('---?', 15),
 ("---''", 14),
 ('{..^?', 14),
 ('-', 13),
 ('s', 13),
 ('~', 12),
 ('{..', 11),
 ('{d}{...', 9),
 ('----', 8),
 ('[e\\d.\\d', 8),
 ('{...^', 7),
 ('{?d}', 7),
 ('{c?', 5),
 ('{..r', 5),
 ('}?', 5),
 ('{d}?', 5),
 ('{z}', 5),
 ('{c}', 3),
 ('{d?}{...', 3),
 ('[cc\\d.\\d', 3),
 ('{..~', 3),
 ('{..p?', 2),
 ('{..^.', 2),
 ('{...d', 2),
 ('{...p', 2),
 ('[c', 2),
 (']', 2),
 ('{g', 2),
 ('}}', 1),
 ('{....}', 1),
 ('?^', 1),
 ('{t.}', 1),
 ('{....', 1),
 ('<t?>', 1),
 ('{pm}', 1),
 ('\\d}', 1),
 ('<fm', 1),
 ('--

## Testing the Line-merger

Here is a prototype for collecting the next lines recursively.

In [67]:
# load one file as a list of lines
test = Path('../source/patched/17.1Esdras.par').read_text().split('\n')

# NOTE(review): this selects a single line (a string, as the output shows),
# whereas the loop in the next cell indexes `lines` line by line -
# presumably a slice of `test` was intended; confirm
lines = test[3067]

lines

'*$MLY **$LMY\t*SUBAI+/ [e2.46]'

In [149]:

# walk the lines and merge every data-line with its continuation lines
i = 0
while i < len(lines):

    line = lines[i]

    # skip empty lines and verse reference strings; the reference pattern
    # lives in the `repatts` module, it is not a name of this notebook
    if not line or repatts.ref_string.match(line):
        i += 1
        continue

    print(line)
    heb_col, grk_col = line.split('\t')
    cont_lines = list(get_continued_columns(lines, i))
    #print(cont_lines)
    for hb_cc, gk_cc in cont_lines:
        i += 1 # advance position in doc
        heb_col += hb_cc
        grk_col += gk_cc

    print('\theb:', heb_col)
    print('\tgrk:', grk_col)
    print()

    i += 1

W/B/QWM/H	KAI\ {..dE)N TW=|} A)NASTH=NAI #
	heb: W/B/QWM/H#
	grk: KAI\ {..dE)N TW=|} A)NASTH=NAI #{..dAU)TH\N}



In [129]:
# same pattern as `continued_column`; compiled with `regex`, the module this
# notebook imports (`re` is never imported here)
cont_column = regex.compile(r'[^\s]+.*#\s*$')

# print every column that is marked as continued
for line in lines:
    for col in line.split('\t'):
        if cont_column.match(col):
            print(col, 'match')

KAI\ {..dE)N TW=|} A)NASTH=NAI # match


In [151]:
# check the continuation test on a sample line whose first column ends in '#'
line_is_continued(*'BR)Y W/B/$(RYM =:BR)WM$(RYM #\tBAROUMSEWRIM {t}'.split('\t'))

True