# Generate Parallel Dataset

The clean dataset is a modernization of the CATSS Database, presented with minimal changes, encoded in UTF-8 and exported as JSON.

In [1]:
import re
from pathlib import Path
from greekutils import beta2unicode # do: pip install greek-utils==0.2
from pprint import pprint

data = Path('../source/patched')

In [2]:
# CCAT transcription to UTF8
# Greek to be handled by greekutils

# Hebrew
trans2utf8 = {
    ')': 'א',
    'B': 'ב',
    'G': 'ג',
    'D': 'ד',
    'H': 'ה',
    'W': 'ו',
    'Z': 'ז',
    'X': 'ח',
    '+': 'ט',
    'Y': 'י',
    'K': 'כ',
    'L': 'ל',
    'M': 'מ',
    'N': 'נ',
    'S': 'ס',
    '(': 'ע',
    'P': 'פ',
    'C': 'צ',
    'Q': 'ק',
    'R': 'ר',
    '&': 'שׂ',
    '$': 'שׁ',
    'T': 'ת',
    '-': '־',
    '\\': '',
    ' ': ' ',
}

def utf8_hebrew(string):
    """Convert transcribed Hebrew to UTF8.

    Every character of `string` is looked up in `trans2utf8`; characters
    that have no entry in the table are dropped from the result.

    NB: does not provide final letters (e.g. ם).

    Args:
        string: Hebrew text in CCAT transcription.

    Returns:
        The converted text as a str (empty for empty input).
    """
    # join once instead of growing the string with += on every character
    return ''.join(trans2utf8.get(char, '') for char in string)

Demonstrate the functions.

In [3]:
# Convert a transcribed Greek word to Unicode with greekutils' beta2unicode.
beta2unicode.convert('SUNAGWGH\\N')

'συναγωγὴν'

In [4]:
# The backslash is escaped explicitly: '\B' is not a valid Python escape
# sequence and triggers a warning on recent Python versions. The string
# value passed to utf8_hebrew is unchanged.
utf8_hebrew('HYTH $M B\\BYT-LXMM')

'היתה שׁמ בבית־לחממ'

## Process the files

We process the CATSS database files into JSON files.

The data structure is illustrated below:

```
[ # list of verses
    
    [ # list data for a verse
    
        'Gen 1:9',     # verse reference
        
        [ # three-list of text columns
            
            [ # Hebrew column A
                ('מקום', ''), # text entry + text critical notes
            ],
            
            [ # Hebrew column B
                ('מקוה', '?'), # text entry + text critical notes
            ], 
            
            [ # Greek column
                ('συναγωγὴν', ''), # text entry + text critical notes
            ],   
        ],
    ],
]
```

Each inner list holds a single verse, headed by a reference string. The 
second element in the list is a three-list of columns, and each column is
itself a list of two-tuples.

Each two-tuple represents one text entry in a column of the database, and
it consists of `(text, text-critical notes)`. Hebrew column B contains
retroverted readings and it is frequently empty.

Each column can contain more than one text entry. This covers cases where
notes apply to only part of a column. For instance, the database might
contain words wrapped in curly brackets `{}` with notations that are
separate from another word that is not contained in them. Each such part
becomes its own entry in the column.

In [18]:
# -- containers filled by the file-processing loop --
para_data = []      # one list per verse, headed by its reference string
test = []           # lines that were continued over more than one extra line
errors = []         # (file name, position, line) for lines that failed to merge
passages = set()

# -- regex patterns --
# verse reference line, e.g. 'Gen 1:3'
ref_string = re.compile(r'^[A-Za-z1-9/]+ \d+:?\d*$')
# column that starts with a non-space char and ends with '#' (plus optional spaces)
continued_column = re.compile(r'[^\s]+.*#\s*$')
# string that has at least one non-space char (content)
content = re.compile(r'.*[^\s].*')

def line_is_continued(col1, col2):
    """Report whether either column of a line is continued on the next line.

    A column counts as continued when `continued_column` matches it.
    Returns True if that holds for col1 or col2, otherwise False.
    """
    return any(continued_column.match(column) for column in (col1, col2))

def is_dataline(line):
    """Report whether a line holds data content.

    A data line has at least one non-space character and is not a verse
    reference string. Returns True for such lines, otherwise False.
    """
    if ref_string.match(line):
        return False
    return content.match(line) is not None

def get_continued_columns(lines, counter):
    """Yield the columns of the lines that continue the line at `counter`.

    Starting from lines[counter], the current line is split into its
    Hebrew and Greek columns. If either column carries a continuation
    marker (#), the next line is split on tabs and yielded, and that next
    line becomes the current line. The walk stops at the first line that
    is not a data line or that has no continuation marker.

    Each yielded item is the list returned by splitting a line on tabs.
    """
    index = counter
    while True:
        current = lines[index]
        if not is_dataline(current):
            return
        heb_part, grk_part = current.split('\t')
        if not line_is_continued(heb_part, grk_part):
            return
        # the continuation sits on the following line; it is examined
        # for its own marker on the next pass
        index += 1
        yield lines[index].split('\t')

# process files
for file in sorted(data.glob('*.par')):
    # Walk one file line by line. Verse reference lines open a new verse;
    # every other non-empty line is split into its Hebrew and Greek column
    # and merged with the lines that continue it.

    # read the file
    lines = file.read_text().split('\n')

    verse_data = []
    position = 0

    while position < len(lines):

        line = lines[position]

        # detect a new verse at verse reference string
        if ref_string.match(line):

            # store last verse, then start the new one with its reference
            if verse_data:
                para_data.append(verse_data)
            verse_data = [line]

        elif line:

            # extract the two columns; a line without exactly two
            # tab-separated columns is reported with its location
            try:
                heb_col, grk_col = line.split('\t')
            except ValueError as err:
                raise Exception(file.name, position, line) from err

            # reset for every line so that a failure below cannot leave
            # the continuation list of an earlier line in place
            cont_cols = []

            try:
                # collect parts of the columns continued on next line(s) in doc
                cont_cols = list(get_continued_columns(lines, position))

                for hb_cc, gk_cc in cont_cols:
                    position += 1  # skip the line that was merged
                    heb_col += hb_cc
                    grk_col += gk_cc

            except (ValueError, IndexError):
                # a continuation line without exactly two columns, or a
                # continuation marker on the last line of the file
                errors.append((file.name, position, line))

            # keep lines continued over more than one line for inspection
            if len(cont_cols) > 1:
                test.append((file.name, position, line, heb_col, grk_col))

        position += 1

    # the last verse of a file has no following reference line to
    # trigger its storage, so store it here
    if verse_data:
        para_data.append(verse_data)

In [19]:
# Number of collected lines that were continued over more than one extra line.
len(test)

6

In [17]:
# Pretty-print every collected case, separated by a blank line.
for record in test:
    pprint(record)
    print()

('08.JudgesB.par',
 452,
 'W/)T BNT/YH {..rL)}\tOU)DE\\ TA\\ PERI/OIKA AU)TH=S {d} #',
 'W/)T BNT/YH {..rL)}###',
 'OU)DE\\ TA\\ PERI/OIKA AU)TH=S {d} #OU)DE\\ TA\\S QUGATE/RAS AU)TH=S {d} '
 '#{...OU)DE\\ TA\\ PERI/OIKA AU)TH=S} {d} #{...OU)DE\\ TA\\S QUGATE/RAS '
 'AU)TH=S}')

('11.1Sam.par',
 6643,
 'W/HXRMTM\tKAI\\ IERIM {t} {d} {...KAI\\ #',
 'W/HXRMTM##',
 'KAI\\ IERIM {t} {d} {...KAI\\ #E)COLEQREU/SEIS AU)TO\\N {d} KAI\\ '
 '#A)NAQEMATIEI=S AU)TO\\N}')

('13.1Kings.par',
 2979,
 'H/MMLKWT ={d}H/MLKYM <5.4>\tTAI=S BASILEI/AIS [2.46b] {d} #',
 'H/MMLKWT ={d}H/MLKYM <5.4>##',
 'TAI=S BASILEI/AIS [2.46b] {d} #TOI=S BASILEU=SIN [2.46k] #TW=N BASILE/WN '
 '[10.26a]')

('17.1Esdras.par',
 532,
 'H/QYNWT =SPR DBRY ?H/YMYM\tTH=| BI/BLIW| TW=N I(STOROUME/NWN #',
 'H/QYNWT =SPR DBRY ?H/YMYM# ?L/MLKY :YHWDH#',
 'TH=| BI/BLIW| TW=N I(STOROUME/NWN #PERI\\ TW=N BASILE/WN TH=S I)OUDAI/AS '
 '#[cc35.25]')

('23.Prov.par',
 9159,
 'XRDT ={d}XRPT .dp\tFOBHQE/NTES {d} KAI\\ AI)SXUNQE/NTES #',
 'XRD

## Testing the Line-merger

Here is a prototype for collecting the next lines recursively.

In [11]:
# Read the Exodus file and keep a four-line slice as sample input.
exodus_text = Path('../source/patched/02.Exodus.par').read_text()
test = exodus_text.split('\n')

lines = test[16283:16287]

lines

['\t#',
 '--+\tE)N AU)TAI=S',
 'W/)T BGDY\tKAI\\ TOU\\S XITW=NAS',
 'BN/YW\tTOI=S UI(OI=S AARWN']

In [149]:

# Walk the sample lines and merge every line with its continuation lines.
i = 0
while i < len(lines):

    line = lines[i]

    # empty lines and verse reference strings carry no columns
    if line and not ref_string.match(line):

        print(line)
        heb_col, grk_col = line.split('\t')
        cont_lines = list(get_continued_columns(lines, i))
        for hb_cc, gk_cc in cont_lines:
            i += 1  # the continuation line is consumed here
            heb_col += hb_cc
            grk_col += gk_cc

        print('\theb:', heb_col)
        print('\tgrk:', grk_col)
        print()

    i += 1

W/B/QWM/H	KAI\ {..dE)N TW=|} A)NASTH=NAI #
	heb: W/B/QWM/H#
	grk: KAI\ {..dE)N TW=|} A)NASTH=NAI #{..dAU)TH\N}



In [129]:
# Show which columns of the sample lines carry a continuation marker.
cont_column = re.compile(r'[^\s]+.*#\s*$')

for line in lines:
    for col in line.split('\t'):
        if not cont_column.match(col):
            continue
        print(col, 'match')

KAI\ {..dE)N TW=|} A)NASTH=NAI # match


In [151]:
# The Hebrew column of this sample line ends with '#', so it counts as continued.
line_is_continued(*'BR)Y W/B/$(RYM =:BR)WM$(RYM #\tBAROUMSEWRIM {t}'.split('\t'))

True