# Generate Parallel Dataset

The clean dataset is a modernization of the CATSS Database, presented with minimal changes, in UTF8, exported as JSON.

In [1]:
import re
from pathlib import Path
from greekutils import beta2unicode # do: pip install greek-utils==0.2

# directory holding the patched CATSS source files read by this notebook
data = Path('../source/patched')

In [2]:
# CCAT transcription to UTF8
# Greek to be handled by greekutils

# Hebrew
trans2utf8 = {
    ')': 'א',
    'B': 'ב',
    'G': 'ג',
    'D': 'ד',
    'H': 'ה',
    'W': 'ו',
    'Z': 'ז',
    'X': 'ח',
    '+': 'ט',
    'Y': 'י',
    'K': 'כ',
    'L': 'ל',
    'M': 'מ',
    'N': 'נ',
    'S': 'ס',
    '(': 'ע',
    'P': 'פ',
    'C': 'צ',
    'Q': 'ק',
    'R': 'ר',
    '&': 'שׂ',
    '$': 'שׁ',
    'T': 'ת',
    '-': '־',
    '\\': '',  # backslashes are removed from the output
    ' ': ' ',  # spaces are kept as they are
}


def utf8_hebrew(string):
    """Convert transcribed Hebrew to UTF8

    Every character of `string` is looked up in `trans2utf8`. Characters
    that have no entry in the table are dropped from the result, and so
    is the backslash, which the table maps to the empty string.

    NB: does not provide final letters (e.g. ם).
    """
    # join once instead of growing a string with += inside a loop
    return ''.join(trans2utf8.get(c, '') for c in string)

Demonstrate the functions.

In [3]:
# the doubled backslash is Python's spelling of one literal backslash in the input
beta2unicode.convert('SUNAGWGH\\N')

'συναγωγὴν'

In [4]:
# raw string: the backslash belongs to the transcription and is not a Python escape
utf8_hebrew(r'HYTH $M B\BYT-LXMM')

'היתה שׁמ בבית־לחממ'

## Process the files

We process the CATSS database files into JSONs. 

The data structure is illustrated below:

```
[ # list of verses
    
    [ # list data for a verse
    
        'Gen 1:9',     # verse reference
        
        [ # three-list of text columns
            
            [ # Hebrew column A
                ('מקום', ''), # text entry + text critical notes
            ],
            
            [ # Hebrew column B
                ('מקוה', '?'), # text entry + text critical notes
            ], 
            
            [ # Greek column
                ('συναγωγὴν', ''), # text entry + text critical notes
            ],   
        ],
    ],
]
```

Each inner list holds a single verse, headed by a reference string. The 
second element in the list is a three-list of columns: Hebrew column A,
Hebrew column B, and the Greek column.

Each column is a list of two-tuples, and each two-tuple is one entry 
of the form `(text, text-critical notes)`. Hebrew column B contains retroverted
readings and it is frequently empty.

Each column can contain multiple text entries, for cases where there are 
separate notes per column. For instance, the database might contain 
words wrapped in curly brackets `{}` with notations that are separate
from another word that is not contained in them. Thus, each column
can contain more than 1 entry.

In [10]:
# collected verses from all files; each verse is a list headed by its reference line
para_data = []

# regex patterns
# a reference line: a token of letters, digits 1-9 or '/', a space, digits,
# then an optional ':' and optional further digits
ref_string = re.compile(r'^[A-Za-z1-9/]+ \d+:?\d*$')

test = []
passages = set()

# process files
for file in sorted(data.glob('*.par')):

    # read the file
    lines = file.read_text().split('\n')

    verse_data = []
    i = 0

    # an explicit index is kept instead of a for-loop
    # (presumably so that the position can later be advanced past merged lines)
    while i < len(lines):

        line = lines[i]

        # detect a new verse
        if ref_string.match(line):

            # store last verse, make space for new one, store new one
            if verse_data:
                para_data.append(verse_data)
                verse_data = []
            verse_data.append(line)

        # a data line: Hebrew and Greek columns are separated by exactly one tab
        elif line:
            heb_col, grk_col = line.split('\t')

        # anything else is an empty line; nothing to do

        # update position
        i += 1

    # the final verse of a file has no following reference line to trigger
    # the store above, so flush it here; otherwise it would be lost
    if verse_data:
        para_data.append(verse_data)

Testing the line merger.

## Things to look for: How many cases of bad # endings can I find, if any?

Here is a prototype for collecting the next lines recursively.

In [44]:
# load one book of the database and split it into lines
test = Path('../source/patched/11.1Sam.par').read_text().split('\n')

# three-line sample in which an entry is continued over the following lines
lines = test[6641:6644]

lines

['W/HXRMTM\tKAI\\ IERIM {t} {d} {...KAI\\ #',
 '#\tE)COLEQREU/SEIS AU)TO\\N {d} KAI\\ #',
 '#\tA)NAQEMATIEI=S AU)TO\\N}']

In [54]:
def get_next(lines, counter):
    """Recursively retrieve the next line for lines ending in #

    Args:
        lines: list of line strings.
        counter: index into `lines` of the line to inspect.

    Yields:
        The line following `lines[counter]` when that line ends in '#'
        (trailing whitespace is ignored), then the line after that if the
        yielded line ends in '#' as well, and so on. Nothing is yielded
        when the inspected line does not end in '#' or when it is the
        last line.
    """
    line = lines[counter]
    # Anchor the test at the end of the line: continuation lines also
    # START with '#', and that must not count as a request for more.
    if re.search(r'#\s*$', line):
        counter += 1
        if counter < len(lines):
            yield lines[counter]
            yield from get_next(lines, counter)
        
# Walk over the sample: every line is handled together with the
# continuation lines that get_next() reports for it.
i = 0
while i < len(lines):

    line = lines[i]
    heb_col, greek_col = line.split('\t')

    # collect the continuation lines and show them
    next_lines = list(get_next(lines, i))
    print(next_lines)

    # step over the current line plus everything just collected
    i += 1 + len(next_lines)

['#\tE)COLEQREU/SEIS AU)TO\\N {d} KAI\\ #', '#\tA)NAQEMATIEI=S AU)TO\\N}']
