# Africa Collation test

## Test data

I have added the exact folios for Bod47. For G1 and O1, I have added the first folio, followed by a random folio for test purposes.

### Format for folio
A line of the form "FOLIO xx yyy", where xx is the folio/page number and yyy is the URL. If either one is missing, reading the file will raise an error.

### Format for linebreak
Nothing to add: Python detects the end of each line when it reads the file.


## Conventions in JSON

Which properties should be added, and what should they be called? Here I have folio_no, folio_url and linebreak (stored as `endline` in the code).

Folio and line can each be either a property only, or both a token and a property, so there are four options:
- folio (number, url) and lines (endline true/false) only property of all tokens
- folio token, line property
- folio property, line token
- folio/line both tokens

### normalisation
I have only added lowercase normalisation. We could also add rules for punctuation marks.

In [2]:
import json
import os


# List of paths to the witness text files; each file becomes one witness,
# and its file name (without extension) is used as the witness id.
# TODO: offer a choice between explicit paths and all files in a directory
textfiles = ["data/Bod47.txt", "data/O1.txt", "data/G1.txt"]

# converts plain text to json input for CollateX
# folio/linebreak can be either property only (False, by default) 
# or token and property (True)
def collatex_text_to_json(textfiles, token_folio=False, token_linebreak=False):
    """Convert plain-text witness files into a JSON-style input for CollateX.

    Each file becomes one witness whose id (siglum) is the file name without
    its extension. A line starting with ``FOLIO`` must have the form
    ``FOLIO xx yyy`` (folio number, then url); every other line is split into
    words at spaces, and each word becomes a token carrying:

    - ``t``: the word as written
    - ``n``: the normalised (lowercased) word
    - ``folio_no`` / ``folio_url``: the folio the word belongs to
    - ``endline``: True for the last word of its line
    - ``newfolio``: True for the first word after a folio line

    Args:
        textfiles: iterable of paths to UTF-8 text files.
        token_folio: if True, each folio line also yields a token of its own
            (in addition to the folio properties on the word tokens).
        token_linebreak: if True, a ``LINE`` token is appended after every
            text line (in addition to the ``endline`` property).

    Returns:
        A dict ``{'witnesses': [{'id': ..., 'tokens': [...]}, ...]}``.

    Raises:
        ValueError: if a folio line does not have exactly three fields.
    """
    listwit = []

    for path in textfiles:

        # witness identifier (siglum): file name without its extension
        witness_id = os.path.splitext(os.path.basename(path))[0]
        tokens = []

        # folio state, carried over from line to line
        folio_no = ''
        folio_url = ''
        newfolio = False

        with open(path, 'r', encoding='utf-8') as file:
            for line in file:

                if line.startswith('FOLIO'):
                    # split on any whitespace so that the url does not keep
                    # the trailing newline and extra spaces are tolerated
                    fields = line.split()
                    if len(fields) != 3:
                        raise ValueError(
                            'malformed folio line in %s: expected '
                            '"FOLIO xx yyy", got %r' % (path, line)
                        )
                    marker, folio_no, folio_url = fields
                    newfolio = True

                    # if the folio should be a token
                    if token_folio:
                        tokens.append({
                            't': marker,
                            'n': marker,
                            'folio_no': folio_no,
                            'folio_url': folio_url,
                            'newfolio': newfolio,
                        })
                    continue

                # separate words at spaces; drop the empty strings produced
                # by doubled/trailing spaces and by blank lines
                words = [w for w in line.rstrip('\n').split(' ') if w]
                last = len(words) - 1

                for index, word in enumerate(words):
                    tokens.append({
                        't': word,
                        # normalisation: lowercase only (punctuation rules
                        # could be added here)
                        'n': word.lower(),
                        'folio_no': folio_no,
                        'folio_url': folio_url,
                        # the last word of a line ends it, whether or not the
                        # file itself ends with a newline character
                        'endline': index == last,
                        'newfolio': newfolio,
                    })
                    # only the first word after a folio line starts the folio
                    newfolio = False

                # if endline is a token
                if token_linebreak:
                    tokens.append({
                        't': 'LINE',
                        'n': '\n',
                        'folio_no': folio_no,
                        'folio_url': folio_url,
                        'endline': True,
                        # a line-break token never opens a folio
                        'newfolio': False,
                    })

        # add to witnesses list
        listwit.append({'id': witness_id, 'tokens': tokens})

    # wrap the witnesses in the top-level object
    return {'witnesses': listwit}

In [3]:
# first test: folio/lines are only properties of the word tokens
# (both flags keep their default value, False)
test_json_input_1 = collatex_text_to_json(textfiles)
#print(test_json_input_1)

# second test: both folio/lines are tokens (as well as properties)
test_json_input_2 = collatex_text_to_json(textfiles, token_folio=True, token_linebreak=True)

In [4]:
# Collate both test inputs with CollateX, asking for JSON output.
# NOTE(review): segmentation=False presumably keeps one token per table cell;
# the HTML rendering further down assumes exactly that — confirm against the
# CollateX documentation.
from collatex import *

result1 = collate(test_json_input_1, output='json', layout='vertical', segmentation=False)
result2 = collate(test_json_input_2, output='json', layout='vertical', segmentation=False)

In [7]:
# special html table for Africa example, both folio/line token
#
# Builds `html_table`, an HTML string with one column per witness and one row
# per aligned position of `result2` (the JSON string returned by collate):
# - FOLIO rows show a link "f.<folio_no>" to the folio url,
# - LINE rows are rendered as an empty gold band,
# - all other rows show the token text, highlighted when witnesses disagree.
import html

# parse the CollateX output once and reuse it
collation = json.loads(result2)
witnesses = collation['witnesses']
witrange = range(len(witnesses))

# background colour of rows where the witnesses differ
variant_color = '#FF7F7F'

# collect the fragments in a list and join them once at the end
parts = ['<table style="border: 1px solid #000000; border-collapse: collapse;" cellpadding="4" border="1"><tr>']
for wit in witnesses:
    parts.append('<th>' + html.escape(wit) + '</th>')
parts.append('</tr>')

# the table is stored witness by witness; zip(*...) transposes it into rows
for row in zip(*collation['table']):

    # only the first token of each cell is inspected
    # (we assume one token per cell because of segmentation=False)
    first_tokens = [None if cell is None else cell[0] for cell in row]
    readings = [None if tok is None else tok['t'] for tok in first_tokens]

    parts.append('<tr>')

    # folio lines
    if 'FOLIO' in readings:
        for tok in first_tokens:
            if tok is None:
                parts.append('<td style=""></td>')
            else:
                # strip stray whitespace and escape before building the link
                url = html.escape(tok['folio_url'].strip(), quote=True)
                label = html.escape(tok['folio_no'])
                parts.append(
                    '<td style="border-top:5px double;">'
                    '<a href="' + url + '">f.' + label + '</a></td>'
                )

    # endlines
    elif 'LINE' in readings:
        for _ in row:
            parts.append('<td style="background-color:gold;"></td>')

    # others
    else:
        # an empty cell, or more than one distinct reading, is a variation
        if None in readings or len(set(readings)) > 1:
            bgcolor = variant_color
        else:
            bgcolor = ''
        for reading in readings:
            text = '-' if reading is None else html.escape(reading)
            parts.append('<td bgcolor="' + bgcolor + '">' + text + '</td>')

    # close the row whatever its kind
    parts.append('</tr>')

parts.append('</table>')
html_table = ''.join(parts)



In [8]:
# Render the table built above as rich HTML output in the notebook.
# HTML is imported from the public IPython.display module rather than the
# internal IPython.core.display.
from IPython.display import HTML
HTML(html_table)

Bod47,O1,G1
f.1r,f.1r,f.1r
-,-,fletus
Passeris,-,passeris
appelatio,-,lesbie
,,
Passer,Passer,Passer
delitiae,delicie,delicie
meae,mee,mee
puellae,puelle,puelle
,,


In [284]:
# conversion of JSON for compatibility with pycoviz (remove FOLIO/LINE rows?)