### Add a property to the token for normalized forms to be used not in the alignment, but for interpretation in the analysis stage and then in the visualization.

## 0. No normalization

In [None]:
from collatex import *
collation = Collation()
W1 = open( "data/example4/W1.txt", encoding='utf-8' ).read()
W2 = open( "data/example4/W2.txt", encoding='utf-8' ).read()
W3 = open( "data/example4/W3.txt", encoding='utf-8' ).read()
W4 = open( "data/example4/W4.txt", encoding='utf-8' ).read()
collation.add_plain_witness( "W1", W1 )
collation.add_plain_witness( "W2", W2 )
collation.add_plain_witness( "W3", W3 )
collation.add_plain_witness( "W4", W4 )
table = collate(collation, output='html2', segmentation=False)
print(table)


## 1. Dictionary

This method requires the manual creation of a dictionary. In this example, the dictionary is built with **three columns**: the first for the **original form**, the second for the **normalized form** to be used during the **alignment**, the third for the **normalized form** to be used in the **interpretation**, after the alignment and before the visualisation.
The first column must have a value, while the second and third columns may stay empty (later addition: if it's empty, does it take the t value by default???).

In [48]:
import csv
import re
from collatex import *
collation = Collation()

# Create the dictionary (here 'dictionary_norm.csv') with three columns: the first for the original form (t), the second for the normalized form to be used during the alignment (n), the third for the normalized form to be used in the interpretation (p), after the alignment and before the visualisation. The first column must have a value, while the second and third columns may stay empty.
Normit = {}
with open('dictionary_norm.csv') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=['Original', 'NormalisedAlignment', 'NormalisedInterpretation'])
    for row in reader:
        Normit[row['Original']]= row['NormalisedAlignment']
        
NormitInterpretation = {}
with open('dictionary_norm.csv') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=['Original', 'NormalisedAlignment', 'NormalisedInterpretation'])
    for row in reader:
        NormitInterpretation[row['Original']]= row['NormalisedInterpretation']

from collatex.core_classes import WordPunctuationTokenizer
tokenizer = WordPunctuationTokenizer()

#read in the witnesses  from your file system 
W1 = open( "data/example4/W1.txt", encoding='utf-8' ).read()
W2 = open( "data/example4/W2.txt", encoding='utf-8' ).read()
W3 = open( "data/example4/W3.txt", encoding='utf-8' ).read()
W4 = open( "data/example4/W4.txt", encoding='utf-8' ).read()

# build a function to tokenize and to normalize by replace keys to be found in the dictionary by the corresponding values 
def tokennormalizer(witness) :
    tokens_as_strings = tokenizer.tokenize(witness)
    list = []
    for token_string in tokens_as_strings:
        normversion = re.sub(r'\s+$',"", token_string)
        replaceversion = Normit.get(normversion,normversion)
        token_norm = NormitInterpretation.get(normversion,normversion)
        list.append({'t':token_string, 'n':replaceversion, 'p':token_norm})
    return(list)

tokens_W1 = tokennormalizer(W1) 
tokens_W2 = tokennormalizer(W2) 
tokens_W3 = tokennormalizer(W3) 
tokens_W4 = tokennormalizer(W4) 
#Print to check what's in the properties; can be deleted once we can visualize it. Can check also in the collation with json output.
##print(tokens_W1, tokens_W2, tokens_W3, tokens_W4)
 
witness_W1 = { "id": "W1", "tokens":tokens_W1 }
witness_W2 = { "id": "W2", "tokens":tokens_W2 }
witness_W3 = { "id": "W3", "tokens":tokens_W3 }
witness_W4 = { "id": "W4", "tokens":tokens_W4 }


input = { "witnesses": [ witness_W1, witness_W2, witness_W3, witness_W4 ] }



graph = collate(input, output='json', segmentation=False) 
print(graph)

## !!! Probabilmente NON SERVONO n (normalized alignment) e p (normalized interpretation), ma solo l'originale (t) e quello normalizzato (n) bastano.
## Ora da questo json dobbiamo tirare fuori una html table come la vogliamo noi, ovvero
# rosso, quando n sono diversi
# verde, quando n e t sono uguali
# giallo, quando n sono uguali, ma t diversi







# ================= ALL THIS NOT USED ======================================
# from the code in display module
from collatex.HTML import Table, TableRow, TableCell
from textwrap import fill


from IPython.display import HTML

def visualize_table_vertically_with_colors_CUSTOM(table, collation):
    # print the table vertically
    # switch columns and rows
    rows = []
    for column in table.columns:
        cells = []
        for witness in collation.witnesses:
            cell = column.tokens_per_witness.get(witness.sigil)
            cells.append(TableCell(text=fill("".join(item.token_data["t"] for item in cell) if cell else "-", 20), bgcolor="FF5000" if column.variant else "00FFFF"))
        rows.append(TableRow(cells=cells))
    sigli = []
    
    for witness in collation.witnesses:
        sigli.append(witness.sigil)
    
    x = Table(header_row=sigli, rows=rows)
    print(x)
    return display(HTML(str(x)))

# table = collate(input, output='table', segmentation=False) ## graph
# visualize_table_vertically_with_colors_CUSTOM(table, collation)
# ==========================================================================




{"table": [[[{"_sigil": "W1", "_token_array_position": 0, "n": "Lors", "p": "Lors", "t": "Lors "}], null, [{"_sigil": "W1", "_token_array_position": 1, "n": "conte", "p": "conte", "t": "conte "}], [{"_sigil": "W1", "_token_array_position": 2, "n": "li", "p": "li", "t": "li "}], [{"_sigil": "W1", "_token_array_position": 3, "n": "rois", "p": "rois", "t": "rois "}], [{"_sigil": "W1", "_token_array_position": 4, "n": "a", "p": "a", "t": "a "}], [{"_sigil": "W1", "_token_array_position": 5, "n": "la", "p": "la", "t": "la "}], [{"_sigil": "W1", "_token_array_position": 6, "n": "reine", "p": "reine", "t": "reine "}], [{"_sigil": "W1", "_token_array_position": 7, "n": "coment", "p": "coment", "t": "coment "}], [{"_sigil": "W1", "_token_array_position": 8, "n": "la", "p": "la", "t": "la "}], [{"_sigil": "W1", "_token_array_position": 9, "n": "dame", "p": "dame", "t": "dame "}], [{"_sigil": "W1", "_token_array_position": 10, "n": "del", "p": "del", "t": "del "}], [{"_sigil": "W1", "_token_array

In [49]:
import json
 
data = json.loads(graph)
for element in data['table']:
    print(element)


[[{'_sigil': 'W1', '_token_array_position': 0, 'n': 'Lors', 'p': 'Lors', 't': 'Lors '}], None, [{'_sigil': 'W1', '_token_array_position': 1, 'n': 'conte', 'p': 'conte', 't': 'conte '}], [{'_sigil': 'W1', '_token_array_position': 2, 'n': 'li', 'p': 'li', 't': 'li '}], [{'_sigil': 'W1', '_token_array_position': 3, 'n': 'rois', 'p': 'rois', 't': 'rois '}], [{'_sigil': 'W1', '_token_array_position': 4, 'n': 'a', 'p': 'a', 't': 'a '}], [{'_sigil': 'W1', '_token_array_position': 5, 'n': 'la', 'p': 'la', 't': 'la '}], [{'_sigil': 'W1', '_token_array_position': 6, 'n': 'reine', 'p': 'reine', 't': 'reine '}], [{'_sigil': 'W1', '_token_array_position': 7, 'n': 'coment', 'p': 'coment', 't': 'coment '}], [{'_sigil': 'W1', '_token_array_position': 8, 'n': 'la', 'p': 'la', 't': 'la '}], [{'_sigil': 'W1', '_token_array_position': 9, 'n': 'dame', 'p': 'dame', 't': 'dame '}], [{'_sigil': 'W1', '_token_array_position': 10, 'n': 'del', 'p': 'del', 't': 'del '}], [{'_sigil': 'W1', '_token_array_position':

In [50]:
# import json

data = json.loads(graph)
for row in data['table']:
    print(row[0])

[{'_sigil': 'W1', '_token_array_position': 0, 'n': 'Lors', 'p': 'Lors', 't': 'Lors '}]
[{'_sigil': 'W2', '_token_array_position': 13, 'n': 'Lors', 'p': 'Lors', 't': 'Lors '}]
[{'_sigil': 'W3', '_token_array_position': 26, 'n': 'Lors', 'p': 'Lors', 't': 'Lors '}]
[{'_sigil': 'W4', '_token_array_position': 39, 'n': 'Adonc', 'p': 'Adonc', 't': 'Adonc '}]


In [51]:
# import json
 
data = json.loads(graph)
for row in data['table']:
    for elem in row[0]: # attention, if a line with NONE it will give error
        print(elem['t'])

Lors 
Lors 
Lors 
Adonc 


In [52]:
################################################################
# DATA FOR THE HEADER OF THE TABLE
# The json is composed by 'table' and 'witnesses'
################################################################

# import json
 
data = json.loads(graph)
for row in data['witnesses']:
    print(row)

W1
W2
W3
W4


In [57]:
# import json
 
dataTestIn = json.loads(graph)
for x in dataTestIn['table']:
    print("WITNESS")
    for y in x:
        print(y)

WITNESS
[{'_sigil': 'W1', '_token_array_position': 0, 'n': 'Lors', 'p': 'Lors', 't': 'Lors '}]
None
[{'_sigil': 'W1', '_token_array_position': 1, 'n': 'conte', 'p': 'conte', 't': 'conte '}]
[{'_sigil': 'W1', '_token_array_position': 2, 'n': 'li', 'p': 'li', 't': 'li '}]
[{'_sigil': 'W1', '_token_array_position': 3, 'n': 'rois', 'p': 'rois', 't': 'rois '}]
[{'_sigil': 'W1', '_token_array_position': 4, 'n': 'a', 'p': 'a', 't': 'a '}]
[{'_sigil': 'W1', '_token_array_position': 5, 'n': 'la', 'p': 'la', 't': 'la '}]
[{'_sigil': 'W1', '_token_array_position': 6, 'n': 'reine', 'p': 'reine', 't': 'reine '}]
[{'_sigil': 'W1', '_token_array_position': 7, 'n': 'coment', 'p': 'coment', 't': 'coment '}]
[{'_sigil': 'W1', '_token_array_position': 8, 'n': 'la', 'p': 'la', 't': 'la '}]
[{'_sigil': 'W1', '_token_array_position': 9, 'n': 'dame', 'p': 'dame', 't': 'dame '}]
[{'_sigil': 'W1', '_token_array_position': 10, 'n': 'del', 'p': 'del', 't': 'del '}]
[{'_sigil': 'W1', '_token_array_position': 11, 

In [90]:
## TRANSFORMING COLLATEX JSON OUTPUT INTO ANOTHER JSON, MORE SUITABLE FOR GENERATING HTML TABLE (see below: ideal json ..)
# import json
 
dataTestIn = json.loads(graph)


dataTestOut = []
for x in dataTestIn['table']:
    item = {}
    for y in x:
        if (y != None):
            item = {"cell": y}
    dataTestOut.append(item)
    
    ## esce solo l'ultima !!!!
    ## guarda:   https://stackoverflow.com/questions/13530967/parsing-data-to-create-a-json-data-object-with-python
    
 
json_data = json.dumps(dataTestOut)
print(json_data)

[{"cell": [{"_sigil": "W1", "_token_array_position": 11, "n": "lac", "p": "lac", "t": "lac\n"}]}, {"cell": [{"_sigil": "W2", "_token_array_position": 24, "n": "lac", "p": "lac", "t": "lac\n"}]}, {"cell": [{"_sigil": "W3", "_token_array_position": 37, "n": "lac", "p": "lac", "t": "lac\n"}]}, {"cell": [{"_sigil": "W4", "_token_array_position": 48, "n": "lac", "p": "lac", "t": "lac\n"}]}]


In [83]:
## TEST
# import json
 
data = json.loads(graph)
print(data['table'][0])

[[{'_sigil': 'W1', '_token_array_position': 0, 'n': 'Lors', 'p': 'Lors', 't': 'Lors '}], None, [{'_sigil': 'W1', '_token_array_position': 1, 'n': 'conte', 'p': 'conte', 't': 'conte '}], [{'_sigil': 'W1', '_token_array_position': 2, 'n': 'li', 'p': 'li', 't': 'li '}], [{'_sigil': 'W1', '_token_array_position': 3, 'n': 'rois', 'p': 'rois', 't': 'rois '}], [{'_sigil': 'W1', '_token_array_position': 4, 'n': 'a', 'p': 'a', 't': 'a '}], [{'_sigil': 'W1', '_token_array_position': 5, 'n': 'la', 'p': 'la', 't': 'la '}], [{'_sigil': 'W1', '_token_array_position': 6, 'n': 'reine', 'p': 'reine', 't': 'reine '}], [{'_sigil': 'W1', '_token_array_position': 7, 'n': 'coment', 'p': 'coment', 't': 'coment '}], [{'_sigil': 'W1', '_token_array_position': 8, 'n': 'la', 'p': 'la', 't': 'la '}], [{'_sigil': 'W1', '_token_array_position': 9, 'n': 'dame', 'p': 'dame', 't': 'dame '}], [{'_sigil': 'W1', '_token_array_position': 10, 'n': 'del', 'p': 'del', 't': 'del '}], [{'_sigil': 'W1', '_token_array_position':

In [84]:
## IDEAL JSON FOR GENERATING THE HTML TABLE

import json
testGraph = '''{
	"table": [
		[
			[{
				"_sigil": "W1",
				"_token_array_position": 0,
				"n": "Lors",
				"p": "Lors",
				"t": "Lors "
			}],
			[{
				"_sigil": "W2",
				"_token_array_position": 13,
				"n": "Lors",
				"p": "Lors",
				"t": "Lors "
			}],
			[{
				"_sigil": "W3",
				"_token_array_position": 26,
				"n": "Lors",
				"p": "Lors",
				"t": "Lors "
			}],
			[{
				"_sigil": "W4",
				"_token_array_position": 39,
				"n": "Adonc",
				"p": "Adonc",
				"t": "Adonc "
			}]
		],
		[
			[{
				"_sigil": "W1",
				"_token_array_position": 1,
				"n": "conte",
				"p": "conte",
				"t": "conte "
			}],
			[{
				"_sigil": "W2",
				"_token_array_position": 14,
				"n": "conte",
				"p": "conte",
				"t": "conte "
			}],
			[{
				"_sigil": "W3",
				"_token_array_position": 27,
				"n": "conte",
				"p": "conte",
				"t": "conte "
			}],
			[{
				"_sigil": "W4",
				"_token_array_position": 41,
				"n": "conte",
				"p": "conte",
				"t": "conte "
			}]
		]
	],
	"witnesses": [
		"W1",
		"W2",
		"W3",
		"W4"
	]
}'''

testData = json.loads(testGraph)

print("DIRECT TEST:   "+testData['table'][0][0][0]['n'])

for row in testData['table']:
    print("ROW")
    for cellList in row:
        for cellDic in cellList:
            print(cellDic['n'])

DIRECT TEST:   Lors
ROW
Lors
Lors
Lors
Adonc
ROW
conte
conte
conte
conte


In [85]:
# import json
 

# DATA FROM PREVIOUS CELL

## open table and thead
html = """<html><table border="1"><thead><tr>"""
for witness in testData['witnesses']:   
    html += "<th>"+witness+"</th>"
## close thead
html += "</tr></thead>"  

## iterate over "rows"
for row in testData['table']:
    ## open tbody
    html += "<tbody><tr>"
    ## iterate over elements inside "rows"
    for cellList in row:
        for cellDic in cellList:
            normToken = cellDic['n']
            html += "<td>"+normToken+"</td>"
    ## close tbody
    html += "</tr></tbody>"
file_ = open('result.html', 'w')
file_.write(html)
file_.close()