### Add a property to the token for normalized forms to be used not in the alignment, but for interpretation in the analysis stage and then in the visualization.

## 0. No normalization

In [None]:
from collatex import *
collation = Collation()
W1 = open( "data/example4/W1.txt", encoding='utf-8' ).read()
W2 = open( "data/example4/W2.txt", encoding='utf-8' ).read()
W3 = open( "data/example4/W3.txt", encoding='utf-8' ).read()
W4 = open( "data/example4/W4.txt", encoding='utf-8' ).read()
collation.add_plain_witness( "W1", W1 )
collation.add_plain_witness( "W2", W2 )
collation.add_plain_witness( "W3", W3 )
collation.add_plain_witness( "W4", W4 )
table = collate(collation, output='html2', segmentation=False)
print(table)


## 1. Dictionary

This method requires the manual creation of a dictionary. In this example, the dictionary is built with **three columns**: the first for the **original form**, the second for the **normalized form** to be used during the **alignment**, the third for the **normalized form** to be used in the **interpretation**, after the alignment and before the visualisation.
The first column must have a value, while the second and third columns may stay empty (later addition: if it's empty, does it take the t value by default???).

In [2]:
import csv
import re
from collatex import *
collation = Collation()

# Create the dictionary (here 'dictionary_norm.csv') with three columns: the first for the original form (t), the second for the normalized form to be used during the alignment (n), the third for the normalized form to be used in the interpretation (p), after the alignment and before the visualisation. The first column must have a value, while the second and third columns may stay empty.
Normit = {}
with open('dictionary_norm.csv') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=['Original', 'NormalisedAlignment', 'NormalisedInterpretation'])
    for row in reader:
        Normit[row['Original']]= row['NormalisedAlignment']
        
NormitInterpretation = {}
with open('dictionary_norm.csv') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=['Original', 'NormalisedAlignment', 'NormalisedInterpretation'])
    for row in reader:
        NormitInterpretation[row['Original']]= row['NormalisedInterpretation']

from collatex.core_classes import WordPunctuationTokenizer
tokenizer = WordPunctuationTokenizer()

#read in the witnesses  from your file system 
W1 = open( "data/example4/W1.txt", encoding='utf-8' ).read()
W2 = open( "data/example4/W2.txt", encoding='utf-8' ).read()
W3 = open( "data/example4/W3.txt", encoding='utf-8' ).read()
W4 = open( "data/example4/W4.txt", encoding='utf-8' ).read()

# build a function to tokenize and to normalize by replace keys to be found in the dictionary by the corresponding values 
def tokennormalizer(witness) :
    tokens_as_strings = tokenizer.tokenize(witness)
    list = []
    for token_string in tokens_as_strings:
        normversion = re.sub(r'\s+$',"", token_string)
        replaceversion = Normit.get(normversion,normversion)
        token_norm = NormitInterpretation.get(normversion,normversion)
        list.append({'t':token_string, 'n':replaceversion, 'p':token_norm})
    return(list)

tokens_W1 = tokennormalizer(W1) 
tokens_W2 = tokennormalizer(W2) 
tokens_W3 = tokennormalizer(W3) 
tokens_W4 = tokennormalizer(W4) 
#Print to check what's in the properties; can be deleted once we can visualize it. Can check also in the collation with json output.
##print(tokens_W1, tokens_W2, tokens_W3, tokens_W4)
 
witness_W1 = { "id": "W1", "tokens":tokens_W1 }
witness_W2 = { "id": "W2", "tokens":tokens_W2 }
witness_W3 = { "id": "W3", "tokens":tokens_W3 }
witness_W4 = { "id": "W4", "tokens":tokens_W4 }


input = { "witnesses": [ witness_W1, witness_W2, witness_W3, witness_W4 ] }



graph = collate(input, output='json', segmentation=False) 
print(graph)

## !!! Probabilmente NON SERVONO n (normalized alignment) e p (normalized interpretation), ma solo l'originale (t) e quello normalizzato (n) bastano.




{"table": [[[{"_sigil": "W1", "_token_array_position": 0, "n": "Lors", "p": "Lors", "t": "Lors "}], null, [{"_sigil": "W1", "_token_array_position": 1, "n": "conte", "p": "conte", "t": "conte "}], [{"_sigil": "W1", "_token_array_position": 2, "n": "li", "p": "li", "t": "li "}], [{"_sigil": "W1", "_token_array_position": 3, "n": "rois", "p": "rois", "t": "rois "}], [{"_sigil": "W1", "_token_array_position": 4, "n": "a", "p": "a", "t": "a "}], [{"_sigil": "W1", "_token_array_position": 5, "n": "la", "p": "la", "t": "la "}], [{"_sigil": "W1", "_token_array_position": 6, "n": "reine", "p": "reine", "t": "reine "}], [{"_sigil": "W1", "_token_array_position": 7, "n": "coment", "p": "coment", "t": "coment "}], [{"_sigil": "W1", "_token_array_position": 8, "n": "la", "p": "la", "t": "la "}], [{"_sigil": "W1", "_token_array_position": 9, "n": "dame", "p": "dame", "t": "dame "}], [{"_sigil": "W1", "_token_array_position": 10, "n": "del", "p": "del", "t": "del "}], [{"_sigil": "W1", "_token_array

In [38]:


            #====================================================#
            #====================================================#
            #                                                    #
            #   CREATE TABLE FROM JSON USING NORMALIZED TOKENS   #
            #                                                    # 
            #====================================================#
            #====================================================#


        
        
import json
import itertools
from xml.etree import ElementTree as ET


#====================================================
#
#  BUILD HTML TABLE
#
#====================================================

dataIn = json.loads(graph)

## HEAD OF THE TABLE

# there is an x inside the script pointing to jquery because if it's empty it will 
# automatically be written as a closed empty tag in the output and would not work
html = """<html>
    <head>
	<title>Test collation with normalized tokens</title>
	<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js">x</script>
	<script type="text/javascript">  
            $(document).ready(function(){
                $("tr[type='orig']").click(function(){
                    $(this).next().toggle();
                });
            });
    </script>
	<style>
         tr[type="normAll"] {display: none;}
         tr[type="norm"] {display: none;}
         td, th {padding: 20px;border: 1px solid grey; width:100px}
         table {border-collapse: collapse;}
         .variant + tr {background-color:#06e089}
         .invariant + tr {background-color:lightgray}
    </style>
	</head>"""

html += """<body><table border="1"><thead><tr>"""  
for x in dataIn['table']:
    witName = x[0][0]['_sigil']    # define the witness name
    html += "<th>"+witName+"</th>"    # write the witness name in the head of the table
html += "</tr></thead><tbody>"  ## close thead
   
      
for i in range(len(x)):  # for 'i' in the length of the witness  
    istr = str(i)   # from int to string, otherwise the following does not work
    
    ## CREATE ROW FOR NORMALIZED TOKEN (ALL) - NOT DISPLAY, just for processing
    html += "<tr type='normAll' id='row"+istr+"_normAll'>" # style='display:none'  
    for x in dataIn['table']:   # for each witness
        element = x[i]   # take the first (then at the next iteration take the second, etc.)
        if element is not None: 
            for elementList in element:
                normCompareToken = elementList['n'].strip()  # strip used for deleting whitespaces
        else:
            normCompareToken = ' - '
        html += "<td>"+normCompareToken+"</td>"  # write the original token in a cell
        ## close tbody
    html += "</tr>"
    
    
    ## CREATE ROW FOR ORIGINAL TOKEN
    html += "<tr type='orig' id='row"+istr+"_orig'>"  
    for x in dataIn['table']:   # for each witness
        element = x[i]   # take the first (then at the next iteration take the second, etc.)
        if element is not None: 
            for elementList in element:
                origToken = elementList['t'].strip()  # strip used for deleting whitespaces
                normCompareToken = elementList['n'].strip()
                if normCompareToken is not None:
                    if origToken != normCompareToken:
                        origToken = "<u>"+origToken+"</u>"
                    else:
                        origToken = origToken
        else:
            origToken = ' - '
        html += "<td>"+origToken+"</td>"  # write the original token in a cell
        ## close tbody
    html += "</tr>"
    
    
    
    ## CREATE ROW FOR NORMALIZED TOKEN (ONLY IF DIFFERENT FROM ORIGINAL) - DISPLAY
    html += "<tr type='norm' id='row"+istr+"_norm'>"  
    for x in dataIn['table']:   # for each witness
        element = x[i]   # take the first (then at the next iteration take the second, etc.)
        if element is not None: 
            for elementList in element:
                ## DIFFRENCE BETWEEN ORIGINAL AND NORMALIZED TOKENS
                ## Print normalized token in new row, when different from original
                origCompareToken = elementList['t'].strip() # strip used for deleting whitespaces
                normCompareToken = elementList['n'].strip()
                if normCompareToken is not None:
                    if origCompareToken == normCompareToken:
                        normToken = ""
                    else:
                        normToken = normCompareToken
        else:
            normToken = "" 
            normCompareToken = ""
        html += "<td>"+normToken+"</td>"  # write the original token in a cell
    html += "</tr>"
    
## CLOSE BODY - END OF THE TABLE    
html += "</tbody></table></body></html>"


 

#====================================================
#
#  ANALYSE AND RENDER HTML TABLE
#
#====================================================

## Two classes for each row should be added: (1) if equal or different, and (2) if they include formal variation

## Taken from <https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical>
def checkEqual(iterator):
   return len(set(iterator)) <= 1

createdTable = html
root = ET.fromstring(createdTable)
for tr in root.iter('tr'):  ## iterate over rows
    trType = tr.get('type')  # and get the value of the attribute type for each row
    if (trType == "normAll"):   ## only take rows with attribute @type='normAll'
        listTd = []   ## open empty list
        for td in tr.iter('td'):  ## take all cells in a row
            listTd.append(td.text)  ## and put their text in the list
        if checkEqual(listTd) == True: ## if all the element in the list (all the aligned tokens appearing in a row) are equal
            tr.set('class', 'invariant')  # add to the row the attribute @class="variant"
            # tr.set('style', 'color:green')
        else: 
            tr.set('class', 'variant') # add to the row the attribute @class="invariant"
            # tr.set('style', 'color:red')
            

tree = ET.tostring(root, encoding="unicode")
outFile = open('result2.html', 'w')
outFile.write(tree)






4213

In [3]:


            #======================================================#
            #======================================================#
            #                                                      #
            #   SIMPLE TEI OUTPUT   (no analysis                   #
            #                                                      # 
            #======================================================#
            #======================================================#
            

        
        

#====================================================
#
#  BUILD APPARATUS
#
#====================================================

import json
# read data
dataIn = json.loads(graph)


# opening file
tei = """<TEI>"""  
  
for x in dataIn['table']:
    pass
 
for i in range(len(x)):  # for 'i' in the length of the witness  
    istr = str(i)   # from int to string, otherwise the following does not work

    ## CREATE APP AND RDGs
    tei += "<app id='app_"+istr+"'>"  
    for x in dataIn['table']:   # for each witness
        element = x[i]   # take the first (then at the next iteration take the second, etc.)
        if element is not None: 
            for elementList in element:
                sigil = elementList['_sigil'].strip()
                origToken = elementList['t'].strip()
                normToken = elementList['n'].strip()
        else:
            origToken = ""
            normToken = "" 
        if origToken == "":
            pass
        else:
            if normToken == "":
                tei += "<rdg with='"+sigil+"'>"+origToken+"</rdg>" 
            else:
                tei += "<rdg with='"+sigil+"' ana='"+normToken+"'>"+origToken+"</rdg>"  # write the original token in a cell
    tei += "</app>"
    
## CLOSing of the file   
tei += "</TEI>"

print(tei)


file_ = open('result_TEI.xml', 'w')
file_.write(tei)
file_.close()

 




<TEI><app id='app_0'><rdg with='W1' ana='Lors'>Lors</rdg><rdg with='W2' ana='Lors'>Lors</rdg><rdg with='W3' ana='Lors'>Lors</rdg><rdg with='W4' ana='Adonc'>Adonc</rdg></app><app id='app_1'><rdg with='W4' ana='li'>li</rdg></app><app id='app_2'><rdg with='W1' ana='conte'>conte</rdg><rdg with='W2' ana='conte'>conte</rdg><rdg with='W3' ana='conte'>conte</rdg><rdg with='W4' ana='conte'>conte</rdg></app><app id='app_3'><rdg with='W1' ana='li'>li</rdg><rdg with='W2' ana='li'>li</rdg><rdg with='W3' ana='li'>li</rdg><rdg with='W4' ana='li'>li</rdg></app><app id='app_4'><rdg with='W1' ana='rois'>rois</rdg><rdg with='W2' ana='rois'>rois</rdg><rdg with='W3' ana='rois'>rois</rdg><rdg with='W4' ana='rois'>rois</rdg></app><app id='app_5'><rdg with='W1' ana='a'>a</rdg><rdg with='W2' ana='a'>a</rdg><rdg with='W3' ana='a'>a</rdg></app><app id='app_6'><rdg with='W1' ana='la'>la</rdg><rdg with='W2' ana='la'>la</rdg><rdg with='W3' ana='la'>la</rdg></app><app id='app_7'><rdg with='W1' ana='reine'>reine</rdg

In [20]:


            #======================================================#
            #======================================================#
            #                                                      #
            #   COMPLETE TEI OUTPUT   (with analysis)              #
            #                                                      # 
            #======================================================#
            #======================================================#
            

            
            
## create rdgGrp with attribute for formal variants
## add attribute to substantial variant? If normCompare are not equal or not all

        
        
import json
from xml.etree import ElementTree as ET


#====================================================
#
#  BUILD APPARATUS
#
#====================================================


# read data
dataIn = json.loads(graph)


# open file
tei = """<TEI>"""  
# CREATE APP AND RDGs
for i in range(len(witness)):  # for 'i' in the length of the witness  
    istr = str(i)   # from int to string, for adding it later to attribute
    tei += "<app id='app_"+istr+"'>"  
    for x in dataIn['table']:   # for each witness
        element = x[i]   # take the first (then at the next iteration take the second, etc.)
        if element is not None: 
            for elementList in element:
                sigil = elementList['_sigil'].strip()
                origToken = elementList['t'].strip()
                normToken = elementList['n'].strip()
        else:
            origToken = ""
            normToken = "" 
        if origToken == "":
            pass
        else:
            if normToken == "":
                tei += "<rdg with='"+sigil+"'>"+origToken+"</rdg>" 
            else:
                tei += "<rdg with='"+sigil+"' ana='"+normToken+"'>"+origToken+"</rdg>"  # write the original token in a cell
    tei += "</app>"
    
## CLOSE file 
tei += "</TEI>"


# count the number of witnesses
allWitnesses = []
for witness in dataIn['table']:
    allWitnesses.append(witness)
numberOfWitnesses = len(allWitnesses)



#====================================================
#
#  ANALYSE AND RENDER HTML TABLE
#
#====================================================

## Two classes for each row should be added: (1) if equal or different, and (2) if they include formal variation

## Taken from <https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical>
def checkEqual(iterator):
   return len(set(iterator)) <= 1

root = ET.fromstring(tei)
for app in root.iter('app'):  ## iterate over rows
    normalizedTokensAligned = []
    for rdg in app.iter('rdg'):  ## take all cells in a row
            normalizedTokensAligned.append(rdg.text)  ## and put their text in the list
    if checkEqual(normalizedTokensAligned) == True: ## if all the element in the list (all the aligned tokens appearing in a row) are equal
        app.set('class', 'invariant')  # add to the row the attribute @class="variant"
        # tr.set('style', 'color:green')
    else: 
        app.set('class', 'variant') # add to the row the attribute @class="invariant"
        # tr.set('style', 'color:red')
            
tree = ET.tostring(root, encoding="unicode")
print(tree)

outFile = open('result_TEIcomplete.xml', 'w')
outFile.write(tree)


 




<TEI><app class="variant" id="app_0"><rdg ana="Lors" with="W1">Lors</rdg><rdg ana="Lors" with="W2">Lors</rdg><rdg ana="Lors" with="W3">Lors</rdg><rdg ana="Adonc" with="W4">Adonc</rdg></app><app class="invariant" id="app_1"><rdg ana="li" with="W4">li</rdg></app><app class="invariant" id="app_2"><rdg ana="conte" with="W1">conte</rdg><rdg ana="conte" with="W2">conte</rdg><rdg ana="conte" with="W3">conte</rdg><rdg ana="conte" with="W4">conte</rdg></app><app class="invariant" id="app_3"><rdg ana="li" with="W1">li</rdg><rdg ana="li" with="W2">li</rdg><rdg ana="li" with="W3">li</rdg><rdg ana="li" with="W4">li</rdg></app><app class="invariant" id="app_4"><rdg ana="rois" with="W1">rois</rdg><rdg ana="rois" with="W2">rois</rdg><rdg ana="rois" with="W3">rois</rdg><rdg ana="rois" with="W4">rois</rdg></app><app class="invariant" id="app_5"><rdg ana="a" with="W1">a</rdg><rdg ana="a" with="W2">a</rdg><rdg ana="a" with="W3">a</rdg></app><app class="invariant" id="app_6"><rdg ana="la" with="W1">la</rdg

2132

In [None]:
## SIMPLER VERSION OF THE PREVIOUS, obsolete now


import json

dataIn = json.loads(graph)

## HEAD OF THE TABLE
html = """<html><table border="1"><thead><tr>"""  
for x in dataIn['table']:
    witName = x[0][0]['_sigil']    # define the witness name
    html += "<th>"+witName+"</th>"    # write the witness name in the head of the table
html += "</tr></thead><tbody>"  ## close thead
   
      
for i in range(len(x)):  # for 'i' in the length of the witness  
    istr = str(i)   # from int to string, otherwise the following does not work
    
    
    ## CREATE ROW FOR ORIGINAL TOKEN
    html += "<tr id='row"+istr+"_orig'>"  
    for x in dataIn['table']:   # for each witness
        element = x[i]   # take the first, then the second, the third, etc.
        if element is not None: 
            for elementList in element:
                origToken = elementList['t']
        else:
            origToken = ' - '
        html += "<td>"+origToken+"</td>"  # write the original token in a cell
        ## close tbody
    html += "</tr>"
    
    
    ## CREATE ROW FOR NORMALIZED TOKEN
    html += "<tr id='row"+istr+"_norm'>"  
    for x in dataIn['table']:   # for each witness
        element = x[i]   # take the first, then the second, the third, etc.
        if element is not None: 
            for elementList in element:
                normToken = elementList['n']
        else:
            normToken = ' - '
        html += "<td>"+normToken+"</td>"  # write the original token in a cell
    html += "</tr>"
    
## CLOSE BODY - END OF THE TABLE    
html += "</tbody>"


file_ = open('result2.html', 'w')
file_.write(html)
file_.close()

In [41]:
def checkEqual2(iterator):
   return len(set(iterator)) <= 1

checkEqual2(['lac', 'lac', 'lec', 'lac'])

False

In [None]:
## TEST

import json
data = json.loads(graph)
for row in data['table']:
    print(row[0])


In [None]:
## TEST

import json
data = json.loads(graph)
for row in data['table']:
    for elem in row[0]: # attention, if a line with NONE it will give error
        print(elem['t'])
        

In [None]:
## TEST
## Data for the header of the table

# import json 
data = json.loads(graph)
for row in data['witnesses']:
    print(row)
    

In [None]:
## DISCARDED, OTHER APPROACH USED
## SEE FOLLOWING CELL

## json used for generating idealTable.html

import json
testGraph = '''{
	"table": [
		[
			[{
				"_sigil": "W1",
				"_token_array_position": 0,
				"n": "Lors",
				"p": "Lors",
				"t": "Lors "
			}],
			[{
				"_sigil": "W2",
				"_token_array_position": 13,
				"n": "Lors",
				"p": "Lors",
				"t": "Lors "
			}],
			[{
				"_sigil": "W3",
				"_token_array_position": 26,
				"n": "Lors",
				"p": "Lors",
				"t": "Lors "
			}],
			[{
				"_sigil": "W4",
				"_token_array_position": 39,
				"n": "Adonc",
				"p": "Adonc",
				"t": "Adonc "
			}]
		],
		[
			[{
				"_sigil": "W1",
				"_token_array_position": 1,
				"n": "conte",
				"p": "conte",
				"t": "conte "
			}],
			[{
				"_sigil": "W2",
				"_token_array_position": 14,
				"n": "conte",
				"p": "conte",
				"t": "conte "
			}],
			[{
				"_sigil": "W3",
				"_token_array_position": 27,
				"n": "conte",
				"p": "conte",
				"t": "conte "
			}],
			[{
				"_sigil": "W4",
				"_token_array_position": 41,
				"n": "conte",
				"p": "conte",
				"t": "conte "
			}]
		]
	],
	"witnesses": [
		"W1",
		"W2",
		"W3",
		"W4"
	]
}'''

testData = json.loads(testGraph)

print("DIRECT TEST:   "+testData['table'][0][0][0]['n'])

for row in testData['table']:
    print("ROW")
    for cellList in row:
        for cellDic in cellList:
            print(cellDic['n'])

In [None]:
## DISCARDED
## DATA FROM PREVIOUS CELL

## generating idealTable.html

import json
## open table and thead
html = """<html><table border="1"><thead><tr>"""
for witness in testData['witnesses']:   
    html += "<th>"+witness+"</th>"
## close thead
html += "</tr></thead>"  

## iterate over "rows"
for row in testData['table']:
    ## open tbody
    html += "<tbody><tr>"
    ## iterate over elements inside "rows"
    for cellList in row:
        for cellDic in cellList:
            normToken = cellDic['n']
            html += "<td>"+normToken+"</td>"
    ## close tbody
    html += "</tr></tbody>"
file_ = open('result.html', 'w')
file_.write(html)
file_.close()