### Add a property to the token for normalized forms to be used not in the alignment, but for interpretation in the analysis stage and then in the visualization.

## 0. No normalization

In [23]:
from collatex import *
collation = Collation()
W1 = open( "data/example4/W1.txt", encoding='utf-8' ).read()
W2 = open( "data/example4/W2.txt", encoding='utf-8' ).read()
W3 = open( "data/example4/W3.txt", encoding='utf-8' ).read()
W4 = open( "data/example4/W4.txt", encoding='utf-8' ).read()
collation.add_plain_witness( "W1", W1 )
collation.add_plain_witness( "W2", W2 )
collation.add_plain_witness( "W3", W3 )
collation.add_plain_witness( "W4", W4 )
table = collate(collation, output='html2', segmentation=False)
print(table)

W1,W2,W3,W4
Lors,Lors,Lors,Adonc
conte,conte,conte,-
li,li,li,li
rois,rois,rois,-
a,a,a,-
la,la,la,-
reine,reine,roine,-
coment,comment,coment,comment
la,la,la,la
dame,dame,dame,dame


None



## 1. Dictionary

This method requires the manual creation of a dictionary. In this example, the dictionary is built with **three columns**: the first for the **original form**, the second for the **normalized form** to be used during the **alignment**, the third for the **normalized form** to be used in the **interpretation**, after the alignment and before the visualisation.
The first column must have a value, while the second and third columns may stay empty.

In [2]:
import csv
import re
from collatex import *
collation = Collation()

# Create the dictionary (here 'dictionary_norm.csv') with three columns: the first for the original form, the second for the normalized form to be used during the alignment, the third for the normalized form to be used in the interpretation, after the alignment and before the visualisation. The first column must have a value, while the second and third columns may stay empty.
# DUMB EXEMPLE, TO BE REPLACED WITH A REAL ONE
Normit = {}
with open('dictionary_norm.csv') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=['Original', 'NormalisedAlignment', 'NormalisedInterpretation'],  dialect='excel')
    for row in reader:
        Normit[row['Original']]= row['NormalisedAlignment']
        
NormitInterpretation = {}
with open('dictionary_norm.csv') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=['Original', 'NormalisedAlignment', 'NormalisedInterpretation'],  dialect='excel')
    for row in reader:
        NormitInterpretation[row['Original']]= row['NormalisedInterpretation']

from collatex.core_classes import WordPunctuationTokenizer
tokenizer = WordPunctuationTokenizer()

#read in the witnesses  from your file system 
W1 = open( "data/example4/W1.txt", encoding='utf-8' ).read()
W2 = open( "data/example4/W2.txt", encoding='utf-8' ).read()
W3 = open( "data/example4/W3.txt", encoding='utf-8' ).read()
W4 = open( "data/example4/W4.txt", encoding='utf-8' ).read()

# build a function to tokenize and to normalize by replace keys to be found in the dictionary by the corresponding values 
def tokennormalizer(witness) :
    tokens_as_strings = tokenizer.tokenize(witness)
    list = []
    for token_string in tokens_as_strings:
        normversion = re.sub(r'\s+$',"", token_string)
        replaceversion = Normit.get(normversion,normversion)
        token_norm = NormitInterpretation.get(normversion,normversion)
        list.append({'t':token_string, 'n':replaceversion, 'p':token_norm})
    return(list)

tokens_W1 = tokennormalizer(W1) 
tokens_W2 = tokennormalizer(W2) 
tokens_W3 = tokennormalizer(W3) 
tokens_W4 = tokennormalizer(W4) 
#Print to check what's in the properties; can be deleted once we can visualize it. Can check also in the collation with json output.
print(tokens_W1, tokens_W2, tokens_W3, tokens_W4)
 
witness_W1 = { "id": "W1", "tokens":tokens_W1 }
witness_W2 = { "id": "W2", "tokens":tokens_W2 }
witness_W3 = { "id": "W3", "tokens":tokens_W3 }
witness_W4 = { "id": "W4", "tokens":tokens_W4 }


input = { "witnesses": [ witness_W1, witness_W2, witness_W3, witness_W4 ] }

table = collate(input, output='html2', segmentation=False)

[{'t': 'Lors ', 'n': 'Lors', 'p': 'Lors'}, {'t': 'conte ', 'n': 'conte', 'p': 'conte'}, {'t': 'li ', 'n': 'la', 'p': ''}, {'t': 'rois ', 'n': 'rois', 'p': 'rois'}, {'t': 'a ', 'n': 'a', 'p': 'a'}, {'t': 'la ', 'n': 'la', 'p': 'la'}, {'t': 'reine ', 'n': 'reine', 'p': 'reine'}, {'t': 'coment ', 'n': 'coment', 'p': 'coment'}, {'t': 'la ', 'n': 'la', 'p': 'la'}, {'t': 'dame ', 'n': 'dame', 'p': 'dame'}, {'t': 'del ', 'n': 'del', 'p': 'del'}, {'t': 'lac\n', 'n': 'lac', 'p': 'lac'}] [{'t': 'Lors ', 'n': 'Lors', 'p': 'Lors'}, {'t': 'conte ', 'n': 'conte', 'p': 'conte'}, {'t': 'li ', 'n': 'la', 'p': ''}, {'t': 'rois ', 'n': 'rois', 'p': 'rois'}, {'t': 'a ', 'n': 'a', 'p': 'a'}, {'t': 'la ', 'n': 'la', 'p': 'la'}, {'t': 'reine ', 'n': 'reine', 'p': 'reine'}, {'t': 'coment ', 'n': 'coment', 'p': 'coment'}, {'t': 'la ', 'n': 'la', 'p': 'la'}, {'t': 'dame ', 'n': 'dame', 'p': 'dame'}, {'t': 'del ', 'n': 'del', 'p': 'del'}, {'t': 'lac\n', 'n': 'lac', 'p': 'lac'}] [{'t': 'Lors ', 'n': 'Lors', 'p': 

W1,W2,W3,W4
Lors,Lors,Lors,Adonc
conte,conte,conte,-
li,li,li,-
rois,rois,rois,-
a,a,a,-
la,la,la,li
reine,reine,roine,comment
coment,coment,coment,-
la,la,la,la
dame,dame,dame,dame
