# Example 1

ToC:
0. Normal
1. Near match
3. Pre - normalization
4. Automatic dictionary (table and graph)

## 1. Normal

In [1]:
from collatex import *
collation = Collation()
A = open( "data/example10/A.txt", encoding='utf-8' ).read()
L = open( "data/example10/L.txt", encoding='utf-8' ).read()
P = open( "data/example10/P.txt", encoding='utf-8' ).read()
S = open( "data/example10/S.txt", encoding='utf-8' ).read()
V = open( "data/example10/V.txt", encoding='utf-8' ).read()
collation.add_plain_witness( "A", A )
collation.add_plain_witness( "L", L )
collation.add_plain_witness( "P", P )
collation.add_plain_witness( "S", S )
collation.add_plain_witness( "V", V )
table = collate(collation, output='html2', segmentation=False)
print(table)

A,L,P,S,V
En,Plurent,Pore,Pleure,Ad
halte,si,des,des,altes
voiz,oil,oilz,oels,voiz
prist,e,et,si,prist
-,s,gete,escrie,-
a,[,mult,a,a
crier,i,grans,haus,crier
grant,],cris,cris,-
cri,jetet,Après,Puis,-
Si,granz,le,se,granz


None



## 2. Near match

In this method, the parameter 'near_match' of the function 'collate' is switched on, for allowing fuzzy matches in the alignment (or better, in a second round of alignment, which refines the first).

In [2]:
from collatex import *
collation = Collation()
A = open( "data/example10/A.txt", encoding='utf-8' ).read()
L = open( "data/example10/L.txt", encoding='utf-8' ).read()
P = open( "data/example10/P.txt", encoding='utf-8' ).read()
S = open( "data/example10/S.txt", encoding='utf-8' ).read()
V = open( "data/example10/V.txt", encoding='utf-8' ).read()
collation.add_plain_witness( "A", A )
collation.add_plain_witness( "L", L )
collation.add_plain_witness( "P", P )
collation.add_plain_witness( "S", S )
collation.add_plain_witness( "V", V )
table = collate(collation, output='html2', near_match=True, segmentation=False)
print(table)

A,L,P,S,V
En,Plurent,Pore,Pleure,Ad
halte,si,des,des,altes
voiz,oil,oilz,oels,voiz
prist,e,et,si,prist
-,s,gete,escrie,-
a,[,mult,a,a
crier,i,grans,haus,crier
grant,],cris,cris,-
cri,jetet,Après,Puis,-
Si,granz,le,se,granz


None



## 3. Simple normalization

Remove punctuation and all to lower case

In [None]:
import glob, re, os

path = 'data/example10/'  # put the path into a variable 

# files = glob.glob(path+'*.txt')    # take all the names of the files from the path specified above and put them in a list

files = [os.path.basename(x) for x in glob.glob(path+'*.txt')]
os.makedirs(path + 'norm', exist_ok=True)   # create a new folder, if does not exist


for file in files:  # for each file in the directory
    
    ### READ THE FILE CONTENT
    file_opened = open(path+file, 'r', encoding='utf-8') # open the file in mode 'r' (read)
    content = file_opened.read()  # read the file content
    
    ### ALL TO LOWER CASE
    lowerContent = content.lower() 
    
    ### REMOVE PUNCTUATION 
    # remove everything that is not alphanumeric character (\w) or space (\s)
    noPunct_lowerContent = re.sub(r'[^\w\s]','',lowerContent) 
    
    ### REMOVE MULTIPLE WHITESPACES
    regularSpaces_noPunct_lowerContent = " ".join(noPunct_lowerContent.split())
    
    ### CREATE A NEW FILE 
    filename = file.split('.')[0]
    new_file = open('data/example10/norm/' + filename + '_norm.txt', 'w', encoding='utf-8') # open the new file in mode 'w' (write)
    
    ### WRITE THE NEW CONTENT INTO THE NEW FILE
    new_file.write(regularSpaces_noPunct_lowerContent) 
    
print('Finished! All normalized!')


In [3]:
from collatex import *
collation = Collation()
A_norm = open( "data/example10/norm/A_norm.txt", encoding='utf-8' ).read()
L_norm = open( "data/example10/norm/L_norm.txt", encoding='utf-8' ).read()
P_norm = open( "data/example10/norm/P_norm.txt", encoding='utf-8' ).read()
S_norm = open( "data/example10/norm/S_norm.txt", encoding='utf-8' ).read()
V_norm = open( "data/example10/norm/V_norm.txt", encoding='utf-8' ).read()
collation.add_plain_witness( "A", A_norm )
collation.add_plain_witness( "L", L_norm )
collation.add_plain_witness( "P", P_norm )
collation.add_plain_witness( "S", S_norm )
collation.add_plain_witness( "V", V_norm )
table = collate(collation, output='html2', near_match=True, segmentation=False)
print(table)

A,L,P,S,V
en,plurent,pore,pleure,ad
halte,si,des,des,altes
voiz,oil,oilz,oels,voiz
prist,e,et,-,prist
a,si,gete,si,a
crier,jetet,mult,escrie,crier
grant,granz,grans,a,granz
cri,criz,-,haus,criz
si,-,cris,cris,ço
-,sempres,après,puis,dist


None


## 4. Automatic dictionary (table and graph)

This method creates a dictionary, using the [TreeTagger](http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/) parameters for old French:
- the [Nouveau corpus d’Amsterdam](http://www.uni-stuttgart.de/lingrom/stein/corpus) parameter file provides POS annotation and lemma, here only lemma is used;
- the [Base de français médiéval](http://txm.bfm-corpus.org) parameter file provides POS annotation, used here.

The joint effort of the two équipes are now available in the Github repository [Medieval French Language Toolkit](https://github.com/sheiden/Medieval-French-Language-Toolkit).

The tagged forms are used for the alignment.

This method produces three outputs:
- a table and a graph, for which see below here; 
- an external table, where formal and substantive variants are separated.

In [4]:
from collatex import *
collation = Collation()
import csv, re
from general_functions import tag_poslemma
from general_functions import table_automaticDictionary



tag_poslemma('example10')  # ex: create_poslemma('example1')
print("taggedAll and taggedDistinct created in folder Dictionaries/ !")


A = open( "data/example10/norm/A_norm.txt", encoding='utf-8' ).read()
L = open( "data/example10/norm/L_norm.txt", encoding='utf-8' ).read()
P = open( "data/example10/norm/P_norm.txt", encoding='utf-8' ).read()
S = open( "data/example10/norm/S_norm.txt", encoding='utf-8' ).read()
V = open( "data/example10/norm/V_norm.txt", encoding='utf-8' ).read()

Normit = {}
with open('dictionaries/taggedDistinct_example1.csv') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=['Original', 'Normalised'])
    for row in reader:
        Normit[row['Original']]= row['Normalised']

#read in the witnesses  from your file system 
from collatex.core_classes import WordPunctuationTokenizer
tokenizer = WordPunctuationTokenizer()

# build a function to tokenize and to normalize by replace keys to be 
# found in the dictionary by the corresponding values 
def tokennormalizer(witness) :
    tokens_as_strings = tokenizer.tokenize(witness)
    list = []
    for token_string in tokens_as_strings:
        normversion = re.sub(r'\s+$',"", token_string)
        replaceversion = Normit.get(normversion,normversion)
        list.append({'t':token_string, 'n':replaceversion})
    return(list)

#collate
tokens_A = tokennormalizer(A) 
tokens_L = tokennormalizer(L) 
tokens_P = tokennormalizer(P) 
tokens_S = tokennormalizer(S) 
tokens_V = tokennormalizer(V) 

witness_A = { "id": "W1", "tokens":tokens_A }
witness_L = { "id": "W2", "tokens":tokens_L }
witness_P = { "id": "W3", "tokens":tokens_P }
witness_S = { "id": "W4", "tokens":tokens_S }
witness_V = { "id": "W4", "tokens":tokens_V }


input = { "witnesses": [ witness_A, witness_L, witness_P, witness_S, witness_V ] }

table = collate(input, output='html2', segmentation=False)
print(table)
graphSvg = collate(input, output='svg', segmentation=False)
print(graphSvg)

graph_automaticDictionary = collate(input, output='json', segmentation=False)
table_automaticDictionary(graph_automaticDictionary, 'example1')
print('external table created!')

taggedAll and taggedDistinct created in folder Dictionaries/ !


Exception: Vertex is null for token 146 0 that is supposed to be mapped to a vertex in the graph!