Create a dictionary in which the original form stays as it is and the normalized form is POS_LEMMA. For example:

est, VER_estre

les, ART_le


The POS tagging and lemmatization are performed here with TreeTaggerWrapper (a Python wrapper for TreeTagger), using the parameters for Old French made available by Achim Stein.

## Function for POS tagging and lemma annotation, plus some cleaning, of the text of all witnesses

### N.b.: change the parameter below (where the function is called) to create the dictionary for one particular example

In [1]:

import treetaggerwrapper, os, re


def create_poslemma(example):
    """Build the ``original,POS_LEMMA`` dictionary for one example.

    Reads the three witnesses ``data/<example>/W1.txt`` .. ``W3.txt``, tags
    every distinct whitespace-separated token with TreeTagger (``stein``
    parameter file) and writes ``pos_lemma_<example>.csv`` in the current
    working directory.  Every intermediate step is kept as a file
    (``p1_...`` to ``p5_...``) in ``data/<example>/process``.

    All files are read and written as UTF-8.

    Args:
        example: Name of the sub-folder of ``data/`` to process,
            e.g. ``'example1'``.

    Raises:
        OSError: If a witness file cannot be opened or the ``process``
            folder cannot be created.
    """
    base_dir = 'data/' + example
    process_dir = base_dir + '/process'
    all_texts_path = process_dir + '/p1_all_texts.txt'
    all_words_path = process_dir + '/p2_all_words.txt'
    single_words_path = process_dir + '/p3_single_words.txt'
    analyzed_path = process_dir + '/p4_single_words_analyzed.txt'
    clean_path = process_dir + '/p5_single_words_analyzed_clean.txt'
    csv_path = 'pos_lemma_' + example + '.csv'

    # create a folder where to store intermediate results; an already
    # existing folder is fine, any other failure is reported
    os.makedirs(process_dir, exist_ok=True)

    # put together the texts of all the witnesses
    witness_texts = []
    for witness_name in ('W1', 'W2', 'W3'):
        witness_path = base_dir + '/' + witness_name + '.txt'
        with open(witness_path, encoding='utf-8') as witness:
            witness_texts.append(witness.read())
    all_text = ' '.join(witness_texts)
    with open(all_texts_path, 'w', encoding='utf-8') as all_texts:
        all_texts.write(all_text)

    # each word a new line, in alphabetical order
    words = sorted(all_text.split())
    with open(all_words_path, 'w', encoding='utf-8') as all_words:
        all_words.writelines(word + '\n' for word in words)

    # delete duplicates (still in alphabetical order)
    single_words = sorted(set(words))
    with open(single_words_path, 'w', encoding='utf-8') as single:
        single.writelines(word + '\n' for word in single_words)

    # pos and lemma tagging; the tagger writes the output file itself
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='stein')
    tagger.tag_file_to(single_words_path, analyzed_path)

    # clean pos and lemma tagging.  Each pattern is applied to the whole
    # text, in this order; '.' does not match a newline, so '_.*' and
    # '\d.*' cut from the first underscore / digit to the end of that line.
    patterns = [
        (r'_.*', ''),
        (r'\d.*', ''),
        # (r'\|.*', '') is deliberately NOT applied, so that the different
        # output possibilities (e.g. 'devoir|dire') are saved
        ('�', 'ö'),  # encoding problem, but it does not seem to depend on the TreeTaggerWrapper, nor on the script. Maybe on the lexicon? Anyway, this is not real solution but works
        ('<nolem>', 'UNKNOWN'),
    ]
    with open(analyzed_path, encoding='utf-8') as analyzed:
        text = analyzed.read()
    for pattern, replacement in patterns:
        text = re.sub(pattern, replacement, text)
    with open(clean_path, 'w', encoding='utf-8') as clean:
        clean.write(text)

    # create pos_lemma_<example>.csv
    with open(clean_path, encoding='utf-8') as clean, \
         open(csv_path, 'w', encoding='utf-8') as csv_file:
        for line in clean:
            fields = line.split()
            if len(fields) < 3:
                # blank or truncated line (the clean-up above can remove
                # the tag/lemma columns): no entry can be built from it
                continue
            original, pos, lemmas = fields[:3]
            entry = original + ',' + pos + '_' + lemmas + '\n'
            print(entry)   # print them, in order to have immediately check (print adds a second newline)
            csv_file.write(entry)   # print them on file



## Call the function with the name of the example as parameter (e.g. 'example1').

The result will be printed both here (or in the terminal) and to the output file with the corresponding name (e.g. 'pos_lemma_example1.csv').

In [2]:
# Build the dictionary for 'example11': reads data/example11/W1.txt .. W3.txt
# and writes pos_lemma_example11.csv (entries are also printed below).
create_poslemma('example11')

atorner,VER_atorner

deit,VER_devoir|dire

doit,VER_devoir|dire

li,PRO:pers_ele|il

mie,ADV_mie

ne,PROCON_ni

on,PRO:invar_on

pas,NOM_pas|past

