## DiffTagger's limitations

In [1]:
# Download UD corpus from here:  https://github.com/UniversalDependencies/UD_Estonian-EDT
import os, os.path
# Directory holding the UD_Estonian-EDT corpus, relative to the notebook's working directory.
ud_corpus_dir = "UD_Estonian-EDT"
# Stop right away, with an actionable message, if the corpus has not been downloaded yet.
assert os.path.isdir( ud_corpus_dir ), (
    "UD corpus directory '{}' not found; download it from "
    "https://github.com/UniversalDependencies/UD_Estonian-EDT".format( ud_corpus_dir )
)

In [2]:
from estnltk import Text
from estnltk.converters.conll_importer import conll_to_text

In [3]:
# Initialize estnltk's taggers with custom layer names
from estnltk.taggers import TokensTagger, CompoundTokenTagger, WordTagger, SentenceTokenizer

# Every tagger writes to (and reads from) an 'estnltk_'-prefixed layer,
# so the layer names used here are distinct from plain 'words' / 'sentences'.

# Step 1: tokens
tokens_tagger = TokensTagger(
    output_layer='estnltk_tokens',
)

# Step 2: compound tokens, built on top of the tokens layer
cp_tagger = CompoundTokenTagger(
    output_layer='estnltk_compounds',
    input_tokens_layer='estnltk_tokens',
)

# Step 3: words, built from the tokens and compound tokens layers
word_tagger = WordTagger(
    output_layer='estnltk_words',
    input_tokens_layer='estnltk_tokens',
    input_compound_tokens_layer='estnltk_compounds',
)

# Step 4: sentences, built from the words and compound tokens layers
sentence_tokenizer = SentenceTokenizer(
    output_layer='estnltk_sentences',
    input_words_layer='estnltk_words',
    input_compound_tokens_layer='estnltk_compounds',
)

def add_estnltk_segmentation( text ):
    """Run estnltk's segmentation taggers on ``text``.

    The taggers are applied in a fixed order: tokens, compound tokens,
    words, sentences. Each one is invoked through its ``tag`` method on
    the same ``text`` object. The function itself returns nothing.
    """
    segmentation_pipeline = (
        tokens_tagger,
        cp_tagger,
        word_tagger,
        sentence_tokenizer,
    )
    for tagger in segmentation_pipeline:
        tagger.tag(text)

In [4]:
# Initialize sentences difftagger
from estnltk.taggers import DiffTagger

# Diff tagger comparing the 'sentences' layer (a) against the
# 'estnltk_sentences' layer (b); results go to 'sentences_diff_layer'.
sentences_diff_tagger = DiffTagger(
    layer_a='sentences',
    layer_b='estnltk_sentences',
    output_layer='sentences_diff_layer',
    output_attributes=('span_status', ),
    span_status_attribute='span_status',
)
# Bare expression: show the tagger's configuration as the cell output.
sentences_diff_tagger

name,output layer,output attributes,input layers
DiffTagger,sentences_diff_layer,"('input_layer_name', 'span_status')","('sentences', 'estnltk_sentences')"

0,1
input_layer_attribute,input_layer_name
span_status_attribute,span_status
compare_function,<function _operator.eq>


In [5]:
# Initialize words difftagger
from estnltk.taggers import DiffTagger

# Diff tagger comparing the 'words' layer (a) against the
# 'estnltk_words' layer (b); results go to 'words_diff_layer'.
words_diff_tagger = DiffTagger(
    layer_a='words',
    layer_b='estnltk_words',
    output_layer='words_diff_layer',
    output_attributes=('span_status', ),
    span_status_attribute='span_status',
)
# Bare expression: show the tagger's configuration as the cell output.
words_diff_tagger

name,output layer,output attributes,input layers
DiffTagger,words_diff_layer,"('input_layer_name', 'span_status')","('words', 'estnltk_words')"

0,1
input_layer_attribute,input_layer_name
span_status_attribute,span_status
compare_function,<function _operator.eq>


In [6]:
# Find differences between estnltk's and conllu's words and sentences.
#
# Only one corpus file is processed, so its path is built directly instead of
# scanning the directory listing for a matching name.
fname = 'et_edt-ud-dev.conllu'  # take the smallest file for testing
fpath = os.path.join(ud_corpus_dir, fname)
# Fail fast: without this file the cells below would hit an undefined `text`
# and diff layers that were never computed.
assert os.path.isfile(fpath), 'Corpus file not found: {}'.format(fpath)

text = conll_to_text(file=fpath, syntax_layer='conll')
print (fname)
print('  ',text.layers)
print('   Adding estnltk segmentation ...')
add_estnltk_segmentation( text )
print('   Finding diffs ...')
# Each tagger call gets its own fresh status dict; the dict is not read afterwards.
status = {}
words_diff_layer = words_diff_tagger(text, status).words_diff_layer
status = {}
sentences_diff_layer = sentences_diff_tagger(text, status).sentences_diff_layer

et_edt-ud-dev.conllu
   dict_keys(['conll', 'sentences', 'words'])
   Adding estnltk segmentation ...
   Finding diffs ...


## Sentences diff

In [7]:
# Summary counts of the sentence-level comparison, read from the diff layer's metadata.
sentences_diff_layer.meta

{'conflicts': 657,
 'extra_annotations': 426,
 'extra_spans': 426,
 'missing_annotations': 563,
 'missing_spans': 563,
 'modified_spans': 0,
 'overlapped': 98,
 'prolonged': 470,
 'shortened': 89,
 'unchanged_annotations': 2562,
 'unchanged_spans': 2562}

In [8]:
# Inspect the first 10 entries of the sentence diff layer.
sentences_diff_layer[0:10]

layer name,attributes,parent,enveloping,ambiguous,span count
sentences_diff_layer,"input_layer_name, span_status",,words,True,10

text,input_layer_name,span_status
"['Ainult', 'kaks', 'maali', '(', '""', 'Astronoom', '""', 'ja', '""', 'Geograaf', ' ..., type: <class 'list'>, length: 20",estnltk_sentences,extra
"['Ainult', 'kaks', 'maali', '(', '""', 'Astronoom', '""', 'ja', '""', 'Geograaf', ' ..., type: <class 'list'>, length: 18",sentences,missing
"['Kui', 'te', 'aga', 'mõtlete', '""', 'Võluflöödile', '""', ',', 'siis', 'selle', ..., type: <class 'list'>, length: 31",estnltk_sentences,extra
"['Kui', 'te', 'aga', 'mõtlete', '""', 'Võluflöödile', '""', ',', 'siis', 'selle', ..., type: <class 'list'>, length: 29",sentences,missing
"['Ma', 'ei', 'ütle', ',', 'et', 'Fellini', 'on', 'parim', ',', 'seal', 'on', 've ..., type: <class 'list'>, length: 55",estnltk_sentences,extra
"['Ma', 'ei', 'ütle', ',', 'et', 'Fellini', 'on', 'parim', ',', 'seal', 'on', 've ..., type: <class 'list'>, length: 53",sentences,missing
"['""', 'Writing', 'to', 'Vermeer', '""', 'esietendub', 'Amsterdamis', '1.', 'detse ..., type: <class 'list'>, length: 11",sentences,missing
"['""', 'Writing', 'to', 'Vermeer', '""', 'esietendub', 'Amsterdamis', '1.', 'detsembril', '1999 .']",estnltk_sentences,extra
"['Peter', 'Greenaway', 'internetis', ':']",sentences,missing
"['Peter', 'Greenaway', 'internetis', ':', 'Peter', 'Greenaway', 'mängufilmid', ' ..., type: <class 'list'>, length: 114",estnltk_sentences,extra


In [9]:
# How can the differing elements be retrieved from the original layers?
# Indices (or something similar) could be provided somewhere; for now we have to search manually ...
for sid in (idx for idx, sentence in enumerate( text.sentences )
            if sentence.enclosing_text.startswith('Ainult kaks maali')):
    print (sid)

16


In [10]:
# These sentences appear equal; at this point it is unclear why they are counted as different ...
conllu_sentence_text = text.sentences[ 16 ].enclosing_text
estnltk_sentence_text = text.estnltk_sentences[ 16 ].enclosing_text
print('conllu: ', conllu_sentence_text)
print('estnltk:', estnltk_sentence_text)
# Bare expression: the equality result becomes the cell output.
conllu_sentence_text == estnltk_sentence_text

conllu:  Ainult kaks maali ( " Astronoom " ja " Geograaf " -- H. L. ) kujutavad mehi .
estnltk: Ainult kaks maali ( " Astronoom " ja " Geograaf " -- H. L. ) kujutavad mehi .


True

In [11]:
# Same story again: the sentences are essentially equal, and it is currently unclear what the difference is ...
for sid in (idx for idx, sentence in enumerate( text.sentences )
            if sentence.enclosing_text.startswith('Kui te aga mõtlete')):
    print (sid)

conllu_sentence_text = text.sentences[ 26 ].enclosing_text
estnltk_sentence_text = text.estnltk_sentences[ 26 ].enclosing_text
print('conllu: ', conllu_sentence_text)
print('estnltk:', estnltk_sentence_text)
# Bare expression: the equality result becomes the cell output.
conllu_sentence_text == estnltk_sentence_text

26
conllu:  Kui te aga mõtlete " Võluflöödile " , siis selle idee tuli libretistilt ja teatriomanikult ( Emanuel Schikaneder -- H. L. ) , kes tahtis ise mängida Papagenot .
estnltk: Kui te aga mõtlete " Võluflöödile " , siis selle idee tuli libretistilt ja teatriomanikult ( Emanuel Schikaneder -- H. L. ) , kes tahtis ise mängida Papagenot .


True

In [12]:
# OK, now I see where things go astray.
# Namely, sentences_diff_layer relies on the 'words' layer, and if 'words' differs from 'estnltk_words',
# the sentences are also counted as different, even though they need not differ in terms of sentence boundaries ...

# Print the word lists of sentence 16: estnltk's segmentation first, then the conllu one.
for sentence_layer in (text.estnltk_sentences, text.sentences):
    print ( sentence_layer[ 16 ].text )

['Ainult', 'kaks', 'maali', '(', '"', 'Astronoom', '"', 'ja', '"', 'Geograaf', '"', '--', 'H', '.', 'L', '.', ')', 'kujutavad', 'mehi', '.']
['Ainult', 'kaks', 'maali', '(', '"', 'Astronoom', '"', 'ja', '"', 'Geograaf', '"', '--', 'H.', 'L.', ')', 'kujutavad', 'mehi', '.']


In [13]:
#
#  Would it help to convert both 'sentences' and 'estnltk_sentences' into
#  flat layers and only then compare them with each other?
#

## Words diff

In [14]:
# Summary counts of the word-level comparison, read from the diff layer's metadata.
words_diff_layer.meta

{'conflicts': 592,
 'extra_annotations': 44616,
 'extra_spans': 406,
 'missing_annotations': 44632,
 'missing_spans': 422,
 'modified_spans': 44210,
 'overlapped': 16,
 'prolonged': 325,
 'shortened': 251,
 'unchanged_annotations': 0,
 'unchanged_spans': 0}

In [15]:
# OK, one reason for the differences is that the conllu words lack the attribute normalized_form,
# whereas estnltk_words has it:
for label, words_layer in (('conllu words attributes: ', text.words),
                           ('estnltk words attributes:', text.estnltk_words)):
    print (label, words_layer.attributes)
# ... which explains the high 'modified_spans', 'extra_annotations' and 'missing_annotations' counts.

conllu words attributes:  ()
estnltk words attributes: ('normalized_form',)


In [16]:
from estnltk.taggers.standard_taggers.diff_tagger import iterate_diff_conflicts
# Display conflicting span pairs from the words diff layer, stopping after the 12th pair.
conflict_pairs = iterate_diff_conflicts(words_diff_layer, 'span_status')
for pair_no, (span_a, span_b) in enumerate(conflict_pairs, start=1):
    print('=' * 45)
    print('Conflicting pair:')
    display(span_a)
    display(span_b)
    if pair_no >= 12:
        break

Conflicting pair:


text,input_layer_name,span_status
H.,words,missing


text,input_layer_name,span_status
H,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
H.,words,missing


text,input_layer_name,span_status
.,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
L.,words,missing


text,input_layer_name,span_status
L,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
L.,words,missing


text,input_layer_name,span_status
.,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
H.,words,missing


text,input_layer_name,span_status
H,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
H.,words,missing


text,input_layer_name,span_status
.,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
L.,words,missing


text,input_layer_name,span_status
L,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
L.,words,missing


text,input_layer_name,span_status
.,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
1/2,words,missing


text,input_layer_name,span_status
1,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
1/2,words,missing


text,input_layer_name,span_status
/,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
1/2,words,missing


text,input_layer_name,span_status
2,estnltk_words,extra


Conflicting pair:


text,input_layer_name,span_status
1999,words,missing


text,input_layer_name,span_status
1999 .,estnltk_words,extra


In [17]:
#
#    For the 'words' layer, things generally look OK;
#
#    It remains open whether estnltk_words.normalized_form needs to be taken into account
#    in the comparison. Since the other annotation layers do not have it and we do not do
#    spelling correction either, I lean towards "rather not". So another option is to remove
#    'normalized_form' from estnltk_words altogether, so that 'modified_spans' ==> 'unchanged_spans'.
#
#    And then there is the question of how to count differences:
#    e.g. ['H.'] -- 1 missing, ['H', '.'] -- 2 extra, so the total number of differences depends
#        on which layer's side you look from. These differences could also be summed up by
#        saying there is "a gap (of differences) in the middle of equal spans".
#