# DiffTagger
The output layer of the `DiffTagger` contains the differences between its two input layers.

In [1]:
from estnltk import Layer
from estnltk.taggers import DiffTagger

# Class-level display switch on Layer; the layer tables rendered below
# include `start` and `end` columns.
Layer.print_start_end = True

# The status attribute name is passed to the tagger twice (as an output
# attribute and as `span_status_attribute`), so spell it once.
STATUS_ATTRIBUTE = 'span_status'

# Compare the old layer (`layer_5`) against the new one (`layer_5_new`)
# and write the differences into `diff_layer`.
diff_tagger = DiffTagger(
    layer_a='layer_5',
    layer_b='layer_5_new',
    output_layer='diff_layer',
    output_attributes=(STATUS_ATTRIBUTE, 'attr', 'attr_5'),
    span_status_attribute=STATUS_ATTRIBUTE,
)
diff_tagger

name,output layer,output attributes,input layers
DiffTagger,diff_layer,"('input_layer_name', 'span_status', 'attr', 'attr_5')","('layer_5', 'layer_5_new')"

0,1
input_layer_attribute,input_layer_name
span_status_attribute,span_status
compare_function,<function _operator.eq>


Ambiguous enveloping demo layer.

In [2]:
from estnltk.tests import new_text
from estnltk.layer_operations import merge_layers

# Build the demo text with the helper from estnltk's test utilities.
# NOTE(review): the argument 5 presumably selects a predefined example text - confirm.
text = new_text(5)
# Keep a direct handle on `layer_5`; it is the tagger's `layer_a` (the "old" layer).
layer_5 = text.layer_5
layer_5

layer name,attributes,parent,enveloping,ambiguous,span count
layer_5,"attr, attr_5",,layer_1,True,9

text,start,end,attr,attr_5
"['Sada', 'kaks']",0,9,L5-0-0,SADA KAKS
"['Sada', 'kakskümmend']",0,16,L5-1-1,SADA KAKS
,0,16,L5-1-2,SADA KÜMME
,0,16,L5-1-3,SADA KAKSKÜMMEND
"['Sada', 'kakskümmend', 'kolm']",0,21,L5-2-4,SADA KAKSKÜMMEND KOLM
"['Neli', 'tuhat']",23,33,L5-3-5,NELI TUHAT
"['tuhat', 'viis']",28,38,L5-4-6,TUHAT VIIS
['viis'],34,38,L5-5-7,VIIS
"['viissada', 'kuuskümmend']",34,54,L5-6-8,VIISSADA KUUS
,34,54,L5-6-9,VIISSADA KÜMME


In [3]:
# Take a copy of layer_5 under a new name before the original is edited.
layer_5_new = merge_layers(
    layers=[layer_5],
    output_layer='layer_5_new',
    output_attributes=['attr', 'attr_5'],
)

# Remove three spans from the old layer. Deleting from the highest index
# downwards keeps the remaining indices valid (assuming list-like `spans`).
for span_index in (8, 6, 5):
    del layer_5.spans[span_index]
layer_5

layer name,attributes,parent,enveloping,ambiguous,span count
layer_5,"attr, attr_5",,layer_1,True,6

text,start,end,attr,attr_5
"['Sada', 'kaks']",0,9,L5-0-0,SADA KAKS
"['Sada', 'kakskümmend']",0,16,L5-1-1,SADA KAKS
,0,16,L5-1-2,SADA KÜMME
,0,16,L5-1-3,SADA KAKSKÜMMEND
"['Sada', 'kakskümmend', 'kolm']",0,21,L5-2-4,SADA KAKSKÜMMEND KOLM
"['Neli', 'tuhat']",23,33,L5-3-5,NELI TUHAT
"['tuhat', 'viis']",28,38,L5-4-6,TUHAT VIIS
"['kümme', 'koma', 'kaheksa']",47,74,L5-7-11,KÜMME KOMA KAHEKSA


Change the copy of the demo layer (`layer_5_new`) so that it differs from `layer_5`.

In [4]:
# Change one attribute value: first annotation of the span at index 1.
changed_annotation = layer_5_new[1][0]
changed_annotation.attr_5 = 'sada kaks'

# Drop the second annotation of the span at index 6.
del layer_5_new.spans[6][1]

# Drop two whole spans, highest index first so the other index stays valid.
for span_index in (4, 3):
    del layer_5_new.spans[span_index]

# Attach the modified copy to the text so the tagger can find it by name.
text['layer_5_new'] = layer_5_new
layer_5_new

layer name,attributes,parent,enveloping,ambiguous,span count
layer_5_new,"attr, attr_5",,layer_1,True,7

text,start,end,attr,attr_5
"['Sada', 'kaks']",0,9,L5-0-0,SADA KAKS
"['Sada', 'kakskümmend']",0,16,L5-1-1,sada kaks
,0,16,L5-1-2,SADA KÜMME
,0,16,L5-1-3,SADA KAKSKÜMMEND
"['Sada', 'kakskümmend', 'kolm']",0,21,L5-2-4,SADA KAKSKÜMMEND KOLM
['viis'],34,38,L5-5-7,VIIS
"['viissada', 'kuuskümmend']",34,54,L5-6-8,VIISSADA KUUS
,34,54,L5-6-10,VIISSADA KUUSKÜMMEND
"['kümme', 'koma', 'kaheksa']",47,74,L5-7-11,KÜMME KOMA KAHEKSA
"['seitse', 'koma', 'kaheksa']",55,74,L5-8-12,SEITSE KOMA KAHEKSA


Create and show layer of differences.

In [5]:
# `status` starts empty; the tagger call fills it with the difference
# statistics that are displayed in the next section.
status = {}
tagged_text = diff_tagger(text, status)
tagged_text.diff_layer

layer name,attributes,parent,enveloping,ambiguous,span count
diff_layer,"input_layer_name, span_status, attr, attr_5",,layer_1,True,6

text,start,end,input_layer_name,span_status,attr,attr_5
"['Sada', 'kakskümmend']",0,16,layer_5,modified,L5-1-1,SADA KAKS
,0,16,layer_5_new,modified,L5-1-1,sada kaks
"['Neli', 'tuhat']",23,33,layer_5,missing,L5-3-5,NELI TUHAT
"['tuhat', 'viis']",28,38,layer_5,missing,L5-4-6,TUHAT VIIS
['viis'],34,38,layer_5_new,extra,L5-5-7,VIIS
"['viissada', 'kuuskümmend']",34,54,layer_5_new,extra,L5-6-8,VIISSADA KUUS
,34,54,layer_5_new,extra,L5-6-10,VIISSADA KUUSKÜMMEND
"['seitse', 'koma', 'kaheksa']",55,74,layer_5_new,extra,L5-8-12,SEITSE KOMA KAHEKSA


## `status`
`status` contains descriptive statistics of differences.

In [6]:
# The dictionary that was passed to the `diff_tagger(text, status)` call above.
status

{'modified_spans': 1,
 'missing_spans': 2,
 'extra_spans': 3,
 'extra_annotations': 5,
 'missing_annotations': 3,
 'overlapped': 1,
 'prolonged': 0,
 'shortened': 1,
 'conflicts': 2,
 'unchanged_spans': 3,
 'unchanged_annotations': 5}

    unchanged_spans + modified_spans + missing_spans = length_of_old_layer
    unchanged_spans + modified_spans + extra_spans = length_of_new_layer
    unchanged_annotations + missing_annotations = number_of_annotations_in_old_layer
    unchanged_annotations + extra_annotations   = number_of_annotations_in_new_layer
    overlapped + prolonged + shortened = conflicts <= missing_spans x extra_spans
    
A **raw span** is the text segment corresponding to the span (a span without annotations).

A span is a **missing span** if its raw span is in the first layer but not in the second layer.

A span is an **extra span** if its raw span is in the second layer but not in the first layer.

A span is **modified** if the raw span exists in the first and in the second layer but the annotations are different.

A **conflict** is a pair of a missing span and an extra span that partially cover the same text. The annotations are not compared in this case.

# Iterate layer of differences

In [7]:
from estnltk.taggers.standard_taggers.diff_tagger import iterate_modified
from estnltk.taggers.standard_taggers.diff_tagger import iterate_missing
from estnltk.taggers.standard_taggers.diff_tagger import iterate_extra
from estnltk.taggers.standard_taggers.diff_tagger import iterate_diff_conflicts
from estnltk.taggers.standard_taggers.diff_tagger import iterate_overlapped
from estnltk.taggers.standard_taggers.diff_tagger import iterate_prolonged
from estnltk.taggers.standard_taggers.diff_tagger import iterate_shortened

## Iterate modified spans
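Note that only the differing annotations of a modified span are shown: those that are missing from or extra in the new layer. Annotations that are the same in both layers are omitted.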

In [8]:
# Display every span of the diff layer whose status is 'modified'.
modified_spans = iterate_modified(text.diff_layer, 'span_status')
for modified_span in modified_spans:
    modified_span.display()

text,start,end,input_layer_name,span_status,attr,attr_5
"[Sada, kakskümmend]",0,16,layer_5,modified,L5-1-1,SADA KAKS
,0,16,layer_5_new,modified,L5-1-1,sada kaks


## Iterate missing spans

In [9]:
# Display every span that exists only in the old layer (status 'missing').
missing_spans = iterate_missing(text.diff_layer, 'span_status')
for missing_span in missing_spans:
    missing_span.display()

text,start,end,input_layer_name,span_status,attr,attr_5
"[Neli, tuhat]",23,33,layer_5,missing,L5-3-5,NELI TUHAT


text,start,end,input_layer_name,span_status,attr,attr_5
"[tuhat, viis]",28,38,layer_5,missing,L5-4-6,TUHAT VIIS


## Iterate extra spans

In [10]:
# Display every span that exists only in the new layer (status 'extra').
extra_spans = iterate_extra(text.diff_layer, 'span_status')
for extra_span in extra_spans:
    extra_span.display()

text,start,end,input_layer_name,span_status,attr,attr_5
[viis],34,38,layer_5_new,extra,L5-5-7,VIIS


text,start,end,input_layer_name,span_status,attr,attr_5
"[viissada, kuuskümmend]",34,54,layer_5_new,extra,L5-6-8,VIISSADA KUUS
,34,54,layer_5_new,extra,L5-6-10,VIISSADA KUUSKÜMMEND


text,start,end,input_layer_name,span_status,attr,attr_5
"[seitse, koma, kaheksa]",55,74,layer_5_new,extra,L5-8-12,SEITSE KOMA KAHEKSA


## Iterate overlapped spans

In [11]:
# Each yielded pair is displayed as (span from the old layer, span from the new layer).
overlapped_pairs = iterate_overlapped(text.diff_layer, 'span_status')
for missing_span, extra_span in overlapped_pairs:
    missing_span.display()
    extra_span.display()

text,start,end,input_layer_name,span_status,attr,attr_5
"[tuhat, viis]",28,38,layer_5,missing,L5-4-6,TUHAT VIIS


text,start,end,input_layer_name,span_status,attr,attr_5
"[viissada, kuuskümmend]",34,54,layer_5_new,extra,L5-6-8,VIISSADA KUUS
,34,54,layer_5_new,extra,L5-6-10,VIISSADA KUUSKÜMMEND


## Iterate prolonged spans

In [12]:
# Nothing is displayed for this example: status['prolonged'] is 0.
prolonged_pairs = iterate_prolonged(text.diff_layer, 'span_status')
for missing_span, extra_span in prolonged_pairs:
    missing_span.display()
    extra_span.display()

## Iterate shortened spans

In [13]:
# Each yielded pair is displayed as (span from the old layer, span from the new layer).
shortened_pairs = iterate_shortened(text.diff_layer, 'span_status')
for missing_span, extra_span in shortened_pairs:
    missing_span.display()
    extra_span.display()

text,start,end,input_layer_name,span_status,attr,attr_5
"[tuhat, viis]",28,38,layer_5,missing,L5-4-6,TUHAT VIIS


text,start,end,input_layer_name,span_status,attr,attr_5
[viis],34,38,layer_5_new,extra,L5-5-7,VIIS


## Iterate all conflicts between missing and extra spans

In [14]:
# Walk over all conflicting (missing span, extra span) pairs, whatever
# their kind, and label each pair in the output.
conflicting_pairs = iterate_diff_conflicts(text.diff_layer, 'span_status')
for missing_span, extra_span in conflicting_pairs:
    print('Conflicting pair:')
    missing_span.display()
    extra_span.display()

Conflicting pair:


text,start,end,input_layer_name,span_status,attr,attr_5
"[tuhat, viis]",28,38,layer_5,missing,L5-4-6,TUHAT VIIS


text,start,end,input_layer_name,span_status,attr,attr_5
[viis],34,38,layer_5_new,extra,L5-5-7,VIIS


Conflicting pair:


text,start,end,input_layer_name,span_status,attr,attr_5
"[tuhat, viis]",28,38,layer_5,missing,L5-4-6,TUHAT VIIS


text,start,end,input_layer_name,span_status,attr,attr_5
"[viissada, kuuskümmend]",34,54,layer_5_new,extra,L5-6-8,VIISSADA KUUS
,34,54,layer_5_new,extra,L5-6-10,VIISSADA KUUSKÜMMEND
