# DiffTagger
The output layer of the `DiffTagger` consists of the differences between its two input layers.

In [1]:
from estnltk import Layer, Text
from estnltk.taggers import DiffTagger

# Presumably makes layer tables show the `start`/`end` columns seen in the outputs below.
Layer.print_start_end = True

# Tagger that compares `layer_a` with `layer_b` and writes the result to `diff_layer`.
# Note: the tagger summary displayed below also lists `input_layer_name` among the
# output attributes, even though it is not passed here.
diff_tagger = DiffTagger(
    layer_a='layer_a',
    layer_b='layer_b',
    output_layer='diff_layer',
    output_attributes=('span_status', 'attr', 'attr_5'),
    span_status_attribute='span_status',
)
diff_tagger

name,output layer,output attributes,input layers
DiffTagger,diff_layer,"('input_layer_name', 'span_status', 'attr', 'attr_5')","('layer_a', 'layer_b')"

0,1
input_layer_attribute,input_layer_name
span_status_attribute,span_status
compare_function,<function _operator.eq>


Create the ambiguous enveloping demo layer `layer_a`.

In [2]:
text = Text('Sada kakskümmend kolm. Neli tuhat viissada kuuskümmend seitse koma kaheksa. Üheksakümmend tuhat.')

# Base layer: one row per annotation as (start, end, attr, attr_1).
# Rows that repeat the same (start, end) add several annotations to one ambiguous span.
layer_1_rows = [
    ( 0,  4, 'L1-0',  'SADA'),
    ( 5,  9, 'L1-1',  'KAKS'),
    ( 5, 16, 'L1-2',  'KAKS'),
    ( 5, 16, 'L1-2',  'KÜMME'),
    ( 5, 16, 'L1-2',  'KAKSKÜMMEND'),
    ( 9, 14, 'L1-3',  'KÜMME'),
    (17, 21, 'L1-4',  'KOLM'),
    (23, 27, 'L1-5',  'NELI'),
    (28, 33, 'L1-6',  'TUHAT'),
    (34, 38, 'L1-7',  'VIIS'),
    (34, 42, 'L1-8',  'SADA'),
    (34, 42, 'L1-8',  'VIIS'),
    (34, 42, 'L1-8',  'VIISSADA'),
    (38, 42, 'L1-9',  'SADA'),
    (43, 47, 'L1-10', 'KUUS'),
    (43, 54, 'L1-11', 'KUUS'),
    (43, 54, 'L1-11', 'KÜMME'),
    (43, 54, 'L1-11', 'KUUSKÜMMEND'),
    (47, 52, 'L1-12', 'KÜMME'),
    (55, 61, 'L1-13', 'SEITSE'),
    (62, 66, 'L1-14', 'KOMA'),
    (67, 74, 'L1-15', 'KAHEKSA'),
    (76, 82, 'L1-16', 'ÜHEKSA'),
    (76, 89, 'L1-17', 'ÜHEKSA'),
    (76, 89, 'L1-17', 'KÜMME'),
    (76, 89, 'L1-17', 'ÜHEKSAKÜMMEND'),
    (82, 87, 'L1-18', 'KÜMME'),
]

layer_1 = Layer('layer_1', attributes=['attr', 'attr_1'], text_object=text, ambiguous=True)
for start, end, attr, attr_1 in layer_1_rows:
    layer_1.add_annotation((start, end), attr=attr, attr_1=attr_1)

# Enveloping layer: one row per annotation as (indices into layer_1, attr, attr_5).
# The indices address spans of `layer_1`; annotations sharing a base span count once.
layer_a_rows = [
    ([0, 1],       'L5-0-0',  'SADA KAKS'),
    ([0, 2, 4],    'L5-2-4',  'SADA KAKSKÜMMEND KOLM'),
    ([5, 6],       'L5-3-5',  'NELI TUHAT'),
    ([7],          'L5-5-7',  'VIIS'),
    ([8, 11],      'L5-6-8',  'VIISSADA KUUS'),
    ([8, 11],      'L5-6-9',  'VIISSADA KÜMME'),
    ([8, 11],      'L5-6-10', 'VIISSADA KUUSKÜMMEND'),
    ([12, 14, 15], 'L5-7-11', 'KÜMME KOMA KAHEKSA'),
    # Lower case here, upper case in `layer_b`: this span is reported as modified.
    ([13, 14, 15], 'L5-8-12', 'seitse koma kaheksa'),
]

layer_a = Layer('layer_a', attributes=['attr', 'attr_5'], text_object=text, ambiguous=True, enveloping='layer_1')
for span_indices, attr, attr_5 in layer_a_rows:
    layer_a.add_annotation([layer_1[i] for i in span_indices], attr=attr, attr_5=attr_5)

layer_a

layer name,attributes,parent,enveloping,ambiguous,span count
layer_a,"attr, attr_5",,layer_1,True,7

text,start,end,attr,attr_5
"['Sada', 'kaks']",0,9,L5-0-0,SADA KAKS
"['Sada', 'kakskümmend', 'kolm']",0,21,L5-2-4,SADA KAKSKÜMMEND KOLM
"['Neli', 'tuhat']",23,33,L5-3-5,NELI TUHAT
['viis'],34,38,L5-5-7,VIIS
"['viissada', 'kuuskümmend']",34,54,L5-6-8,VIISSADA KUUS
,34,54,L5-6-9,VIISSADA KÜMME
,34,54,L5-6-10,VIISSADA KUUSKÜMMEND
"['kümme', 'koma', 'kaheksa']",47,74,L5-7-11,KÜMME KOMA KAHEKSA
"['seitse', 'koma', 'kaheksa']",55,74,L5-8-12,seitse koma kaheksa


Create the ambiguous enveloping demo layer `layer_b`.

In [3]:
# One row per annotation as (indices into layer_1, attr, attr_5).
# Rows that repeat the same indices add several annotations to one ambiguous span.
layer_b_rows = [
    ([0, 2],       'L5-1-1',  'SADA KAKS'),
    ([0, 2],       'L5-1-2',  'SADA KÜMME'),
    ([0, 2],       'L5-1-3',  'SADA KAKSKÜMMEND'),
    ([6, 7],       'L5-4-6',  'TUHAT VIIS'),
    ([8, 11],      'L5-6-10', 'VIISSADA KUUSKÜMMEND'),
    ([12, 14, 15], 'L5-7-11', 'KÜMME KOMA KAHEKSA'),
    ([13, 14, 15], 'L5-8-12', 'SEITSE KOMA KAHEKSA'),
]

layer_b = Layer('layer_b', attributes=['attr', 'attr_5'], text_object=text, ambiguous=True, enveloping='layer_1')
for span_indices, attr, attr_5 in layer_b_rows:
    layer_b.add_annotation([layer_1[i] for i in span_indices], attr=attr, attr_5=attr_5)

layer_b

layer name,attributes,parent,enveloping,ambiguous,span count
layer_b,"attr, attr_5",,layer_1,True,5

text,start,end,attr,attr_5
"['Sada', 'kakskümmend']",0,16,L5-1-1,SADA KAKS
,0,16,L5-1-2,SADA KÜMME
,0,16,L5-1-3,SADA KAKSKÜMMEND
"['tuhat', 'viis']",28,38,L5-4-6,TUHAT VIIS
"['viissada', 'kuuskümmend']",34,54,L5-6-10,VIISSADA KUUSKÜMMEND
"['kümme', 'koma', 'kaheksa']",47,74,L5-7-11,KÜMME KOMA KAHEKSA
"['seitse', 'koma', 'kaheksa']",55,74,L5-8-12,SEITSE KOMA KAHEKSA


Create and show layer of differences.

In [4]:
# The input layers are handed over explicitly, keyed by the layer names
# the tagger was configured with.
input_layers = {'layer_a': layer_a, 'layer_b': layer_b}
diff_layer = diff_tagger.make_layer(text, layers=input_layers)
diff_layer

0,1
conflicts,4
extra_annotations,5
extra_spans,2
missing_annotations,7
missing_spans,4
modified_spans,2
overlapped,1
prolonged,2
shortened,1
unchanged_annotations,2

layer name,attributes,parent,enveloping,ambiguous,span count
diff_layer,"input_layer_name, span_status, attr, attr_5",,layer_1,True,8

text,start,end,input_layer_name,span_status,attr,attr_5
"['Sada', 'kaks']",0,9,layer_a,missing,L5-0-0,SADA KAKS
"['Sada', 'kakskümmend']",0,16,layer_b,extra,L5-1-1,SADA KAKS
,0,16,layer_b,extra,L5-1-2,SADA KÜMME
,0,16,layer_b,extra,L5-1-3,SADA KAKSKÜMMEND
"['Sada', 'kakskümmend', 'kolm']",0,21,layer_a,missing,L5-2-4,SADA KAKSKÜMMEND KOLM
"['Neli', 'tuhat']",23,33,layer_a,missing,L5-3-5,NELI TUHAT
"['tuhat', 'viis']",28,38,layer_b,extra,L5-4-6,TUHAT VIIS
['viis'],34,38,layer_a,missing,L5-5-7,VIIS
"['viissada', 'kuuskümmend']",34,54,layer_a,modified,L5-6-8,VIISSADA KUUS
,34,54,layer_a,modified,L5-6-9,VIISSADA KÜMME


## Diff summary
Diff layer `meta` contains summary of differences.

In [5]:
# Summary counters of the comparison; displayed below as a dict.
diff_layer.meta

{'modified_spans': 2,
 'missing_spans': 4,
 'extra_spans': 2,
 'extra_annotations': 5,
 'missing_annotations': 7,
 'overlapped': 1,
 'prolonged': 2,
 'shortened': 1,
 'conflicts': 4,
 'unchanged_spans': 1,
 'unchanged_annotations': 2}

    unchanged_spans + modified_spans + missing_spans = length_of_old_layer
    unchanged_spans + modified_spans + extra_spans = length_of_new_layer
    unchanged_annotations + missing_annotations = number_of_annotations_in_old_layer
    unchanged_annotations + extra_annotations   = number_of_annotations_in_new_layer
    overlapped + prolonged + shortened = conflicts <= missing_spans x extra_spans

A span is a **missing span** if its base span is in the first layer (here `layer_a`, the *old* layer in the identities above) but not in the second layer (here `layer_b`, the *new* layer).

A span is an **extra span** if its base span is in the second layer but not in the first layer.

A span is **modified** if the base span exists in the first and in the second layer but the annotations are different.

A **conflict** is a pair of a missing span and an extra span that partially cover the same text. The annotations are not compared in this case.

# Iterate layer of differences

In [6]:
from estnltk.taggers.system.diff_tagger import iterate_modified
from estnltk.taggers.system.diff_tagger import iterate_missing
from estnltk.taggers.system.diff_tagger import iterate_extra
from estnltk.taggers.system.diff_tagger import iterate_diff_conflicts
from estnltk.taggers.system.diff_tagger import iterate_overlapped
from estnltk.taggers.system.diff_tagger import iterate_prolonged
from estnltk.taggers.system.diff_tagger import iterate_shortened

## Iterate modified spans
Note that only missing and extra annotations of the span are shown.

In [7]:
# Display every span of the diff layer whose status is "modified".
for modified_span in iterate_modified(diff_layer, 'span_status'):
    display(modified_span)

text,input_layer_name,span_status,attr,attr_5
viissada kuuskümmend,layer_a,modified,L5-6-8,VIISSADA KUUS
,layer_a,modified,L5-6-9,VIISSADA KÜMME


text,input_layer_name,span_status,attr,attr_5
seitse koma kaheksa,layer_a,modified,L5-8-12,seitse koma kaheksa
,layer_b,modified,L5-8-12,SEITSE KOMA KAHEKSA


## Iterate missing spans

In [8]:
# Display every span of the diff layer whose status is "missing".
for missing_span in iterate_missing(diff_layer, 'span_status'):
    display(missing_span)

text,input_layer_name,span_status,attr,attr_5
Sada kaks,layer_a,missing,L5-0-0,SADA KAKS


text,input_layer_name,span_status,attr,attr_5
Sada kakskümmend kolm,layer_a,missing,L5-2-4,SADA KAKSKÜMMEND KOLM


text,input_layer_name,span_status,attr,attr_5
Neli tuhat,layer_a,missing,L5-3-5,NELI TUHAT


text,input_layer_name,span_status,attr,attr_5
viis,layer_a,missing,L5-5-7,VIIS


## Iterate extra spans

In [9]:
# Display every span of the diff layer whose status is "extra".
for extra_span in iterate_extra(diff_layer, 'span_status'):
    display(extra_span)

text,input_layer_name,span_status,attr,attr_5
Sada kakskümmend,layer_b,extra,L5-1-1,SADA KAKS
,layer_b,extra,L5-1-2,SADA KÜMME
,layer_b,extra,L5-1-3,SADA KAKSKÜMMEND


text,input_layer_name,span_status,attr,attr_5
tuhat viis,layer_b,extra,L5-4-6,TUHAT VIIS


## Iterate overlapped spans

In [10]:
# Each item is a pair; in the output below the first element is a missing span
# and the second an extra span.
for missing_span, extra_span in iterate_overlapped(diff_layer, 'span_status'):
    display(missing_span, extra_span)

text,input_layer_name,span_status,attr,attr_5
Neli tuhat,layer_a,missing,L5-3-5,NELI TUHAT


text,input_layer_name,span_status,attr,attr_5
tuhat viis,layer_b,extra,L5-4-6,TUHAT VIIS


## Iterate prolonged spans

In [11]:
# Each item is a pair; in the output below the first element is a missing span
# and the second an extra span.
for missing_span, extra_span in iterate_prolonged(diff_layer, 'span_status'):
    display(missing_span, extra_span)

text,input_layer_name,span_status,attr,attr_5
Sada kaks,layer_a,missing,L5-0-0,SADA KAKS


text,input_layer_name,span_status,attr,attr_5
Sada kakskümmend,layer_b,extra,L5-1-1,SADA KAKS
,layer_b,extra,L5-1-2,SADA KÜMME
,layer_b,extra,L5-1-3,SADA KAKSKÜMMEND


text,input_layer_name,span_status,attr,attr_5
viis,layer_a,missing,L5-5-7,VIIS


text,input_layer_name,span_status,attr,attr_5
tuhat viis,layer_b,extra,L5-4-6,TUHAT VIIS


## Iterate shortened spans

In [12]:
# Each item is a pair; in the output below the first element is a missing span
# and the second an extra span.
for missing_span, extra_span in iterate_shortened(diff_layer, 'span_status'):
    display(missing_span, extra_span)

text,input_layer_name,span_status,attr,attr_5
Sada kakskümmend kolm,layer_a,missing,L5-2-4,SADA KAKSKÜMMEND KOLM


text,input_layer_name,span_status,attr,attr_5
Sada kakskümmend,layer_b,extra,L5-1-1,SADA KAKS
,layer_b,extra,L5-1-2,SADA KÜMME
,layer_b,extra,L5-1-3,SADA KAKSKÜMMEND


## Iterate all conflicts between missing and extra spans

In [13]:
# Walk through all conflicting pairs; in the output below each pair consists of
# a missing span followed by an extra span.
for missing_span, extra_span in iterate_diff_conflicts(diff_layer, 'span_status'):
    print('Conflicting pair:')
    display(missing_span, extra_span)

Conflicting pair:


text,input_layer_name,span_status,attr,attr_5
Sada kaks,layer_a,missing,L5-0-0,SADA KAKS


text,input_layer_name,span_status,attr,attr_5
Sada kakskümmend,layer_b,extra,L5-1-1,SADA KAKS
,layer_b,extra,L5-1-2,SADA KÜMME
,layer_b,extra,L5-1-3,SADA KAKSKÜMMEND


Conflicting pair:


text,input_layer_name,span_status,attr,attr_5
Sada kakskümmend kolm,layer_a,missing,L5-2-4,SADA KAKSKÜMMEND KOLM


text,input_layer_name,span_status,attr,attr_5
Sada kakskümmend,layer_b,extra,L5-1-1,SADA KAKS
,layer_b,extra,L5-1-2,SADA KÜMME
,layer_b,extra,L5-1-3,SADA KAKSKÜMMEND


Conflicting pair:


text,input_layer_name,span_status,attr,attr_5
Neli tuhat,layer_a,missing,L5-3-5,NELI TUHAT


text,input_layer_name,span_status,attr,attr_5
tuhat viis,layer_b,extra,L5-4-6,TUHAT VIIS


Conflicting pair:


text,input_layer_name,span_status,attr,attr_5
viis,layer_a,missing,L5-5-7,VIIS


text,input_layer_name,span_status,attr,attr_5
tuhat viis,layer_b,extra,L5-4-6,TUHAT VIIS
