In [1]:
import csv
from estnltk import Text

In [2]:
from estnltk import Span, Layer
from estnltk.taggers import FlattenTagger
from estnltk.taggers import DiffTagger

In [3]:
from grammarextractor.data_processing.measurement_extraction import MeasurementTokenTagger
from grammarextractor.data_processing.measurement_extraction import MeasurementTagger

In [4]:
# Two-stage measurement extraction: the token tagger writes its results to the
# 'measurement_tokens' layer, and the measurement tagger is configured to read
# its tokens from that same layer.
measurement_token_tagger = MeasurementTokenTagger(output_layer='measurement_tokens')
measurement_tagger = MeasurementTagger(layer_of_tokens='measurement_tokens')

In [5]:
# Load the evaluation examples: every CSV row becomes one list of string fields.
# The cell that builds the gold layers reads field 0 as the text and fields 1-3
# as start, end and value.
#
# newline="" is what the csv module requires so that line breaks embedded in
# quoted fields are parsed correctly. The encoding is given explicitly because
# the examples contain non-ASCII Estonian text and the platform default is not
# UTF-8 everywhere.
# NOTE(review): assumes examples.csv is stored as UTF-8 - confirm.
with open("examples.csv", "r", encoding="utf-8", newline="") as fin:
    reader = csv.reader(fin)
    lines = list(reader)

In [6]:
# Build one Text per CSV row and attach a 'gold' layer to it.
# Row layout used here: line[0] = raw text, line[1] = span start,
# line[2] = span end, line[3] = gold value.
texts_with_gold = []
for line in lines:
    text = Text(line[0])

    # Create the layer once, with the same configuration for every text, so
    # that texts without a gold annotation do not end up with a layer that
    # differs (non-ambiguous) from the annotated ones.
    layer = Layer(name='gold', attributes=['value'], ambiguous=True)

    try:
        start = int(line[1])
        end = int(line[2])
        # NOTE(review): the gold value is stored as a float, while the last
        # cell of this notebook shows the tagger's value as the string '1.32'
        # - check whether the two sides should be normalised to one type.
        value = float(line[3])
    except (ValueError, IndexError):
        # Fields that are empty/non-numeric (ValueError) or absent (IndexError)
        # mean the row carries no gold annotation: keep the layer empty.
        pass
    else:
        # Only the parsing above is guarded; a failure while building the span
        # is a real error and is no longer silently treated as "no annotation".
        layer.add_span(Span(start=start, end=end, legal_attributes=['value'], value=value))

    text['gold'] = layer
    texts_with_gold.append(text)

In [7]:
# Run both measurement taggers over every gold-annotated text, token tagger
# first, and collect the texts in their original order. The return value of
# tag() is not used; the appended object is the same Text that was tagged.
tagged_texts = []
for text in texts_with_gold:
    for tagger in (measurement_token_tagger, measurement_tagger):
        tagger.tag(text)
    tagged_texts.append(text)

In [8]:
# Inspect the 'measurements' layer of the first tagged text.
tagged_texts[0].measurements

layer name,attributes,parent,enveloping,ambiguous,span count
measurements,"name, OBJECT, VALUE, UNIT, DATE, REGEX_TYPE",,measurement_tokens,True,1

text,name,OBJECT,VALUE,UNIT,DATE,REGEX_TYPE
"PSA 19 05 2011 -1,32",MEASUREMENT,PSA,1.32,,19 05 2011,PSA


In [9]:
# Inspect the 'gold' layer of the same text, for comparison with the
# automatically tagged measurements.
tagged_texts[0].gold

layer name,attributes,parent,enveloping,ambiguous,span count
gold,value,,,True,1

text,value
"PSA 19 05 2011 -1,32",1.32


In [10]:
# Tagger that turns the 'measurements' layer into a new 'measurements_flat'
# layer carrying only the attributes 'name' and 'value'.
# attribute_mapping pairs a source attribute with its name in the output layer:
# REGEX_TYPE -> name, VALUE -> value.
flatten_tagger = FlattenTagger(
    input_layer='measurements',
    output_layer='measurements_flat',
    output_attributes=['name', 'value'],
    attribute_mapping=(
        ('REGEX_TYPE', 'name'),
        ('VALUE', 'value'),
    ),
)

In [11]:
# Apply the flatten tagger to the first text. The result is kept in `r`, which
# the following cells use for the diff.
# NOTE(review): the earlier tagging loop ignores the return value of tag(), so
# `r` presumably refers to the same Text object as tagged_texts[0] - confirm.
r = flatten_tagger.tag(tagged_texts[0])

In [12]:
# Show the text together with the summary of all layers attached to it.
r

text
"PSA 19 05 2011 -1,32 ja 26 09 2011 PSA oli pt-i sõnadel 0,044ng/ml."

layer name,attributes,parent,enveloping,ambiguous,span count
gold,value,,,True,1
measurement_tokens,"grammar_symbol, unit_type, value, regex_type",,,True,9
measurements,"name, OBJECT, VALUE, UNIT, DATE, REGEX_TYPE",,measurement_tokens,True,1
measurements_flat,"name, value",,,True,1


In [13]:
# Tagger that compares the hand-made 'gold' layer (layer_a) with the flattened
# tagger output 'measurements_flat' (layer_b) and writes the result to a
# 'diff' layer; 'value' is the attribute requested for the output.
diff_tagger = DiffTagger(
    layer_a='gold',
    layer_b='measurements_flat',
    output_layer='diff',
    output_attributes=['value'],
)

In [14]:
# `status` starts empty and is handed to the diff tagger; after the call it
# holds the summary counters displayed in the next cell.
status = {}
# Run the diff on `r` and display the resulting 'diff' layer.
diff_tagger(r, status).diff

layer name,attributes,parent,enveloping,ambiguous,span count
diff,"span_status, input_layer_name, value",,,True,1

text,span_status,input_layer_name,value
"PSA 19 05 2011 -1,32",modified,gold,1.32
,modified,measurements_flat,1.32


In [15]:
# Summary counters filled in by the diff tagger call above.
status

{'conflicts': 0,
 'extra_annotations': 1,
 'extra_spans': 0,
 'missing_annotations': 1,
 'missing_spans': 0,
 'modified_spans': 1,
 'overlapped': 0,
 'prolonged': 0,
 'shortened': 0,
 'unchanged_annotations': 0,
 'unchanged_spans': 0}

In [16]:
from estnltk.taggers.standard_taggers.diff_tagger import iterate_modified
from estnltk.taggers.standard_taggers.diff_tagger import iterate_extra

In [18]:
# Display every item that iterate_modified yields from the 'diff' layer;
# 'span_status' names the attribute that holds the per-span status.
for a in iterate_modified(r.diff, 'span_status'):
    a.display()

text,start,end,span_status,input_layer_name,value
"PSA 19 05 2011 -1,32",0,20,modified,gold,1.32
,0,20,modified,measurements_flat,1.32


In [21]:
# Look at the second annotation of the first diff span (the one coming from
# 'measurements_flat').
# NOTE(review): its value is shown as the string '1.32', whereas the gold layer
# stores float(line[3]). This type mismatch is presumably why the span is
# reported as 'modified' although both values print as 1.32 - confirm and
# normalise the value type on one side before trusting the diff statistics.
r.diff[0][1]

Annotation(PSA 19 05 2011 -1,32, {'input_layer_name': 'measurements_flat', 'span_status': 'modified', 'value': '1.32'})