# Diff `measurements_old` and `measurements` layers

Connect to the database.

In [1]:
from estnltk.storage import PostgresStorage

storage = PostgresStorage(dbname='egcut_epi',
                          pgpass_file='~/.pgpass',
                          schema="grammarextractor",
                          role='egcut_epi_grammarextractor_create')

collection = storage.get_collection('psa_measurements_v2')

INFO:db.py:1216: connecting to host: 'p12.stacc.ee', port: '5432', dbname: 'egcut_epi', user: 'ptammo'
INFO:db.py:1228: role: 'egcut_epi_grammarextractor_create'


Since `measurements` layer envelopes `measurement_tokens` layer but `measurements_old` layer does not, first flatten `measurements` layer into `measurements_flat` layer. As these layers have different attribute names, mapping of attribute names is also needed.

In [2]:
from estnltk.layer_operations import drop_annotations

In [3]:
from estnltk.storage.postgres import RowMapperRecord

In [4]:
def row_mapper_1(row):

    text_id, text = row[0], row[1]
    
    drop_annotations(layer=text.measurements_old,
                 attribute='name',
                 values={'TIMEX', 'PRAHT', 'PULSS', 'VERERÕHK'},
                 function=None  # default: None
                 )
    
    layer = text.measurements_old
    layer.name = 'measurements_old_new'

    return [RowMapperRecord(layer=layer, meta=None)]

In [5]:
collection.create_layer_buffered('measurements_old_new',
                                 data_iterator=collection.select(layers = ['measurements_old']),
                                 row_mapper=row_mapper_1, overwrite = True, progressbar='notebook')

INFO:db.py:815: collection: 'psa_measurements_v2'
INFO:db.py:836: nothing to overwrite, preparing to create a new layer: 'measurements_old_new'
INFO:db.py:900: layer created: 'measurements_old_new'


In [48]:
not_removed = []
c = 0
for key, value in collection.select(layers = ['measurements_old']):

    not_removed.append(value.measurements_old)
    c += 1
    if c > 200:
        break

ProgrammingError: cursor "read" already exists


In [49]:
removed = []
c = 0
for key, value in collection.select(layers = ['measurements_old']):
    #print(value.measurements_old)
    #break
    drop_annotations(layer=value.measurements_old,
                 attribute='name',
                 values={'TIMEX', 'PRAHT', 'PULSS', 'VERERÕHK'},
                 function=None  # default: None
                 )
    removed.append(value.measurements_old)
    c += 1
    if c > 200:
        break

In [7]:
len(removed)

201

In [8]:
len(not_removed)

201

In [10]:
for rem, not_rem in zip(removed, not_removed):
    if rem != not_rem:
        print(rem)
        print(not_rem)
        break

Layer(name=measurements_old, spans=SL[])
Layer(name=measurements_old, spans=SL[AS[Annotation(15.02.2016, {'day': '15', 'diastolic': None, 'high': None, 'hour': None, 'key': None, 'low': None, 'minute': None, 'month': '02', 'name': 'TIMEX', 'prygi': None, 'pulse': None, 'subject': None, 'systolic': None, 'unit': None, 'value': None, 'year': '2016'})]])


In [16]:
for i in removed:
    if i:
        print(i)
        break

Layer(name=measurements_old, spans=SL[AS[Annotation(Kreatiniin   59, {'day': None, 'diastolic': None, 'high': None, 'hour': None, 'key': 'Kreatiniin', 'low': None, 'minute': None, 'month': None, 'name': 'KREATINIIN', 'prygi': None, 'pulse': None, 'subject': None, 'systolic': None, 'unit': None, 'value': '59', 'year': None})],
AS[Annotation(Kolesterool   3.9, {'day': None, 'diastolic': None, 'high': None, 'hour': None, 'key': 'Kolesterool', 'low': None, 'minute': None, 'month': None, 'name': 'KOLESTEROOL', 'prygi': None, 'pulse': None, 'subject': None, 'systolic': None, 'unit': None, 'value': '3.9', 'year': None})],
AS[Annotation( 
PSA   -4.400 , {'day': None, 'diastolic': None, 'high': None, 'hour': None, 'key': ' \nPSA', 'low': None, 'minute': None, 'month': None, 'name': 'PSA', 'prygi': None, 'pulse': None, 'subject': None, 'systolic': None, 'unit': None, 'value': '4.400', 'year': None})]])


In [None]:
from collections import OrderedDict
from estnltk.taggers import DiffTagger

diff_tagger = DiffTagger(layer_a='measurements_old',
                         layer_b='measurements_flat',
                         output_layer='measurements_diff_old_flat',
                         output_attributes=['name', 'key', 'value', 'unit', 'DATE', 'year']
                        )

meta = OrderedDict([('unchanged_annotations', 'int'),
                    ('missing_annotations', 'int'),
                    ('extra_annotations', 'int'),
                    ('unchanged_spans', 'int'),
                    ('modified_spans', 'int'),
                    ('missing_spans', 'int'),
                    ('extra_spans', 'int'),
                    ('conflicts', 'int'),
                    ('overlapped', 'int'),
                    ('prolonged', 'int'),
                    ('shortened', 'int')])

collection.create_layer_buffered(tagger=diff_tagger,
                                 overwrite=True,
                                 meta=meta,
                                 progressbar='notebook'
                                 )

In [9]:
from collections import OrderedDict
from estnltk.taggers import DiffTagger

diff_tagger = DiffTagger(layer_a='measurements_old',
                         layer_b='measurements_flat',
                         output_layer='measurements_diff_old_flat',
                         output_attributes=['name', 'key', 'value', 'unit', 'DATE', 'year']
                        )

meta = OrderedDict([('unchanged_annotations', 'int'),
                    ('missing_annotations', 'int'),
                    ('extra_annotations', 'int'),
                    ('unchanged_spans', 'int'),
                    ('modified_spans', 'int'),
                    ('missing_spans', 'int'),
                    ('extra_spans', 'int'),
                    ('conflicts', 'int'),
                    ('overlapped', 'int'),
                    ('prolonged', 'int'),
                    ('shortened', 'int')])

collection.create_layer_buffered(tagger=diff_tagger,
                                 overwrite=True,
                                 meta=meta,
                                 progressbar='notebook'
                                 )

INFO:db.py:815: collection: 'psa_measurements_v2'
INFO:db.py:820: overwriting output layer: 'measurements_diff_old_flat'
INFO:db.py:1010: layer deleted: 'measurements_diff_old_flat'


HBox(children=(IntProgress(value=0, max=66926), HTML(value='')))

KeyboardInterrupt: 

## Statistics
Tabel of diff layer meta data.

In [28]:
l = []
for idx, thing in enumerate(collection.select(layers = ['measurements_old', 'measurements_flat'])):
    print(thing)
    
    #print(thing[1].measurements_old._repr_html_())
    l.append(thing[1])
    if idx > 10:
        break
    #break

(1, Text(text='Anamnees 15.02.2016:\nElektroonne saatekiri.'))
(2, Text(text='Nycturai 2-6 x.'))
(3, Text(text='Uriin n.'))
(4, Text(text='Kreatiniin   59-104 90 11.02.2016 Kolesterool   3.9-7.8 5.3 11.02.2016 \nPSA   -4.400 0.661 11.02.2016 .'))
(5, Text(text='Urineerimine ei olevat valus,kuid abikaasa sõnadel on valulik.'))
(6, Text(text='20.09.2013 13:09 - PER, PER - D05687 - E410 - uroloogia \nPSA  2,9  \nTamsulosiini võtnud 1 aasta jooksul , nüktuuria 1x , urineerimine aeglane.'))
(7, Text(text='KATE tõttu ( 2012) marevan ravil.'))
(8, Text(text='UH'))
(9, Text(text='14.10.2015 - Võtnud nüüd aasta Combodarti.'))
(10, Text(text='PSA 3,6.'))
(11, Text(text='Oluliselt ei kaeba.'))
(12, Text(text='12.02.13 SPSA 7,87, kreat 121, urea 6,3, v/s 3,4.'))


In [32]:
l[5].measurements_flat

layer name,attributes,parent,enveloping,ambiguous,span count
measurements_flat,"name, key, value, unit, DATE",,,True,1

text,name,key,value,unit,DATE
"PSA 2,9",MEASUREMENT,PSA,2.9,,


In [33]:
l[5].measurements_old

layer name,attributes,parent,enveloping,ambiguous,span count
measurements_old,"name, key, value, unit, low, high, systolic, diastolic, pulse, year, month, day, hour, minute, prygi, subject",,,True,3

text,name,key,value,unit,low,high,systolic,diastolic,pulse,year,month,day,hour,minute,prygi,subject
20.09.2013 13:09,TIMEX,,,,,,,,,2013.0,9.0,20.0,13.0,9.0,,
"\nPSA 2,9 \n",PSA,\nPSA,29.0,,,,,,,,,,,,,
1 aasta,PRAHT,,,,,,,,,,,,,,1 aasta,


In [4]:
diff_layer = 'measurements_diff_old_flat'

df = collection.get_layer_meta(diff_layer)

In [5]:
df.sum()

id                       2239511275
text_id                  2239578201
unchanged_annotations             0
missing_annotations           70812
extra_annotations             33907
unchanged_spans                   0
modified_spans                 3648
missing_spans                 67164
extra_spans                   30259
conflicts                     27515
overlapped                      706
prolonged                       125
shortened                     26684
dtype: int64

## Random samples
The following iterations give different kind of random samlples without replacement. The domain names of samples (`'modified_spans'`, `missing_spans'` etc) correspond to the summary table columns.

In [24]:
from estnltk.taggers.standard_taggers.diff_tagger import sample_spans

for span in sample_spans(k=100,
                         collection=collection,
                         diff_layer=diff_layer,
                         domain='modified_spans'):
    span.display()

PgStorageException: Layer table 'psa_measurements_v2__measurements_diff_old_flat__layer' does not exist.

In [17]:
for span in sample_spans(2, collection, diff_layer, 'missing_spans'):
    span.display()

keys: (4112, 32536)
text_id: 4112
span_nrs {0}


text,start,end,span_status,input_layer_name,name,key,value,unit,DATE,year
14.01.2013,1,12,missing,measurements_old,TIMEX,,,,,2013


text_id: 32536
span_nrs {7}


In [18]:
for span in sample_spans(2, collection, diff_layer, 'extra_spans'):
    span.display()
    print(span)

keys: (27944, 64005)
text_id: 27944
span_nrs {0}


text,start,end,span_status,input_layer_name,name,key,value,unit,DATE,year
PSA 3,55,60,extra,measurements_flat,MEASUREMENT,PSA,3,,,


AS[Annotation(PSA 3, {'DATE': '', 'input_layer_name': 'measurements_flat', 'key': 'PSA', 'name': 'MEASUREMENT', 'span_status': 'extra', 'unit': '', 'value': '3', 'year': None})]
text_id: 64005
span_nrs {1}


text,start,end,span_status,input_layer_name,name,key,value,unit,DATE,year
"S,P-K 4.5",202,211,extra,measurements_flat,MEASUREMENT,"S,P-K",4.5,,,


AS[Annotation(S,P-K 4.5, {'DATE': '', 'input_layer_name': 'measurements_flat', 'key': 'S,P-K', 'name': 'MEASUREMENT', 'span_status': 'extra', 'unit': '', 'value': '4.5', 'year': None})]


In [19]:
for a, b in sample_spans(2, collection, diff_layer, 'conflicts'):
    a.display()
    b.display()
    print(80*'-')

keys: (55327, 2951)
text_id: 2951
span_nrs {0}
text_id: 55327
span_nrs {0}


In [20]:
for a, b in sample_spans(2, collection, diff_layer, 'overlapped'):
    a.display()
    b.display()
    print(80*'-')

keys: (10377, 42469)
text_id: 10377
span_nrs {0}
text_id: 42469
span_nrs {0}


In [21]:
for a, b in sample_spans(2, collection, diff_layer, 'prolonged'):
    a.display()
    b.display()
    print(80*'-')

keys: (44575, 18607)
text_id: 18607
span_nrs {0}
text_id: 44575
span_nrs {0}


In [22]:
for a, b in sample_spans(2, collection, diff_layer, 'shortened'):
    a.display()
    b.display()
    print(80*'-')

keys: (49521, 40046)
text_id: 40046
span_nrs {0}


text,start,end,span_status,input_layer_name,name,key,value,unit,DATE,year
"WBC 4.94 (3,5 .. 8,8 E9/L",25,50,missing,measurements_old,LEUKOTSÜÜDID,WBC,4.94,E9/L,,


text,start,end,span_status,input_layer_name,name,key,value,unit,DATE,year
WBC 4.94,25,33,extra,measurements_flat,MEASUREMENT,WBC,4.94,,,


--------------------------------------------------------------------------------
text_id: 49521
span_nrs {1}


Display `measurements_diff_1_2` layer.

In [23]:
from estnltk import Layer
Layer.print_start_end = True

texts = collection.select(layers=['measurement_tokens',
                                  'measurements_old',
                                  'measurements_flat',
                                  'measurements_diff_old_flat'],
                          keys=[7])
text_id, text = next(texts)
text.measurements_diff_old_flat

layer name,attributes,parent,enveloping,ambiguous,span count
measurements_diff_old_flat,"span_status, input_layer_name, name, key, value, unit, DATE, year",,,True,0

text,start,end,span_status,input_layer_name,name,key,value,unit,DATE,year


In [5]:
storage.close()