### Sampling

In [22]:
from estnltk import Text
from estnltk.storage.postgres import PostgresStorage


storage = PostgresStorage(host='postgres.keeleressursid.ee',
                          port=5432,
                          dbname='estonian-text-corpora',
                          user='',
                          password='',
                          schema='estonian_text_corpora',
                          role='estonian_text_corpora_read',
                          temporary=False)

INFO:storage.py:58: connecting to host: 'postgres.keeleressursid.ee', port: 5432, dbname: 'estonian-text-corpora', user: 'rasmusm'
INFO:storage.py:108: schema: 'estonian_text_corpora', temporary: False, role: 'estonian_text_corpora_read'


In [23]:
collection = storage['koondkorpus_sentences']
collection.selected_layers = ['v171_named_entities','v172_stanza_syntax','v172_obl_phrases']

In [None]:
from estnltk.storage.postgres import LayerQuery
from tqdm import tqdm
texts = []
q = LayerQuery('v171_named_entities',nertag='LOC')
i = 0
matching_texts = []
equal_ner_obl = []
ner_without_obl = []
ner_inside_obl = []
obl_inside_ner = []
ner_obl_loikuvad = []
partly_equal_ner_obl = []

for key, txt in tqdm(collection.select(query=q,layers=['v172_obl_phrases','v171_named_entities'])):
    i = i+1
    for ner in txt['v171_named_entities']:
        if ner.nertag != "LOC":
            continue
        n_match = False
        for ent in txt['v172_obl_phrases']:
            if ent.text == ner.text:
                equal_ner_obl.append((key, ner.start, ner.end, ent.start, ent.end))
                n_match = True
            if ent.start <= ner.end and ent.end >= ner.start:
                partly_equal_ner_obl.append((key, ner.start, ner.end, ent.start, ent.end))
                n_match = True
            if ner.start >= ent.start and ner.end <= ent.end:
                if ner.text == ent.text:
                    continue
                ner_inside_obl.append((key, ner.start, ner.end, ent.start, ent.end))
            if ent.start >= ner.start and ent.end <= ner.end:
                if ner.text == ent.text:
                    continue
                obl_inside_ner.append((key, ner.start, ner.end, ent.start, ent.end))
            if ner.start > ent.start and ner.end > ent.end and ner.start <= ent.end:
                ner_obl_loikuvad.append((key, ner.start, ner.end, ent.start, ent.end))
            if ner.start < ent.start and ner.end < ent.end and ner.end >= ent.start:
                ner_obl_loikuvad.append((key, ner.start, ner.end, ent.start, ent.end))
        if n_match == False:
            ner_without_obl.append((key, ner.start, ner.end, ent.start, ent.end))
    if i%3000==0:
        
        with open('matching_texts.txt','a',encoding='UTF-8') as f:
            f.write(str(matching_texts))

        with open('equal_ner_obl.txt','a',encoding='UTF-8') as f:
            f.write(str(equal_ner_obl))

        with open('partly_equal_ner_obl.txt','a',encoding='UTF-8') as f:
            f.write(str(partly_equal_ner_obl))

        with open('ner_without_obl.txt','a',encoding='UTF-8') as f:
            f.write(str(ner_without_obl))

        with open('ner_inside_obl.txt','a',encoding='UTF-8') as f:
            f.write(str(ner_inside_obl))

        with open('obl_inside_ner.txt','a',encoding='UTF-8') as f:
            f.write(str(obl_inside_ner))

        with open('ner_obl_loikuvad.txt','a',encoding='UTF-8') as f:
            f.write(str(ner_obl_loikuvad))
            
        matching_texts = []
        equal_ner_obl = []
        ner_without_obl = []
        ner_inside_obl = []
        obl_inside_ner = []
        ner_obl_loikuvad = []
        partly_equal_ner_obl = []

### Analysis

In [1]:
with open('ner_obl_loikuvad.txt','r',encoding='UTF-8') as f:
    ner_obl_loikuvad = f.readline()

In [2]:
splitted = ner_obl_loikuvad.split(']')

In [3]:
altogether = []
for part in splitted:
    aslist = part[1:].split('), (')
    altogether.extend(aslist)

len(altogether)

9642

### Labelstudio

In [8]:
for i in range(len(altogether)):
    altogether[i] = altogether[i].replace('(','')
    altogether[i] = altogether[i].replace(')','')

In [49]:
altogether = [row.split(',') for row in altogether]

In [70]:
text_indices = [int(row[0]) for row in altogether if len(row)==5]

In [71]:
from estnltk.storage.postgres import IndexQuery

query = IndexQuery(text_indices)
a = collection.select(query=query,layers=['v171_named_entities','v172_stanza_syntax','v172_obl_phrases'])

In [94]:
collection[8460].v171_named_entities.spans

[EnvelopingSpan(['Peeter'], [{'nertag': 'PER'}]),
 EnvelopingSpan(['Priit'], [{'nertag': 'PER'}]),
 EnvelopingSpan(['Baltikumis', 'Disneyt'], [{'nertag': 'LOC'}])]

In [96]:
from tqdm import tqdm
from copy import copy

ls_texts = []

for i, (key, text) in tqdm(enumerate(a)):
    for row in altogether:
        if len(row) == 5 and int(row[0]) == key:
            this_row = row
    for span in copy(text.v171_named_entities.spans):
        if int(this_row[1])!=span.start or int(this_row[2])!=span.end:
            text.v171_named_entities.remove_span(span)
    for span in copy(text.v172_obl_phrases.spans):
        if int(this_row[3])!=span.start or int(this_row[4])!=span.end:
            text.v172_obl_phrases.remove_span(span)
    ls_texts.append(text)

8828it [03:01, 48.71it/s]


In [97]:
from estnltk.converters.label_studio.label_studio import LabelStudioExporter

exporter = LabelStudioExporter("ner_obl_loikuvad.json",layers=['v171_named_entities','v172_obl_phrases'],checkbox=True)

print(exporter.labeling_interface)

exporter.convert(ls_texts,append=False)


        <View>
            <Labels name="label" toName="text">
	<Label value="v171_named_entities" background="#987D6C"/> 
	<Label value="v172_obl_phrases" background="#1114B3"/> 

            </Labels>
        <Text name="text" value="$text"/>
            <Header value="Are the annotations correct?"/>
                <Choices name="review" toName="text">
                    <Choice value="yes"/>
                    <Choice value="no"/>
                </Choices>
            </View>
