In [118]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Loading the corpus

In [120]:
from estnltk import Text
from estnltk.storage.postgres import PostgresStorage, create_schema


# Open a connection to the remote corpus database; `role` selects the
# 'estonian_text_corpora_read' role and `temporary=False` is passed through
# to PostgresStorage unchanged.
# NOTE(review): the user name is hard-coded and the password is an empty
# string -- presumably the real credentials are supplied some other way
# (e.g. a pgpass file); confirm before sharing this notebook.
storage = PostgresStorage(host='postgres.keeleressursid.ee',
                          port=5432,
                          dbname='estonian-text-corpora',
                          user='rasmusm',
                          password='',
                          schema='estonian_text_corpora',
                          role='estonian_text_corpora_read',
                          temporary=False)

INFO:storage.py:58: connecting to host: 'postgres.keeleressursid.ee', port: 5432, dbname: 'estonian-text-corpora', user: 'rasmusm'
INFO:storage.py:108: schema: 'estonian_text_corpora', temporary: False, role: 'estonian_text_corpora_read'


In [4]:
storage

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rows,total_size,comment
collection,version,relations,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
koondkorpus_base_subset_of_5000_v2,2.0,,0,12 MB,Collection of 5000 randomly picked Koondkorpus texts (v2)
koondkorpus_base_subset_of_5000_v2,2.0,original_sentences_flat__la,0,5544 kB,created by soras on Fri Jun 12 11:28:06 2020
koondkorpus_base_subset_of_5000_v2,2.0,original_words__layer,0,19 MB,created by soras on Fri Jun 12 09:15:46 2020
koondkorpus_base_subset_of_5000_v2,2.0,original_words_morph_analys,0,96 MB,"Morphological analysis from v1.6.2/3, probably based on commit 349a7c2 (2018-11-22)"
koondkorpus_base_subset_of_5000_v2,2.0,structure,2,32 kB,
koondkorpus_base_subset_of_5000_v2,2.0,v166_compound_tokens__layer,0,5472 kB,created by soras on Thu Jun 4 12:29:42 2020
koondkorpus_base_subset_of_5000_v2,2.0,v166_morph_analysis__layer,0,97 MB,created by soras on Tue Jun 9 14:13:07 2020
koondkorpus_base_subset_of_5000_v2,2.0,v166_sentences__layer,0,21 MB,created by soras on Tue Jun 9 06:01:41 2020
koondkorpus_base_subset_of_5000_v2,2.0,v166_tokens__layer,0,20 MB,created by soras on Thu Jun 4 07:40:39 2020
koondkorpus_base_subset_of_5000_v2,2.0,v166_words__layer,0,20 MB,created by soras on Fri Jun 5 05:49:26 2020


In [121]:
collection = storage['koondkorpus_sentences']

In [122]:
collection.selected_layers = ['v171_named_entities','v172_geo_terms']

In [None]:
from estnltk.vabamorf.morf import synthesize

# Pairs of (case code handed to the synthesizer, Estonian name of the case).
cases = [
    ('n', 'nimetav'),
    ('g', 'omastav'),
    ('p', 'osastav'),
    ('ill', 'sisseütlev'),
    ('in', 'seesütlev'),
    ('el', 'seestütlev'),
    ('all', 'alaleütlev'),
    ('ad', 'alalütlev'),
    ('abl', 'alaltütlev'),
    ('tr', 'saav'),
    ('ter', 'rajav'),
    ('es', 'olev'),
    ('ab', 'ilmaütlev'),
    ('kom', 'kaasaütlev')]

# For each case, synthesize first the singular and then the plural form(s)
# of 'kanal'; whatever the synthesizer returns is joined into a single
# comma-separated string per (number, case) combination.
all_forms = []
for case, name in cases:
    for number_prefix in ('sg ', 'pl '):
        synthesized = synthesize('kanal', number_prefix + case, 'S')
        all_forms.append(', '.join(synthesized))

In [123]:
# Read the geographic terms, one per line, stripping surrounding whitespace.
# Iterating over the file object stops at end-of-file on its own, which
# replaces the former `while term is not ''` loop: that compared object
# identity against a string literal, which only works by accident of string
# interning and raises a SyntaxWarning on Python 3.8+.
# Blank lines are still kept as empty strings, exactly as before.
with open('geo_terms.txt', 'r', encoding='UTF-8') as f:
    terms = [line.strip() for line in f]

### Local copy of Span Sampler

In [None]:
    def attribute_locations_creation(self):
        """Create and fill the ``attribute_locations`` table if it is missing.

        Looks up ``attribute_locations`` in ``information_schema.tables`` for
        schema ``public``. When no such table is found, the table is created
        and, for every entry of the module-level ``terms`` list, one row is
        inserted per text returned by the module-level ``collection`` for a
        ``LayerQuery`` on layer ``v172_geo_terms`` with ``lemma=term``. Each
        row holds the text key, the term, the positions of the matching spans
        in the layer and the number of those positions.

        When the table already exists nothing is inserted.

        NOTE(review): the existence check is restricted to schema 'public'
        while CREATE TABLE / INSERT use the unqualified table name, so the two
        only agree if the connection's search_path resolves to 'public' --
        confirm.
        NOTE(review): this method reads the globals ``terms`` and
        ``collection`` rather than attributes of ``self`` -- confirm that is
        intended.
        """
        # End whatever transaction is currently open before querying the catalog.
        self.conn.commit()
        self.cur.execute("""SELECT EXISTS (
           SELECT FROM information_schema.tables 
           WHERE  table_schema = 'public'
           AND    table_name   = 'attribute_locations'
           );""")
        res = self.cur.fetchall()
        # res[0][0] is the boolean result of SELECT EXISTS.
        if not res[0][0]:
            self.cur.execute("CREATE TABLE attribute_locations (layer_id integer, attribute_value varchar, indices integer[], count integer);")
            self.conn.commit()
            for term in terms:
                q = LayerQuery('v172_geo_terms', lemma=term)
                for key, txt in tqdm(collection.select(query=q,layers=['v172_geo_terms'])):
                    # Positions in the layer whose first 'lemma' value equals the term.
                    indices = [i for i, nertag in enumerate(txt['v172_geo_terms']['lemma']) if nertag[0] ==term]
                    self.cur.execute("INSERT INTO attribute_locations (layer_id, attribute_value,indices,count) VALUES (%s, %s, %s, %s)",(key, term, indices,len(indices)))

        # Persist the inserted rows (a no-op when the table already existed).
        self.conn.commit()

In [124]:
from random import sample, choices
from estnltk.storage.postgres import LayerQuery, IndexQuery
from tqdm import tqdm

class SpanSampler:
    """Draw random spans of a layer whose attribute value is in a given set.

    The sampler expects an ``attribute_locations`` table (columns
    ``layer_id``, ``attribute_value``, ``indices``, ``count``) to exist in the
    database reachable through ``storage.conn``. For every call it builds a
    scratch table ``sampling_matrix`` with one row per candidate span, draws
    row ids from it, drops the scratch table again and finally loads the
    affected texts from ``collection``.
    """

    def __init__(self, storage, collection, layer, attribute):
        """Remember the storage, collection, layer and attribute to sample from.

        :param storage: object exposing a DB-API connection as ``storage.conn``.
        :param collection: collection the sampled texts are selected from.
        :param layer: name of the layer whose spans are sampled.
        :param attribute: name of the attribute; stored but not read by the
            methods of this class.
        """
        self.storage = storage
        self.conn = storage.conn
        self.cur = self.conn.cursor()
        self.collection = collection
        self.layer = layer
        self.attribute = attribute

    def __call__(self, count, attribute, return_index=False, with_replacement=True):
        """Sample ``count`` spans whose attribute value is one of ``attribute``.

        :param count: number of spans to draw.
        :param attribute: attribute value (a string) or an iterable of values.
        :param return_index: when true, each result is
            ``(sampling_matrix_row_id, Text, Span)`` instead of ``(Text, Span)``.
        :param with_replacement: when true, the same span may be drawn more
            than once; each repetition yields its own result entry.
        :return: list of result tuples, one per drawn span, in draw order.
        :raises ValueError: when sampling without replacement and ``count``
            exceeds the number of candidate spans.
        """
        # Function-scope import keeps this block self-contained.
        from copy import deepcopy

        self.conn.commit()
        try:
            self.create_sampling_matrix(attribute)
            sampled_rows = self.find_sampled_indices(count, with_replacement)
        finally:
            # Always remove the scratch table, even when sampling failed;
            # otherwise the next call dies on "table already exists".
            self.clear_sampling_matrix()

        if not sampled_rows:
            return []

        # Fetch every affected text exactly once, from the collection this
        # sampler was constructed with (not from a notebook global).
        text_keys = sorted({row[1] for row in sampled_rows})
        selected = self.collection.select(query=IndexQuery(text_keys),
                                          layers=[self.layer],
                                          return_index=True)
        texts_by_key = {item[0]: item[1] for item in selected}

        result_list = []
        keys_already_used = set()
        for row_id, text_key, span_index in sampled_rows:
            text = texts_by_key.get(text_key)
            if text is None:
                # The collection did not return this text; as before, such
                # samples are left out of the result.
                continue
            if text_key in keys_already_used:
                # Several samples hit the same text: hand out an independent
                # copy so that callers mutating one sample's text (e.g.
                # removing spans) do not affect the other samples.
                text = deepcopy(text)
            else:
                keys_already_used.add(text_key)
            span = text[self.layer][span_index]
            if return_index:
                result_list.append((row_id, text, span))
            else:
                result_list.append((text, span))
        return result_list

    def create_sampling_matrix(self, attribute_val):
        """(Re)build the scratch table listing every candidate span.

        One row ``(id, layer, layer_index)`` is inserted for each index stored
        in ``attribute_locations`` under one of the requested values.

        :param attribute_val: a single string or an iterable of strings.
        """
        if isinstance(attribute_val, str):
            wanted_values = [attribute_val]
        else:
            wanted_values = list(attribute_val)
        # A table left over from an interrupted run must not block this one.
        self.cur.execute("DROP TABLE IF EXISTS sampling_matrix;")
        self.cur.execute("CREATE TABLE sampling_matrix (id serial, layer integer, layer_index integer);")
        # The values are passed as a bound parameter (adapted to an SQL array)
        # rather than pasted into the statement: str(tuple) produced invalid
        # SQL for one-element tuples and was open to injection.
        self.cur.execute(
            "INSERT INTO sampling_matrix (layer, layer_index) "
            "(SELECT layer_id AS layer, unnest(indices) AS layer_index "
            "FROM attribute_locations WHERE attribute_value = ANY(%s));",
            (wanted_values,))
        self.conn.commit()

    def find_sampled_indices(self, count, with_replacement):
        """Draw ``count`` rows from ``sampling_matrix``.

        :param count: number of rows to draw.
        :param with_replacement: draw with (``random.choices``) or without
            (``random.sample``) replacement.
        :return: list of ``(id, layer, layer_index)`` rows in draw order;
            a row drawn several times appears several times. Empty when the
            table is empty or ``count`` is 0.
        :raises ValueError: from ``random.sample`` when ``count`` exceeds the
            number of rows and ``with_replacement`` is false.
        """
        self.cur.execute("SELECT COUNT(*) FROM sampling_matrix;")
        span_count = self.cur.fetchall()[0][0]
        self.conn.commit()
        if span_count == 0 or count == 0:
            return []
        # The serial column numbers rows from 1, so the candidates are
        # 1..span_count (range(span_count) missed the last row and drew the
        # non-existent id 0).
        candidate_ids = range(1, span_count + 1)
        if with_replacement:
            sampled = choices(candidate_ids, k=count)
        else:
            sampled = sample(candidate_ids, count)
        self.cur.execute(
            "SELECT id, layer, layer_index FROM sampling_matrix WHERE id = ANY(%s);",
            (sorted(set(sampled)),))
        rows_by_id = {row[0]: row for row in self.cur.fetchall()}
        # Expand back to one row per draw so repeated draws are not lost;
        # ids missing from the table (sequence gaps) are skipped.
        return [rows_by_id[row_id] for row_id in sampled if row_id in rows_by_id]

    def clear_sampling_matrix(self):
        """Drop the scratch table; does nothing if it is already gone."""
        self.conn.commit()
        self.cur.execute("DROP TABLE IF EXISTS sampling_matrix;")
        self.conn.commit()
        

In [125]:
# Second storage, on localhost, that holds the temporary tables used for sampling.
# NOTE(review): a literal password is passed alongside `pgpass_file`; the
# credential should not live in the notebook -- presumably the pgpass file
# alone would be enough, confirm against the PostgresStorage documentation.
localstorage = PostgresStorage(host='localhost',
                          port=5432,
                          dbname='ner_test',
                          user='postgres',
                          password='dbpass',
                          pgpass_file='~/.pgpass',
                          schema='my_schema',
                          role=None,
                          temporary=False)

INFO:storage.py:58: connecting to host: 'localhost', port: 5432, dbname: 'ner_test', user: 'postgres'
INFO:storage.py:108: schema: 'my_schema', temporary: False, role: 'postgres'


In [7]:
# Stand-alone cursor on the local database.
# NOTE(review): no visible cell reads `mycur` -- looks like a leftover; confirm.
mycur = localstorage.conn.cursor()

In [126]:
sampler = SpanSampler(storage=localstorage,collection=collection, layer='v172_geo_terms',attribute='lemma')

In [28]:
#localstorage.conn.commit()

In [59]:
#sampler.clear_sampling_matrix()

UndefinedTable: table "sampling_matrix" does not exist


In [154]:
# Draw 1000 spans whose attribute value is one of `filtered_terms`.
# NOTE(review): `filtered_terms` is not defined in any cell visible here, so
# this cell fails on a fresh kernel -- presumably it is a filtered subset of
# `terms`; its definition needs to be restored above this cell.
samples = sampler(count=1000,attribute=tuple(filtered_terms))

In [155]:
samples

[(Text(text='Võib kujutleda ärimeeste rõõmu , kui nad põhjakõrbenud hotelliprojektist kasuliku hinnaga lahti said .'),
  Span('lahti', [{'lemma': 'laht'}])),
 (Text(text='“ Riinal ja Juhanil ( Riina on Kaljo Kiisa tütar ja Juhan Viiding väimees - toim ) jäi suu lahti .'),
  Span('lahti', [{'lemma': 'laht'}])),
 (Text(text='Saajate ja maksjate “ eurosuhe ” 1,5 : 1 pole sugugi mägede taga .'),
  Span('mägede', [{'lemma': 'mägi'}])),
 (Text(text='Munad munetakse seisva vee pinnale kraavides , tiikides , lompides , mudaaukudes .'),
  Span('tiikides', [{'lemma': 'tiik'}])),
 (Text(text='Aga kui kõik minutaolised lahti lasta , kust siis uued võtta ? ”'),
  Span('lahti', [{'lemma': 'laht'}])),
 (Text(text='Mees nägi kaldast umbes 1,5 km kaugusel palju päid ja tumedat pampu , mida ta pidas ümberläinud paadiks ( tegelikult oli see rahuvalvajate nööriga kokkuseotud varustus ) .'),
  Span('kaldast', [{'lemma': 'kallas'}])),
 (Text(text='Oma olemuselt on meri , kus vesi vahetub kitsaste Taani väin

### Saving samples

In [156]:

import pickle

# Serialize the drawn samples to disk.
with open("1000_ulejaanud.pickle", 'wb') as pickle_file:
    pickle.dump(samples, pickle_file)

In [157]:
from copy import copy

# In every sampled text keep only the sampled span in the geo-terms layer.
# The span list is copied first because spans are removed while looping.
for sampled_text, kept_span in samples:
    geo_layer = sampled_text.v172_geo_terms
    for candidate in copy(geo_layer.spans):
        if candidate != kept_span:
            geo_layer.remove_span(candidate)

### Exporting sentences to Label Studio format

In [130]:
from estnltk.converters.label_studio.label_studio import LabelStudioExporter

In [158]:
exporter = LabelStudioExporter("koond_1000_ulejaanud.json",'v172_geo_terms',checkbox=True)

In [159]:
print(exporter.labeling_interface)


        <View>
            <Labels name="label" toName="text">
	<Label value="v172_geo_terms" background="#F888F5"/> 

            </Labels>
        <Text name="text" value="$text"/>
            <Header value="Are the annotations correct?"/>
                <Choices name="review" toName="text">
                    <Choice value="yes"/>
                    <Choice value="no"/>
                </Choices>
            </View>


In [160]:
# First element of every sample tuple, in sample order.
only_texts = [entry[0] for entry in samples]

In [161]:
exporter.convert(only_texts,append=False)

### Working with files exported from Label Studio

In [162]:
import json

# Parse the exported annotation file for project 13.
with open('project-13-at-2023-01-17-03-17-09a04ed6.json', 'r', encoding='UTF-8') as export_file:
    data = json.load(export_file)

In [163]:
# Texts of the tasks whose first annotation ends with the choice 'yes'
# (the last result entry -- presumably the review checkbox).
recall_set = [
    task['data']['text']
    for task in data
    if task['annotations'][0]['result'][-1]['value']['choices'][0] == 'yes'
]

In [164]:
recall_set

['Kutsume kõiki huvilisi 17. mail kell 11 Hurmi järve äärde külavisiooni talgutele !',
 '14. dets. 1936. a. Peipsi järvelt 7 kalurit...milline relvastus meie piirivalvuritel , kas on kordonites kuulipildujaid , kuidas on ülemuste nimed ja aukraadid .',
 'Ta teenis Aegna saarel ning mängis sõjaväe orkestris .',
 '“ Ja just siia Lagedile , Pirita jõe ja Leivajõe vahelisele saarele . ”',
 'Opa , Ilsnä , Ipiku , Mäemõisa , Karkla , Kirbla , Koiva , Koivaliina , Kuivaste , Kuramaa , Kööna , Alamõisa , Vana-Liivimaa , Lemsalu , Loodi , Lutsi , Luke , Väike-Salatsi , Nausküla , Pedeli jõgi , Pedetsi jõgi , Reikküla järv , Räisaku , Ruhja jõgi , Salatsi , Salatsi jõgi , Uue-Salatsi , Säde jõgi , Sältnä , Talli , Tiikre , Tõrsa jõgi , Vaidva jõgi , Läti Valga , Vana-Laitsna e Laitsna , Vastse-Laitsna , Vana-Salatsi , Võnnu , Läti Liivimaa',
 'vastavalt määruse ( EMÜ ) nr 1601/92 artiklile 2 reguleeritakse kõnealuse korra alusel vajadust eespoolmainitud määruse lisas loetletud toodete järele , m

In [165]:
len(recall_set)

353

In [166]:
import json

# Parse the exported annotation file for project 14.
with open('project-14-at-2023-01-18-00-44-d22f56f0.json', 'r', encoding='UTF-8') as export_file:
    data = json.load(export_file)

In [167]:
# Same filter as for the first export: keep the text of every task whose
# first annotation's last result carries the choice 'yes'.
recall_set_2 = [
    task['data']['text']
    for task in data
    if task['annotations'][0]['result'][-1]['value']['choices'][0] == 'yes'
]

In [168]:
len(recall_set_2)

13

In [169]:
import json

# Parse the exported annotation file for project 15.
with open('project-15-at-2023-01-18-03-25-2438b42b.json', 'r', encoding='UTF-8') as export_file:
    data = json.load(export_file)

In [170]:
# Texts of the tasks in the third export whose first annotation's last
# result carries the choice 'yes'.
recall_set_3 = [
    task['data']['text']
    for task in data
    if task['annotations'][0]['result'][-1]['value']['choices'][0] == 'yes'
]

In [171]:
len(recall_set_3)

177

In [111]:
# For every task answered 'yes', collect the 'idx' value stored in the first
# result of the first annotation's prediction (instead of the task text).
recall_set = [
    task['annotations'][0]['prediction']['result'][0]['value']['idx']
    for task in data
    if task['annotations'][0]['result'][-1]['value']['choices'][0] == 'yes'
]

In [114]:
# Texts whose position in `only_texts` appears in `recall_set`.
true_sents = [
    text
    for position, text in enumerate(only_texts)
    if position in recall_set
]

In [115]:
true_sents

[Text(text='Reisipakette on lisatud Hispaania mandriossa , Prantsusmaale ja Inglismaale , kuid ligi 5% võrra kahandatud Kanaari saartele .'),
 Text(text='Kaks korda nädalas lendav kahemootoriline lennuk AN-28 on praegu ainus Ruhnu saarega regulaarset ühendust pidav transpordivahend .'),
 Text(text='Nelja lõunapoolse Kuriili saare kuuluvuse küsimuse lahendamiseks jääb seega aega vaid kaks aastat .'),
 Text(text='Vilsandi saar ei ole mitte ainult lindude , vaid ka kirjanike arvult ühe elaniku kohta kõige rikkam paik Eestimaal .'),
 Text(text='Fääri saartel käinutel on raske testida sündmusi kronoloogilises järjekorras , sest Eesti delegatsiooni kuulunud isikud tegelesid selle nädala jooksul erinevate asjadega : poliitikud ja ajakirjanikud kohtusid poliitikute ja ajakirjanikega , muusikud ja kunstnikud kohtusid nii ametivendade kui laiema publikuga , nukuteatri trupp (  Helle Laas  ,  Riho Tammert  ,  Are Uder  ,  Leenamari Pirn  ) kohtus väikeste fäärlastega , andes neile kokku 8 menukat

In [120]:
from estnltk.taggers import NerTagger
# Add the default layers to every confirmed sentence and then run the
# named-entity tagger on it.
ner = NerTagger()
for sentence_text in true_sents:
    sentence_text.tag_layer()
    ner.tag(sentence_text)

In [123]:
exporter = LabelStudioExporter("koond_100_true.json",'ner',checkbox=True)

In [124]:
exporter.convert(true_sents,append=False)

In [126]:
print(exporter.labeling_interface)


        <View>
            <Labels name="label" toName="text">
	<Label value="ner" background="#5A37B7"/> 

            </Labels>
        <Text name="text" value="$text"/>
            <Header value="Are the annotations correct?"/>
                <Choices name="review" toName="text">
                    <Choice value="yes"/>
                    <Choice value="no"/>
                </Choices>
            </View>


In [127]:
import json

# Parse the exported annotation file for project 12.
with open('project-12-at-2023-01-11-04-09-b44513d6.json', 'r', encoding='UTF-8') as export_file:
    data = json.load(export_file)

In [128]:

# Number of tasks whose first annotation's last result carries the choice 'yes'.
correct = sum(
    1
    for task in data
    if task['annotations'][0]['result'][-1]['value']['choices'][0] == 'yes'
)

In [130]:
correct/len(data)

0.7105263157894737