### Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from estnltk import Text
from estnltk_neural.taggers import StanzaSyntaxTagger

from estnltk_patches import EntityTagger
from estnltk_patches import SyntaxTree

In [3]:
import pickle 
import random
import pandas as pd
from tqdm import tqdm
from os import listdir
from collections import Counter, defaultdict
from estnltk.wordnet import Wordnet
from estnltk.taggers import NerTagger
from estnltk.corpus_processing.parse_koondkorpus import parse_tei_corpus

### NER and OBL comparison

In [4]:
with open("original_sentences_with_analysis.pickle", "rb") as f:
    sentences = pickle.load(f)

In [5]:
sentences[0]

text
08.08.2001 Maarius Suviste

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,7
compound_tokens,"type, normalized",,tokens,False,1
words,normalized_form,,,True,3
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,3
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,3
entities,"entity_type, free_entity, is_valid, root",,morph_analysis,False,1
ner,nertag,,words,False,0
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc",morph_analysis,,False,3


In [6]:
sentences[20]

text
Ta sündis Saaremaal Laimjala vallas talunik Fjodor Rüütli perekonnas .

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,10
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,10
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,10
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,10
entities,"entity_type, free_entity, is_valid, root",,morph_analysis,False,2
ner,nertag,,words,False,3
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc",morph_analysis,,False,10


In [7]:
sentences[20].entities

layer name,attributes,parent,enveloping,ambiguous,span count
entities,"entity_type, free_entity, is_valid, root",,morph_analysis,False,2

text,entity_type,free_entity,is_valid,root
['vallas'],,,,"Span('vallas', [{'id': 5, 'lemma': 'vald', 'upostag': 'S', 'xpostag': 'S', 'feat ..., type: <class 'estnltk_core.layer.span.Span'>"
['perekonnas'],,,,"Span('perekonnas', [{'id': 9, 'lemma': 'perekond', 'upostag': 'S', 'xpostag': 'S ..., type: <class 'estnltk_core.layer.span.Span'>"


In [8]:
sentences[20].ner

layer name,attributes,parent,enveloping,ambiguous,span count
ner,nertag,,words,False,3

text,nertag
['Saaremaal'],LOC
"['Laimjala', 'vallas']",LOC
"['Fjodor', 'Rüütli']",PER


In [9]:
sentences[20].entities[0][0].start <= sentences[20].ner[1][-1].end and sentences[20].entities[0][0].end >= sentences[20].ner[1][0].start

True

In [10]:
sentences[20].ner[0][0].start

10

In [11]:
sentences[20].ner[1][0].start

20

In [12]:
sentences[20].ner[1][-1].end

35

In [13]:
sentences[20].ner[1].text

['Laimjala', 'vallas']

In [14]:
equal_ner_obl = []
partly_equal_ner_obl = []
ner_without_obl = []

for i, sentence in enumerate(sentences):
    try:
        ents = sentence.entities
        nerlayer = sentence.ner
    
        for n in nerlayer:
            if n.nertag != "LOC":
                continue
            n_match = False
            for ent in ents:
                if ent.text == n.text:
                    equal_ner_obl.append((i, n, ent))
                    n_match = True
                if ent[0].start <= n[-1].end and ent[-1].end >= n[0].start:
                    partly_equal_ner_obl.append((i, n, ent))
                    n_match = True
            if n_match == False:
                ner_without_obl.append((i, n))
    except:
        print(i)
        continue

1118
2534


In [15]:
len(equal_ner_obl)

271

In [16]:
equal_ner_obl[0]

(146,
 EnvelopingSpan(['Eesti', 'riigil'], [{'nertag': 'LOC'}]),
 EnvelopingSpan(['Eesti', 'riigil'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}]))

In [17]:
len(partly_equal_ner_obl)

910

In [18]:
partly_equal_ner_obl[0]

(20,
 EnvelopingSpan(['Laimjala', 'vallas'], [{'nertag': 'LOC'}]),
 EnvelopingSpan(['vallas'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}]))

In [19]:
partly_equal_ner_obl[:10]

[(20,
  EnvelopingSpan(['Laimjala', 'vallas'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['vallas'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}])),
 (25,
  EnvelopingSpan(['Nõukogude'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['Nõukogude', 'Liidu'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}])),
 (28,
  EnvelopingSpan(['Eesti'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['Eesti', 'suveräänsuse', 'kaitsjana'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}])),
 (97,
  EnvelopingSpan(['Eesti'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['Eesti', 'raskest', 'seisust'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}])),
 (146,
  EnvelopingSpan(['Eesti', 'riigil'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['Eesti', 'riigil'], [{'entity_type': None

In [20]:
len(ner_without_obl)

2101

In [21]:
ner_without_obl[0]

(1, EnvelopingSpan(['Saaremaal'], [{'nertag': 'LOC'}]))

In [22]:
ner_inside_obl = []

for i, sentence in enumerate(sentences):
    try:
        ents = sentence.entities
        nerlayer = sentence.ner
        
        if len(ents) == 0:
            continue
    
        for n in nerlayer:
            if n.nertag != "LOC":
                continue
            for ent in ents:
                if n[0].start >= ent[0].start and n[-1].end <= ent[-1].end:
                    ner_inside_obl.append((i, n, ent))
    except Exception as e:
        print(i, e)
        continue

1118 'Text' object has no layer 'entities'
2534 'Text' object has no layer 'entities'


In [23]:
len(ner_inside_obl)

797

In [24]:
ner_inside_obl[:5]

[(25,
  EnvelopingSpan(['Nõukogude'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['Nõukogude', 'Liidu'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}])),
 (28,
  EnvelopingSpan(['Eesti'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['Eesti', 'suveräänsuse', 'kaitsjana'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}])),
 (97,
  EnvelopingSpan(['Eesti'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['Eesti', 'raskest', 'seisust'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}])),
 (146,
  EnvelopingSpan(['Eesti', 'riigil'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['Eesti', 'riigil'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}])),
 (154,
  EnvelopingSpan(['Eesti'], [{'nertag': 'LOC'}]),
  EnvelopingSpan(['Eesti', 'riigist'], [{'entity_type': None, 

In [25]:
sentences[97]

text
"Arvan , et mul on olemas see kogemus aidata tuua Eesti raskest seisust välja ."

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,15
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,15
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,15
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,15
entities,"entity_type, free_entity, is_valid, root",,morph_analysis,False,2
ner,nertag,,words,False,1
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc",morph_analysis,,False,15


### Wordnet

In [26]:
kohakaanded = ['sg ill', 'sg in', 'sg el', 'sg all', 'sg ad', 'sg abl', 
               'pl ill', 'pl in', 'pl el', 'pl all', 'pl ad', 'pl abl']

In [27]:
all_loc_entities = []
all_loc_entities_text = []

for i, sent in enumerate(sentences):
    if i in [1118, 2534]:
        continue
    for span in sent['entities']:
        loc = False
        root_form = span.root.form[0]
        if root_form in kohakaanded:
            loc = True
        
        if loc:
            all_loc_entities.append(span)
            all_loc_entities_text.append(span.text)

In [28]:
len(all_loc_entities)

6650

In [29]:
all_root = []
all_root_lemma = []

for ent in all_loc_entities:
    all_root.append(ent.root.text)
    all_root_lemma.append(ent.root.lemma)

In [30]:
wn = Wordnet()

In [31]:
all_loc_entities_hypernyms = defaultdict(set)
all_loc_entities_hypernyms_literal = defaultdict(set)
loc_ent_count = defaultdict(int)
without_hyp = []


for i, lem in enumerate(all_root_lemma):
    synsets = wn[lem]
    loc_ent_count[all_root[i]] += 1
    for syns in synsets:
        hypernym = syns.hypernyms
        # all_loc_entities_hypernyms.append((all_root[i], lem, hypernym[0]))
        if len(hypernym) > 0:
            all_loc_entities_hypernyms[all_root[i]].add(hypernym[0])
            all_loc_entities_hypernyms_literal[all_root[i]].add(hypernym[0].literal)
        else:
            without_hyp.append((all_root[i], lem, syns))

In [32]:
len(all_loc_entities_hypernyms)

2727

In [33]:
all_root[0]

'aastatel'

In [34]:
all_loc_entities_hypernyms["aastatel"]

{"Synset('aasta.n.02')", "Synset('ajavahemik.n.01')"}

In [35]:
all_loc_entities_hypernyms_literal["aastatel"]

{'aasta', 'ajavahemik'}

In [36]:
literal_counts = defaultdict(int)

for ent, literals in all_loc_entities_hypernyms_literal.items():
    for lit in literals:
        literal_counts[lit] += loc_ent_count[ent]

In [37]:
literal_counts["aasta"]

158

In [38]:
sorted(literal_counts.items(), key=lambda x: x[1], reverse=True)[:5]

[('ajavahemik', 470),
 ('inimene', 229),
 ('abstraktsioon', 199),
 ('osa', 176),
 ('kuu', 163)]

In [39]:
loc_time_hypernyms = ["kuu", "aasta", "aastaaeg", "ajavahemik", "piirkond", "koht", "äritegevuskoht", "maa", "ajaühik",
                     "nädalapäev", "asula", "tegevusala", "aeg", "ala", "maa-asula", "eluruum", "rahvusriik", "hoone",
                     "ruum", "maapind", "päev", "maa-ala", "mander", "tuba", "asukoht", "linn"]

In [40]:
loc_time_entities = []

for ent, literals in all_loc_entities_hypernyms_literal.items():
    loc_time_count = 0
    for lit in literals:
        if lit in loc_time_hypernyms:
            loc_time_count += 1
            
    loc_time_entities.append((ent, loc_time_count/len(literals)))

In [41]:
sorted(loc_time_entities, key=lambda x: x[1], reverse=True)[:20]

[('aastatel', 1.0),
 ('erialal', 1.0),
 ('tulevikule', 1.0),
 ('aastal', 1.0),
 ('välismaal', 1.0),
 ('välismaale', 1.0),
 ('aastail', 1.0),
 ('aastatest', 1.0),
 ('ööpäevas', 1.0),
 ('Välismaalt', 1.0),
 ('suvel', 1.0),
 ('sügisel', 1.0),
 ('suvelgi', 1.0),
 ('põldudel', 1.0),
 ('muldadel', 1.0),
 ('põllul', 1.0),
 ('tulevikus', 1.0),
 ('Põllul', 1.0),
 ('Esmaspäevast', 1.0),
 ('ümbruses', 1.0)]

### Tagger

In [43]:
from estnltk_patches import ReTagger

In [77]:
for i, sent in enumerate(sentences):
    if len(sent.entities) > 3:
        print(i)

28
213
217
219
222
227
233
249
251
277
284
313
365
366
371
407
423
434
439
468
483
503
513
605
628
636
669
676
697
711
740
748
751
792
793
826
844
846
851
862
868
899
917
929
937
943
978
1004
1030
1051


AttributeError: 'Text' object has no layer 'entities'

In [79]:
sentences[213].entities

layer name,attributes,parent,enveloping,ambiguous,span count
entities,"entity_type, free_entity, is_valid, root",,morph_analysis,False,5

text,entity_type,free_entity,is_valid,root
['reisil'],,,,"Span('reisil', [{'id': 7, 'lemma': 'reis', 'upostag': 'S', 'xpostag': 'S', 'feat ..., type: <class 'estnltk_core.layer.span.Span'>"
"['mille', 'puhul']",,,,"Span('mille', [{'id': 25, 'lemma': 'mis', 'upostag': 'P', 'xpostag': 'P', 'feats ..., type: <class 'estnltk_core.layer.span.Span'>"
['erandkorras'],,,,"Span('erandkorras', [{'id': 29, 'lemma': 'erandkord', 'upostag': 'S', 'xpostag': ..., type: <class 'estnltk_core.layer.span.Span'>"
['välismaale'],,,,"Span('välismaale', [{'id': 30, 'lemma': 'välismaa', 'upostag': 'S', 'xpostag': ' ..., type: <class 'estnltk_core.layer.span.Span'>"
['ravile'],,,,"Span('ravile', [{'id': 31, 'lemma': 'ravi', 'upostag': 'S', 'xpostag': 'S', 'fea ..., type: <class 'estnltk_core.layer.span.Span'>"


In [97]:
retagger = ReTagger(sentences[213].entities)

In [84]:
sentences[213].entities[3].root.form

Unnamed: 0,form
0,sg all


In [85]:
sentences[213].entities[3].root.lemma

'välismaa'

In [87]:
wn['välismaa'][0].hypernyms

["Synset('rahvusriik.n.01')"]

In [102]:
retagger.tag_adverb_type()

[(0,
  EnvelopingSpan(['reisil'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}]),
  None),
 (1,
  EnvelopingSpan(['mille', 'puhul'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}]),
  None),
 (2,
  EnvelopingSpan(['erandkorras'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}]),
  None),
 (3,
  EnvelopingSpan(['välismaale'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}]),
  'LOC'),
 (4,
  EnvelopingSpan(['ravile'], [{'entity_type': None, 'free_entity': None, 'is_valid': None, 'root': <class 'estnltk_core.layer.span.Span'>}]),
  None)]