### Imports and extracting the data

In [1]:
from estnltk import Text
from estnltk_neural.taggers import StanzaSyntaxTagger
from estnltk_patches import EntityTagger
from estnltk_patches import SyntaxTree

In [2]:
import random
from os import listdir
from collections import Counter, defaultdict
from estnltk.corpus_processing.parse_koondkorpus import parse_tei_corpus

In [3]:
# Syntax tagger built with its default settings; it is called on every
# sentence in the tagging section below.
stanza = StanzaSyntaxTagger()

In [4]:
# Maaleht issues that make up the working corpus, in load order.
corpus_files = [
    'Maaleht/ML/maaleht/aja_maaleht_2001/aja_ml_2001_32.tasak.xml',
    'Maaleht/ML/maaleht/maaleht_2003/aja_ml_2003_31.xml',
    'Maaleht/ML/maaleht/maaleht_2002/aja_ml_2002_46.xml',
    'Maaleht/ML/maaleht/aja_maaleht_2001/aja_ml_2001_26.xml',
    'Maaleht/ML/maaleht/maaleht_2002/aja_ml_2002_03.xml',
]

# Concatenate the parsed items of every file into one flat list.
result = []
for corpus_file in corpus_files:
    result.extend(parse_tei_corpus(corpus_file))

In [5]:
# Total number of items parsed from the five corpus files.
len(result)

225

In [6]:
# Every sentence of every parsed document becomes its own Text object,
# rebuilt by joining the sentence's `.text` pieces with single spaces.
sentences = []

for document in result:
    document.tag_layer()
    sentences.extend(
        Text(" ".join(sentence_span.text))
        for sentence_span in document.sentences
    )

### Tagging the sentences with Stanza and OBL extractor

In [7]:
# Two passes per sentence: add the 'morph_extended' layer first, then run
# the Stanza syntax tagger on the same Text object.
for sentence_text in sentences:
    sentence_text.tag_layer('morph_extended')
    stanza(sentence_text)

In [8]:
# Entity tagger restricted to the 'obl' dependency relation; its spans are
# written to a layer named 'entities'.
obl_extractor = EntityTagger(deprel='obl', output_layer='entities')

In [9]:
# Run the OBL extractor on every sentence. Tagging stays best-effort (one
# failing sentence must not abort the whole run), but only ordinary
# exceptions are tolerated, so Ctrl-C still interrupts the loop, and every
# failure is recorded instead of being silently dropped.
obl_tagging_failures = {}  # sentence index -> repr of the raised exception

for sent_idx, sent in enumerate(sentences):
    try:
        obl_extractor(sent)
    except Exception as err:
        obl_tagging_failures[sent_idx] = repr(err)

In [10]:
# List the sentences that ended up without an 'entities' layer.
# Layer presence is tested explicitly rather than by triggering and
# swallowing an exception, and the indices are kept for later inspection.
sentences_without_entities = []

for i, sent in enumerate(sentences):
    if 'entities' not in sent.layers:
        sentences_without_entities.append(i)
        print(i)

1118
2534


In [11]:
# Show the last sentence with all of its layers. It is indexed explicitly
# instead of relying on a loop variable leaked from an earlier cell.
sentences[-1]

text
"Lisaks senistele keskkonna , ettevõtluse , turismi ja inimestevahelise suhtluse töörühmale moodustati nüüd ka transpordialane töörühm , mis hakkab otsima lahendusi saarte ja mandri vahelise ühenduse paremaks korraldamiseks ."

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,29
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,29
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,29
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,29
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc",morph_analysis,,False,29
entities,"entity_type, free_entity, is_valid, root",,morph_analysis,False,4


### Finding all obl entities in inessive

In [12]:
# Collect the root lemma of every OBL entity whose root is in the inessive.
#   all_entities_in               -- root lemmas only
#   all_entities_in_with_text_id  -- (sentence index, root lemma) pairs
all_entities_in = []
all_entities_in_with_text_id = []

# Form tags of the root that count as inessive (singular and plural).
inessive_forms = ('sg in', 'pl in')

for i, sent in enumerate(sentences):
    # Sentences on which entity extraction failed have no 'entities' layer.
    # Checking for the layer avoids hard-coding their indices, which would
    # break as soon as the corpus or the tagger changes.
    if 'entities' not in sent.layers:
        continue
    for span in sent['entities']:
        root = span.root
        if root.form[0] in inessive_forms:
            all_entities_in_with_text_id.append((i, root.lemma))
            all_entities_in.append(root.lemma)

In [13]:
# Peek at the first three (sentence index, root lemma) pairs.
all_entities_in_with_text_id[:3]

[(20, 'vald'), (20, 'perekond'), (21, 'põllunduskool')]

In [14]:
# Inspect sentence 21, one of the sentences listed in the sample above.
sentences[21]

text
"Hariduse omandas Kõljala põllunduskoolis , Jäneda põllumajandustehnikumis ning kaugõppes Eesti Põllumajanduse Akadeemias ."

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,13
compound_tokens,"type, normalized",,tokens,False,0
words,normalized_form,,,True,13
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,13
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,13
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc",morph_analysis,,False,13
entities,"entity_type, free_entity, is_valid, root",,morph_analysis,False,2


#### Arrange by count

In [15]:
# List of (lemma, frequency) pairs for the inessive root lemmas, ordered
# from most to least frequent.
count = Counter(all_entities_in).most_common()

In [16]:
# Ten most frequent inessive root lemmas.
count[:10]

[('Eesti', 92),
 ('riik', 31),
 ('vald', 30),
 ('mis', 18),
 ('linn', 18),
 ('see', 16),
 ('mõte', 15),
 ('ulatus', 15),
 ('lõpp', 15),
 ('maailm', 14)]

In [25]:
# First lemma reading of the first word in the first sentence.
sentences[0].words[0].lemma[0]

'08.08.2001'

#### Arrange by fraction of inessive in all words

In [27]:
# Frequency of every lemma over all words of all sentences. Only the first
# lemma reading of each word is counted.
all_word_counts = defaultdict(int)

first_lemmas = (
    token.lemma[0]
    for sentence_text in sentences
    for token in sentence_text.words
)
for first_lemma in first_lemmas:
    all_word_counts[first_lemma] += 1

In [28]:
# How often the lemma 'Eesti' occurs among all words.
all_word_counts['Eesti']

764

In [29]:
# Number of distinct inessive root lemmas.
len(count)

733

In [61]:
# For each inessive root lemma, compute the share of its inessive OBL
# occurrences among all occurrences of that lemma in the corpus.
frac_of_in = []

for lemma, inessive_count in count:
    # The entity root lemma does not always equal the first lemma reading
    # used as the key in all_word_counts, so some lemmas have no total.
    # `.get` is used so that the lookup neither raises nor inserts a
    # zero-count key into the defaultdict; such lemmas are skipped.
    total_count = all_word_counts.get(lemma, 0)
    if total_count == 0:
        continue
    frac_of_in.append((lemma, inessive_count / total_count))

In [62]:
# Number of lemmas for which a fraction could be computed.
len(frac_of_in)

730

In [63]:
# Twenty lemmas with the highest inessive share, in descending order.
sorted(frac_of_in, key=lambda x: x[1], reverse=True)[:20]

[('eesots', 1.0),
 ('kataloog', 1.0),
 ('sinimägi', 1.0),
 ('alluvus', 1.0),
 ('eraomand', 1.0),
 ('tall', 1.0),
 ('statu', 1.0),
 ('lood', 1.0),
 ('eluhoone', 1.0),
 ('laast', 1.0),
 ('lähiümbrus', 1.0),
 ('sisepoliitika', 1.0),
 ('raviküsimus', 1.0),
 ('messikeskus', 1.0),
 ('põllunduskool', 1.0),
 ('majandusvaldkond', 1.0),
 ('päevasärk', 1.0),
 ('aastakogu', 1.0),
 ('suusatamine', 1.0),
 ('turbaraba', 1.0)]

#### Error detection

In [33]:
# Print the frequency entry recorded for the suspicious lemma 'koha'.
for lemma, frequency in count:
    if lemma != 'koha':
        continue
    print((lemma, frequency))

('koha', 6)


In [35]:
# Print every OBL entity whose root lemma is 'koha', together with the
# index of the sentence it comes from.
for i, sent in enumerate(sentences):
    # Sentences without an 'entities' layer (extraction failed) are skipped
    # by checking for the layer instead of hard-coding their indices.
    if 'entities' not in sent.layers:
        continue
    for span in sent['entities']:
        lem = span.root.lemma
        if lem == 'koha':
            print(i, span.text)

1234 ['põllumajandusministri', 'kohast']
1259 ['õiges', 'kohas']
3200 ['alles', '14.', 'kohal']
3254 ['kogu', 'kohale']
3383 ['avalikus', 'kohas']
3810 ['kahekordsest', 'kohast']
4382 ['13', 'kohast', '8']
5214 ['kõrvalises', 'kohas']
5661 ['kohast', ',', 'kust', 'see', 'pole', 'minu', 'arust', 'majanduslikult', 'põhjendatud']
6193 ['kohas', ',', 'kus', 'ei', 'ole', 'olnud', 'ruutjalga', 'tühja', 'maad', 'ilma', 'inimluu', 'killuta']
8893 ['samas', 'kohas']
9239 ['tähtsal', 'kohal']
9964 ['kohas']


In [45]:
# Raw text of sentence 1259, one of the 'koha' hits listed above.
sentences[1259].text

'“ U vas kukuruza ne paidjot , ” meenutab Männik õigel ajal ja õiges kohas põllule sokutatud Hruštšovi sõnu .'

In [48]:
# All lemma readings of word 14 in sentence 1259.
sentences[1259].words[14].lemma

Unnamed: 0,lemma
0,koht
1,koha


In [39]:
# All lemma readings of word 10 in sentence 1234.
sentences[1234].words[10].lemma

Unnamed: 0,lemma
0,koht
1,koha


In [52]:
# Root lemma chosen for the second entity of sentence 1259.
sentences[1259].entities[1].root.lemma

'koha'

#### Arrange by fraction of inessive in all obl

In [53]:
# Frequency of every root lemma over all OBL entities, whatever the case.
all_obl_word_counts = defaultdict(int)

for i, sent in enumerate(sentences):
    # Sentences without an 'entities' layer (extraction failed) are skipped
    # by checking for the layer instead of hard-coding their indices.
    if 'entities' not in sent.layers:
        continue
    for span in sent['entities']:
        lem = span.root.lemma
        all_obl_word_counts[lem] += 1

In [54]:
# Share of inessive occurrences among all OBL occurrences of each root lemma.
frac_of_in_obl = [
    (lemma, inessive_count / all_obl_word_counts[lemma])
    for lemma, inessive_count in count
]

In [56]:
# Twenty lemmas with the highest inessive share among OBL entities,
# in descending order.
sorted(frac_of_in_obl, key=lambda x: x[1], reverse=True)[:20]

[('ulatus', 1.0),
 ('farm', 1.0),
 ('eesots', 1.0),
 ('kõnekoda', 1.0),
 ('kataloog', 1.0),
 ('kirg', 1.0),
 ('bussijaam', 1.0),
 ('sinimägi', 1.0),
 ('maapiirkond', 1.0),
 ('kommentaar', 1.0),
 ('erandkord', 1.0),
 ('asutus', 1.0),
 ('ööpäev', 1.0),
 ('kümnevõistlus', 1.0),
 ('instituut', 1.0),
 ('osariik', 1.0),
 ('alluvus', 1.0),
 ('oblast', 1.0),
 ('kontor', 1.0),
 ('regioon', 1.0)]

In [58]:
# Print every OBL entity whose root lemma is 'ulatus', together with the
# index of the sentence it comes from.
for i, sent in enumerate(sentences):
    # Sentences without an 'entities' layer (extraction failed) are skipped
    # by checking for the layer instead of hard-coding their indices.
    if 'entities' not in sent.layers:
        continue
    for span in sent['entities']:
        lem = span.root.lemma
        if lem == 'ulatus':
            print(i, span.text)

249 ['vajaliku', 'summa', 'ulatuses']
2016 ['ulatuses', ',', 'milles', 'see', 'muutub', 'teeniva', 'kinnisasja', 'suhtes', 'koormavamaks']
2165 ['100%', ',', 'vaid', '80%', 'ulatuses']
2176 ['euro', 'ulatuses']
2315 ['teatud', 'ulatuses']
2531 ['100%', 'ulatuses']
2533 ['valitsuse', 'kehtestatud', 'maksimaalmäära', 'ulatuses']
3460 ['ulatuses']
3462 ['ulatuses']
3467 [',', 'mille', 'ulatuses']
3884 ['täies', 'ulatuses']
4041 ['70%', 'ulatuses']
4531 ['millises', 'ulatuses']
8623 ['mõistlikus', 'ulatuses']
9520 ['täies', 'ulatuses']


In [60]:
# Inspect sentence 9520, one of the 'ulatus' hits listed above.
sentences[9520]

text
"Kooliõpetajad ja - direktorid saavad ületunde teha veel 1. septembrini , siis rakendub seadus täies ulatuses ka nendele ."

layer name,attributes,parent,enveloping,ambiguous,span count
sentences,,,words,False,1
tokens,,,,False,20
compound_tokens,"type, normalized",,tokens,False,1
words,normalized_form,,,True,19
morph_analysis,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech",words,,True,19
morph_extended,"normalized_text, lemma, root, root_tokens, ending, clitic, form, partofspeech, punctuation_type, pronoun_type, letter_case, fin, verb_extension_suffix, subcat",morph_analysis,,True,19
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc",morph_analysis,,False,19
entities,"entity_type, free_entity, is_valid, root",,morph_analysis,False,3
