# Collection `koondkorpus_words_v16_1`

This collection is created with the script [create_koondkorpus_words_v16_1.py](create_koondkorpus_words_v16_1.py) (commit `ebc119761de38bd86028f6e9ee96df70e36d7a9d`).

For every word in `v16_1_words` layer of `koondkorpus_base` collection there is an element in the `koondkorpus_words_v16_1` collection. This element is a sentence that contains the word with `target` layer that marks the position of the word in the sentence and has the `normalized_form` attribute. The word text string and the normalized form is also saved as the collection metadata in the `word` and `normalized_form` columns.

There are 5350453 objects in the collection.

Connect to the collection.

In [1]:
from estnltk.storage import PostgresStorage

storage = PostgresStorage(pgpass_file='~/.pgpass',
                          dbname='estonian-text-corpora',
                          schema='estonian_text_corpora',
                          role='estonian_text_corpora_create')

collection = storage.get_collection('koondkorpus_words_v16_1')

INFO:db.py:1222: connecting to host: 'postgres.keeleressursid.ee', port: '5432', dbname: 'estonian-text-corpora', user: 'liisitor'
INFO:db.py:1234: role: 'estonian_text_corpora_create'


  """)


Create a collection iterator.

In [2]:
iter_collection = collection.select(layers=['target'], collection_meta=['word', 'normalized_form'])

# Get pronouns from collection

Creates a dict of pronouns where key is normalized pronoun and values are analyses.

`"pronoun": [{analysis}, {analysis}]`, where `analysis == {{collect_info}, {pronoun_analysis}}`

`collect_info` - all information from collection

`pronoun_analysis` - pronountagger output


In [None]:
from estnltk.taggers import VabamorfAnalyzer
from estnltk.taggers import PostMorphAnalysisTagger
from estnltk.taggers import PronounTypeTagger
from collections import defaultdict
from estnltk import Text

pronoun_tagger = PronounTypeTagger()
morph_analyzer = VabamorfAnalyzer()
postanalysis_tagger = PostMorphAnalysisTagger()

pronouns = defaultdict(list)

for key, text, meta in iter_collection:
    text.analyse('segmentation')
    text.tag_layer(['tokens', 'compound_tokens'])
    morph_analyzer.tag(text)
    postanalysis_tagger.retag(text)
    pronoun_tagger.tag(text)
    for word in text.pronoun_type:
        if word.start == text.target.start:  # kas lauses oleva sõna algus on sama, mis target sõnal
            for i, pos in enumerate(word.partofspeech):
                if pos == 'P':
                    collect = {'key': key,
                               'text': text.text,
                               'meta_norm': meta['normalized_form'],
                               'meta_word': meta['word'],
                               'target': text.target.text,
                               'target_start': text.target.start,
                               'target_end': text.target.end}
                    pron_analysis = {
                        'pron_lemma': word.lemma[i],
                        'pron_form': word.form[i],
                        'pron_pos': pos,
                        'pron_type': word.pronoun_type[i],
                        'pron_root': word.root_tokens[i]}

                    if pronouns[word.text] == []:
                        pronouns[word.text].append({'collect_info': collect, 'pronoun_analysis': pron_analysis})
                    else:
                        analysis_exists = False
                        for analysis in pronouns[word.text]:
                            if pron_analysis == analysis['pronoun_analysis']:
                                analysis_exists = True
                                continue
                            if not analysis_exists:
                                pronouns[word.text].append({'collect_info': collect, 'pronoun_analysis': pron_analysis})
                                analysis_exists = False

In [None]:
import json
with open('koond16pronouns.json', 'w') as f:
    for chunk in json.JSONEncoder().iterencode(pronouns):
        f.write(chunk)

Close the connection.

In [None]:
storage.close()

In [3]:
import json
with open('koond16pronouns.json', 'r') as inf:
    pronouns_from_json = json.load(inf)

In [12]:
for key, values in sorted(pronouns_from_json.items()):
    if key == 'mingi':
        print(key)
        for i, value in enumerate(values):
            print("analysis_%s: %s\n" % (i+1, value))

mingi
analysis_1: {'collect_info': {'meta_norm': None, 'target': 'mingi', 'target_start': 46, 'target_end': 51, 'key': '931', 'text': 'See oli juba 2000. aasta algus , kohvikus oli mingi moeüritus .', 'meta_word': 'mingi'}, 'pronoun_analysis': {'pron_pos': 'P', 'pron_lemma': 'mingi', 'pron_form': 'sg g', 'pron_root': ['mingi'], 'pron_type': ['indef']}}

analysis_2: {'collect_info': {'meta_norm': None, 'target': 'mingi', 'target_start': 46, 'target_end': 51, 'key': '931', 'text': 'See oli juba 2000. aasta algus , kohvikus oli mingi moeüritus .', 'meta_word': 'mingi'}, 'pronoun_analysis': {'pron_pos': 'P', 'pron_lemma': 'mingi', 'pron_form': 'sg n', 'pron_root': ['mingi'], 'pron_type': ['indef']}}

analysis_3: {'collect_info': {'meta_norm': None, 'target': 'mingi-17', 'target_start': 12, 'target_end': 20, 'key': '5300295', 'text': 'bad_blondy: mingi-17', 'meta_word': 'mingi-17'}, 'pronoun_analysis': {'pron_pos': 'P', 'pron_lemma': 'mingi', 'pron_form': 'sg n', 'pron_root': ['mingi'], 'pr

In [32]:
with open('pronoun_analysis_16_new.txt', 'a') as m:
    for key, values in sorted(pronouns_from_json.items()):
        m.write("%s %s\n" % (key, [[value['pronoun_analysis']['pron_lemma'], value['pronoun_analysis']['pron_pos'], value['pronoun_analysis']['pron_form'], value['pronoun_analysis']['pron_type']] for value in values]))