# Create a list of all the tokens in the corpora
## From koondkorpus zip file using EstNLTK 1.4

Download the following files to the folder `corpora`.
  * http://ats.cs.ut.ee/keeletehnoloogia/estnltk/koond.zip
  * http://www.cl.ut.ee/korpused/segakorpus/slohtuleht/failid/xml/SLOleht.tar.gz

Run
```bash
tar -xzf corpora/SLOleht.tar.gz -C corpora
unzip corpora/koond.zip -d corpora
mv corpora/SLOleht corpora/SLOleht_xml
mkdir corpora/SLOleht
python estnltk/estnltk/examples/convert_koondkorpus.py corpora/SLOleht_xml/ corpora/SLOleht/ -e utf_8
python koondkorpus-experiments/extract_tokens.py corpora/koond corpora/SLOleht --out corpora
```
The output files are written to the folder `corpora`.


| corpora     | files  | size     |
|-------------|--------|----------|
| koond       | 527730 |   1.8 GB |
| SLOleht_xml |   3314 | 429.8 MB |
| SLOleht     |        |          |




### Additional information
EstNLTK 1.4.1 tutorial on working with TEI corpora: https://estnltk.github.io/estnltk/1.4.1/tutorials/tei.html

## From koondkorpus PostgreSQL collection using EstNLTK 1.6

In [1]:
#import sys; sys.path = ["/home/liisi/Documents/estnltk/est1.6"] + sys.path

In [2]:
from estnltk.storage import PostgresStorage

# Open the PostgreSQL storage that holds the corpus collections.
# NOTE(review): host and user are not passed here; presumably they are
# resolved from the pgpass file -- confirm.
storage = PostgresStorage(
    pgpass_file='~/.pgpass',
    dbname='estonian-text-corpora',
    schema='estonian_text_corpora',
    role='estonian_text_corpora_create',
)

# Collection whose tokens are counted in the cells below.
collection = storage.get_collection('test_korpus_2')

INFO:db.py:1164: connecting to host: 'postgres.keeleressursid.ee', port: '5432', dbname: 'estonian-text-corpora', user: 'liisitor'
INFO:db.py:1176: role: 'estonian_text_corpora_create'


In [3]:
# Count the occurrences of each `text` value in the `words` layer of the
# collection; progress is reported with the notebook progress bar.
# NOTE(review): the result is used like a collections.Counter below
# (`most_common`, iteration over keys) -- confirm the exact return type.
counter = collection.count_values(layer='words', attr='text', progressbar='notebook')

HBox(children=(IntProgress(value=0), HTML(value='')))




In [4]:
# Display the ten most frequent tokens together with their counts.
counter.most_common(10)

[('.', 2613),
 (',', 2580),
 ('on', 852),
 ('ja', 837),
 ('et', 380),
 ('"', 358),
 ('ka', 284),
 ('ei', 261),
 ('kui', 222),
 ('(', 201)]

In [5]:
# Join the distinct tokens, in sorted order, into one space-separated string
# so that they can be analysed as a single text.
words = ' '.join(sorted(counter))

In [6]:
from estnltk.taggers import PronounTypeTagger
from estnltk import Text

# Tagger that produces the `pronoun_type` layer (it depends on `morph_analysis`).
tagger = PronounTypeTagger()
# Bare expression: shows the tagger summary as the cell output.
tagger

name,layer,attributes,depends_on
PronounTypeTagger,pronoun_type,"(lemma, root, root_tokens, ending, clitic, form, partofspeech, pronoun_type)",[morph_analysis]


In [7]:
# Analyse the space-joined token list as one text.
# NOTE(review): presumably tag_layer() without arguments adds the default
# layers, including the `morph_analysis` layer the tagger depends on -- confirm.
# NOTE(review): the joined string is tokenized again here, so the resulting
# spans may not correspond one-to-one to the original tokens -- verify.
wordst = Text(words).tag_layer()
# Add the `pronoun_type` layer; the return value of tag() is not used.
tagger.tag(wordst)
words2 = wordst['pronoun_type']
# Bare expression: shows the layer as the cell output.
words2

layer name,attributes,parent,enveloping,ambiguous,span count
pronoun_type,"lemma, root, root_tokens, ending, clitic, form, partofspeech, pronoun_type",morph_analysis,,True,14687

text,lemma,root,root_tokens,ending,clitic,form,partofspeech,pronoun_type
!,!,!,"('!',)",,,,Z,
"""","""","""","('""',)",,,,Z,
&,&,&,"('&',)",0,,,J,
(,(,(,"('(',)",,,,Z,
),),),"(')',)",,,,Z,
*,*,*,"('*',)",,,,Z,
AS-is,AS,AS,"('AS',)",is,,sg in,Y,
*,*,*,"('*',)",,,,Z,
Keskhaiglas,keskhaigla,kesk_haigla,"('kesk', 'haigla')",s,,sg in,S,
*,*,*,"('*',)",,,,Z,


In [23]:
from collections import defaultdict

# Map every surface token whose first analysis is a pronoun ('P') to the list
# of its analyses, each stored as [lemma, partofspeech, form, pronoun_type].
meow = defaultdict(list)
for token, form, lemma, postag, ptype in zip(
    words2.text,
    words2.form,
    words2.lemma,
    words2.partofspeech,
    words2.pronoun_type,
):
    # The layer is ambiguous, so each attribute holds one value per analysis.
    # NOTE(review): only the first analysis decides whether the token is kept;
    # a token whose pronoun reading is not listed first is skipped, and
    # non-pronoun readings of a kept token are stored as well -- confirm that
    # this is intended.
    if not postag or postag[0] != 'P':
        continue
    # Walk the parallel per-analysis sequences in lockstep instead of indexing.
    for lemma_i, postag_i, form_i, ptype_i in zip(lemma, postag, form, ptype):
        meow[token].append([lemma_i, postag_i, form_i, ptype_i])

In [24]:
# Display the collected tokens with their analyses, ordered by token.
sorted(meow.items())

[('Enda', [['ise', 'P', 'sg g', ('pos', 'det', 'refl')]]),
 ('Iga', [['iga', 'P', 'sg n', ('det',)]]),
 ('Igaks', [['iga', 'P', 'sg tr', ('det',)]]),
 ('Igal', [['iga', 'P', 'sg ad', ('det',)]]),
 ('Ise',
  [['ise', 'P', 'pl n', ('pos', 'det', 'refl')],
   ['ise', 'P', 'sg n', ('pos', 'det', 'refl')]]),
 ('Keegi', [['keegi', 'P', 'sg n', ('indef',)]]),
 ('Kel', [['kes', 'P', 'sg ad', ('inter_rel',)]]),
 ('Kelle',
  [['kes', 'P', 'pl g', ('inter_rel',)],
   ['kes', 'P', 'sg g', ('inter_rel',)]]),
 ('Kellega',
  [['kes', 'P', 'pl kom', ('inter_rel',)],
   ['kes', 'P', 'sg kom', ('inter_rel',)]]),
 ('Kellele',
  [['kes', 'P', 'pl all', ('inter_rel',)],
   ['kes', 'P', 'sg all', ('inter_rel',)]]),
 ('Kes',
  [['kes', 'P', 'pl n', ('inter_rel',)],
   ['kes', 'P', 'sg n', ('inter_rel',)]]),
 ('Kõigi', [['kõik', 'P', 'pl g', ('det',)]]),
 ('Kõigil', [['kõik', 'P', 'pl ad', ('det',)]]),
 ('Kõigis', [['kõik', 'P', 'pl in', ('det',)]]),
 ('Kõik', [['kõik', 'P', 'pl n', ('det',)], ['kõik', 'P', '

In [25]:
# Close the storage that was opened at the start of this notebook section.
# NOTE(review): presumably this releases the database connection -- confirm.
storage.close()