In [1]:
from collections import defaultdict
from random import Random
from random import sample

import pandas
from IPython.display import clear_output
from IPython.display import display

from estnltk.rewriting.premorph.morph_analyzed_token import MorphAnalyzedToken

# Pronouns
Find all tokens that either `Vabamorf` or `MorphAnalyzedToken` considers to be a pronoun. Save the results in a CSV file.

In [4]:
# Collect every token from the word list that is pronoun-like according to at
# least one of two checks, and record the verdict of each check separately:
#   'vabamorf'   - 'P' is among the token's part-of-speech tags,
#   'is_pronoun' - the normalised token reports itself as a pronoun.
pronouns = []
# Read the word list explicitly as UTF-8, like the other cells of this notebook,
# so that decoding does not depend on the platform's default encoding.
with open('../temp/wordlist', encoding='utf_8') as in_f:
    for i, line in enumerate(in_f):
        token = MorphAnalyzedToken(line.strip())
        # Evaluate each check once and reuse the result for both the filter
        # and the stored record.
        tagged_as_pronoun = 'P' in token._part_of_speeches
        reports_pronoun = token.normal.is_pronoun
        if tagged_as_pronoun or reports_pronoun:
            pronouns.append({'token': token,
                             'vabamorf': tagged_as_pronoun,
                             'is_pronoun': reports_pronoun})
            # Progress indicator: refresh the displayed line number and token
            # after the 1st, 501st, 1001st, ... collected token.
            if len(pronouns) % 500 == 1:
                clear_output()
                display('{} {}'.format(i, token))

df = pandas.DataFrame.from_records(pronouns, columns=['token', 'vabamorf', 'is_pronoun'])
out_file = 'results/pronouns.csv'
df.to_csv(out_file, index=False)
# Remove the last progress line before printing the summary.
clear_output()
print(len(pronouns), 'pronoun-like tokens written to the file', out_file)
df.head(10)

7418 pronoun-like tokens written to the file results/pronouns.csv


Unnamed: 0,token,vabamorf,is_pronoun
0,-END-,False,True
1,-Esimene,True,True
2,-Esimese,True,True
3,-Esimeses,True,True
4,-Esimest,True,True
5,-Iga,True,True
6,-Igal,True,True
7,-Igalt,True,True
8,-Ise,True,True
9,-Keda,True,True


# Interesting tokens

A token is interesting if
 - it contains a hyphen '`-`',
 - it is a word if hyphens are removed,
 - all hyphen-separated parts are words,
 - it contains a hyphen-separated part that consists of 2 or 3 letters and
 - it is not a pronoun.

Being a word is defined by `Token.is_word` and being a pronoun is defined by `Token.is_pronoun`.
 
Find all interesting tokens and save the results in a file.

In [3]:
# Collect the "interesting" tokens from the word list: hyphenated tokens that
# pass every filter below and are not pronouns.
#
# NOTE(review): `Token` is not imported anywhere in the visible notebook (only
# `MorphAnalyzedToken` is), so this cell raises NameError on a fresh kernel.
# Presumably it is an earlier name of, or a sibling to, `MorphAnalyzedToken`
# - confirm and add the matching import to the import cell.
result = []
with open('../temp/wordlist', 'r', encoding='utf_8') as in_f:
    for i, line in enumerate(in_f):
        if '-' in line:
            token = Token(line.strip())
            parts = token.split('-')
            # At least one hyphen-separated part must consist of 2 or 3 letters.
            if not any(1 < len(part) < 4 and part.isalpha() for part in parts):
                continue
            # The token with its hyphens removed must be a word.
            if not token.replace('-', '').is_word:
                continue
            # Every hyphen-separated part must be a word on its own.
            if not all(part.is_word for part in parts):
                continue
            # Pronouns are excluded.
            if token.is_pronoun:
                continue
            result.append(token)
            # Progress indicator: refresh the displayed line number and token
            # after the 1st, 502nd, 1003rd, ... collected token.
            if len(result) % 501 == 1:
                clear_output()
                display('{} {}'.format(i, token))
# Remove the last progress line before writing the results.
clear_output()
out_file = 'results/interesting_tokens.txt'
# Open the path held in `out_file` (rather than repeating the literal) so the
# file that is written and the file that is reported below cannot diverge.
with open(out_file, 'w', encoding='utf_8') as out_f:
    for token in result:
        print(token, file=out_f)

print(len(result), 'interesting tokens written to the file', out_file)

12204 interesting tokens written to the file results/interesting_tokens.txt


Group the interesting tokens by the short (2- or 3-letter) hyphen-separated parts they contain, and count how many tokens contain each short word.

In [4]:
# Map every short hyphen-separated part (2 or 3 alphabetic characters) to the
# list of interesting tokens in which it occurs.
short_words = defaultdict(list)

with open('results/interesting_tokens.txt', 'r', encoding='utf_8') as in_f:
    for line in in_f:
        token = line.strip()
        for part in token.split('-'):
            if 1 < len(part) < 4 and part.isalpha():
                short_words[part].append(token)

# Draw the example tokens from a dedicated, fixed-seed generator so that the
# 'examples' column (and therefore the CSV file) is identical on every run and
# the global random state is left untouched.
rng = Random(0)
rows = []
for word, examples in short_words.items():
    rows.append({'short_word': word,
                 # Number of collected occurrences of this short word.
                 'support': len(examples),
                 # Up to three randomly chosen tokens, sorted for readability.
                 'examples': sorted(rng.sample(examples, min(len(examples), 3)))})

table = pandas.DataFrame.from_records(rows, columns=['short_word', 'support', 'examples'])
# Most frequent short words first.
table = table.sort_values("support", ascending=False)
out_file = 'results/short_words.csv'
table.to_csv(out_file, index=False)
print(len(table), 'short words written to the file', out_file)
table.head(10)

661 short words written to the file results/short_words.csv


Unnamed: 0,short_word,support,examples
239,maa,580,"[maa-algkoolid, maa-aluseid, maa-arstidele]"
58,töö,474,"[töö-lubaduste, töö-otsijatele, võidu-töö]"
336,elu,408,"[elu-asemepoliitikas, elu-stiili, elu-viisi]"
65,üle,404,"[üle-eestine, üle-kümne-tuhandeliste, üle-tree..."
607,ühe,325,"[ühe-kaheaastases, ühe-kahekorruselist, ühe-mi..."
531,tee,262,"[tee-ehituslepingute, tee-ehitusprojektist, te..."
622,vee,262,"[vee-eelnõu, vee-efektiga, vee-motoklubi]"
360,aja,259,"[aja-kirjanikule, aja-lool, ülemineku-aja]"
353,ja,243,"[informaatika-ja, lae-ja-lase, silma-ja]"
308,pea,233,"[pea-staabist, pea-toimetajakonkurss, pea-tree..."
