In [1]:
import pandas
from IPython.display import clear_output
from IPython.display import display
from estnltk.validators.word_validator import Token

# Pronouns
Find all tokens that either `Vabamorf` or `Token.is_pronoun` considers to be a pronoun. Write the results to a CSV file.

In [2]:
# Collect every token that is pronoun-like according to at least one of two
# checks: 'P' being among token.part_of_speeches (recorded in the 'vabamorf'
# column) or Token.is_pronoun (recorded in the 'is_pronoun' column).
pronouns = []
# Read the word list as UTF-8 explicitly, matching how the same file is read
# in the "Interesting tokens" cell; relying on the platform default encoding
# can mis-decode non-ASCII tokens.
with open('../temp/wordlist', 'r', encoding='utf_8') as in_f:
    for i, line in enumerate(in_f):
        token = Token(line.strip())
        # Evaluate each check once and reuse the result for both the filter
        # and the stored record.
        vabamorf_pronoun = 'P' in token.part_of_speeches
        is_pronoun = token.is_pronoun
        if vabamorf_pronoun or is_pronoun:
            pronouns.append({'token': token,
                             'vabamorf': vabamorf_pronoun,
                             'is_pronoun': is_pronoun})
            # Progress indicator: refresh the cell output on the 1st, 502nd,
            # 1003rd, ... collected token.
            if len(pronouns) % 501 == 1:
                clear_output()
                display('{} {}'.format(i, token))


df = pandas.DataFrame.from_records(pronouns, columns=['token', 'vabamorf', 'is_pronoun'])
out_file = 'results/pronouns.csv'
df.to_csv(out_file, index=False)
# Remove the last progress message before printing the summary.
clear_output()
print(len(pronouns), 'pronoun-like tokens written to the file', out_file)
# Show the first ten rows as the cell's output.
df[:10]

6260 pronoun-like tokens written to the file results/pronouns.csv


Unnamed: 0,token,vabamorf,is_pronoun
0,-END-,True,True
1,-Esimene,True,True
2,-Esimese,True,True
3,-Esimeses,True,True
4,-Esimest,True,True
5,-Iga,True,True
6,-Igal,True,True
7,-Igalt,True,True
8,-Ise,True,True
9,-Keda,True,True


# Interesting tokens
Find all tokens
 - that contain a hyphen '`-`',
 - that are words if hyphens removed,
 - where all hyphen-separated parts are words,
 - that contain a hyphen-separated part that consists of 2 or 3 letters and
 - that are not pronouns.
 
 Being a word is defined by `Token.is_word` and being a pronoun is defined by `Token.is_pronoun`.
 
 Write the results to a file.

In [3]:
# Collect hyphenated tokens that pass all of the filters below, then write
# them to a text file, one token per line.
result = []
with open('../temp/wordlist', 'r', encoding='utf_8') as in_f:
    for i, line in enumerate(in_f):
        # Only lines containing a hyphen are candidates; skip the rest
        # before constructing a Token.
        if '-' not in line:
            continue
        token = Token(line.strip())
        parts = token.split('-')
        # At least one hyphen-separated part must be purely alphabetic and
        # 2 or 3 characters long.
        if not any(1 < len(part) < 4 and part.isalpha() for part in parts):
            continue
        # The token with its hyphens removed must be a word.
        if not token.replace('-', '').is_word:
            continue
        # Every hyphen-separated part must itself be a word.
        if not all(part.is_word for part in parts):
            continue
        # Pronouns are excluded.
        if token.is_pronoun:
            continue
        result.append(token)
        # Progress indicator: refresh the cell output on the 1st, 502nd,
        # 1003rd, ... collected token.
        if len(result) % 501 == 1:
            clear_output()
            display('{} {}'.format(i, token))
# Remove the last progress message before printing the summary.
clear_output()
out_file = 'results/interesting_tokens.txt'
# Open the path through out_file so the file written and the file reported
# below cannot drift apart.
with open(out_file, 'w', encoding='utf_8') as out_f:
    for token in result:
        print(token, file=out_f)

print(len(result), 'interesting tokens written to the file', out_file)

12204 interesting tokens written to the file results/interesting_tokens.txt
