In [1]:
from pandas import DataFrame
from pandas import read_csv
from estnltk.vabamorf.morf import synthesize, analyze
from collections import defaultdict, Counter
import re
from IPython.display import clear_output, display

# Postmorph analysis for numeric tokens

## Create a rules file for analyzing numeric tokens

In [2]:
number_words = {
                '0$': {'words': ('null', 'nullis'),
                       'roots': ('null',)},
                '1$|[2-9]1$|([1-9][0-9]*[02-9])1$': {'words': ('üks', 'esimene'),
                       'roots': ('üh', 'esime')},
                '2$|[2-9]2$|([1-9][0-9]*[02-9])2$': {'words': ('kaks', 'teine'),
                       'roots': ('kah', 'teis')},
                '3$|[2-9]3$|([1-9][0-9]*[02-9])3$': {'words': ('kolm', 'kolmas'),
                       'roots': ('kolma', 'kolm')},
                '4$|[2-9]4$|([1-9][0-9]*[02-9])4$': {'words': ('neli', 'neljas'),
                       'roots': ('nel',)},
                '5$|[2-9]5$|([1-9][0-9]*[02-9])5$': {'words': ('viis', 'viies'),
                       'roots': ('vii',)},
                '6$|[2-9]6$|([1-9][0-9]*[02-9])6$': {'words': ('kuus', 'kuues'),
                       'roots': ('kuu',)},
                '7$|[2-9]7$|([1-9][0-9]*[02-9])7$': {'words': ('seitse', 'seitsmes'),
                       'roots': ('seits',)},
                '8$|[2-9]8$|([1-9][0-9]*[02-9])8$': {'words': ('kaheksa', 'kaheksas'),
                       'roots': ('kaheks',)},
                '9$|[2-9]9$|([1-9][0-9]*[02-9])9$': {'words': ('üheksa', 'üheksas'),
                       'roots': ('üheks',)},
                '([1-9][0-9]*)?1[1-9]$': {'words': ('kolmteist', 'kolmeteistkümnes'),
                       'roots': ('kolmeteistküm',)},
                '([1-9][0-9]*)?[1-9]0$': {'words': ('kolmkümmend', 'kolmekümnes'),
                       'roots': ('kolmeküm',)},
                '([1-9][0-9]*)?[1-9]00$': {'words': ('sada', 'sajas'),
                       'roots': ('sad', 'sa')},
                '([1-9][0-9]*)?[1-9]0{3,5}$': {'words': ('tuhat', 'tuhandes'),
                       'roots': ('tuhan','tuha')},
                '([1-9][0-9]*)?[1-9]0{6,8}(0{6}0*)?$': {'words': ('miljon', 'miljones'),
                                               'roots': ('miljon',)},
                '([1-9][0-9]*)?[1-9]0{9,11}$': {'words': ('miljard', 'miljardes'),
                       'roots': ('miljard',)}
               }

forms =  ['sg n', 'pl n', 'sg g', 'pl g', 'sg p', 'pl p', 'sg ill', 'pl ill', 'adt', 'sg in', 'pl in',
          'sg el', 'pl el', 'sg all', 'pl all', 'sg ad', 'pl ad', 'sg abl', 'pl abl', 'sg tr',
          'pl tr', 'sg ter', 'pl ter', 'sg es', 'pl es', 'sg ab', 'pl ab', 'sg kom', 'pl kom']

for number, data in number_words.items():
    data['analyses'] = {}
    for form in forms:
        for number_word in data['words']:
            for synt in synthesize(number_word, form):
                analysis = analyze([synt], disambiguate=False, guess=False, propername=False)[0]['analysis']
                for a in analysis:
                    if a['partofspeech'] in {'N', 'O'}:
                        data['analyses'].setdefault(synt, set()).add((a['partofspeech'], a['form'], a['ending']))

for number, data in number_words.items():
    data['suffixes'] = {}
    for word_form in data['analyses']:
        for root in data['roots']:
            ending = word_form.partition(root)[-1]
            if ending:
                for i in range(len(ending)):
                    data['suffixes'][ending[i:]] = set()
                break
    for ending, analyses in data['suffixes'].items():
        for word_form, a in data['analyses'].items():
            if word_form.endswith(ending):
                for pos, form, suffix in a:
                    if pos!='N' or form!='sg n':
                        analyses.add((pos, form, suffix))
    data['suffixes'][''] = {('N', '?', '0')}
    
    
table = []
for number, data in number_words.items():
    for ending, analyses in data['suffixes'].items():
        for pos, form, normal_ending in analyses:
            table.append({
                        'number': number,
                        'suffix': ending,
                        'pos': pos,
                        'form': form,
                        'ending': normal_ending
            })

df = DataFrame.from_records(table, columns=['number', 'suffix', 'pos', 'form', 'ending'])
df = df.sort_values(['number', 'pos', 'form', 'ending'])

df.to_csv('results/number_analysis_rules.csv', index=False)

print(len(df), 'lines')
df[:10]

5873 lines


Unnamed: 0,number,suffix,pos,form,ending
3901,([1-9][0-9]*)?1[1-9]$,,N,?,0
3799,([1-9][0-9]*)?1[1-9]$,eta,N,pl ab,teta
3812,([1-9][0-9]*)?1[1-9]$,ta,N,pl ab,teta
3920,([1-9][0-9]*)?1[1-9]$,neteta,N,pl ab,teta
3976,([1-9][0-9]*)?1[1-9]$,teta,N,pl ab,teta
3984,([1-9][0-9]*)?1[1-9]$,eteta,N,pl ab,teta
4144,([1-9][0-9]*)?1[1-9]$,a,N,pl ab,teta
3956,([1-9][0-9]*)?1[1-9]$,lt,N,pl abl,ilt
3964,([1-9][0-9]*)?1[1-9]$,ilt,N,pl abl,ilt
3986,([1-9][0-9]*)?1[1-9]$,eilt,N,pl abl,ilt


In the rules file [results/number_analysis_rules.csv](results/number_analysis_rules.csv) **pos**, **form** and **ending** cells contain the morphological analysis for the tokens that match **number** with **suffix**.

## Load the rules from the file

In [3]:
def load_number_analysis_rules(file):
    df = read_csv(file, na_filter=False)
    rules = defaultdict(dict)
    for _, r in df.iterrows():
        if r.suffix not in rules[r.number]:
            rules[r.number][r.suffix] = []
        rules[r.number][r.suffix].append({'partofspeech': r.pos, 'form': r.form, 'ending':r.ending})
    return rules

rules = load_number_analysis_rules('results/number_analysis_rules.csv')

## From the koondkorpus tokens find examples and support for the rules

In [11]:
token_count = 0
not_alpha_count = 0

numeric = re.compile('-?(\d+)-?(\D*)$')
examples_good = {}
examples_good_support = Counter()
examples_bad = {}
with open('../temp/wordlist', 'r', encoding='utf_8') as in_f:
    for i, line in enumerate(in_f):
        token_count += 1
        token = line.strip()
        if not token.isalpha():
            not_alpha_count += 1
            m = numeric.match(token)
            if m:
                good_token = False
                number = m.group(1) 
                suffix = m.group(2)
                for number_re in rules:
                    if re.match(number_re, number):
                        if suffix in rules[number_re]:
                            good_token = True
                            examples_good[(number_re, suffix)] = token
                            examples_good_support[(number_re, suffix)] += 1
                if not good_token:
                    examples_bad[(number, suffix)] = token
        if i % 10000 == 0:
            clear_output()
            display('{} {}'.format(i, token))
                
clear_output()

print(len(examples_good), 'good examples')
print(len(examples_bad),  'numeric examples not covered by the rules')
print('token_count:', token_count)
print('not_alpha_count:', not_alpha_count)

df = read_csv('results/number_analysis_rules.csv', na_filter=False)
example = []
support = []
for _, r in df.iterrows():
    example.append(examples_good.get((r.number, r.suffix), ''))
    support.append(examples_good_support.get((r.number, r.suffix), ''))
df['example'] = example
df['support'] = support

df.to_csv('results/number_analysis_rules_with_examples.csv', index=False)
df[:10]

697 good examples
50873 numeric examples not covered by the rules
token_count: 5000215
not_alpha_count: 732035


Unnamed: 0,number,suffix,pos,form,ending,example,support
0,([1-9][0-9]*)?1[1-9]$,,N,?,0,9918,2016.0
1,([1-9][0-9]*)?1[1-9]$,eta,N,pl ab,teta,,
2,([1-9][0-9]*)?1[1-9]$,ta,N,pl ab,teta,,
3,([1-9][0-9]*)?1[1-9]$,neteta,N,pl ab,teta,,
4,([1-9][0-9]*)?1[1-9]$,teta,N,pl ab,teta,,
5,([1-9][0-9]*)?1[1-9]$,eteta,N,pl ab,teta,,
6,([1-9][0-9]*)?1[1-9]$,a,N,pl ab,teta,912a,49.0
7,([1-9][0-9]*)?1[1-9]$,lt,N,pl abl,ilt,8016-lt,41.0
8,([1-9][0-9]*)?1[1-9]$,ilt,N,pl abl,ilt,,
9,([1-9][0-9]*)?1[1-9]$,eilt,N,pl abl,ilt,,


See [results/number_analysis_rules_with_examples.csv](results/number_analysis_rules_with_examples.csv) for all examples

### Some examples of the tokens that contain numbers but are not covered by the rules

In [5]:
from random import random
for k, v in examples_bad.items():
    if (random() < 0.001):
        print(v, k)

2075kroonisest ('2075', 'kroonisest')
105dC ('105', 'dC')
90b ('90', 'b')
13-le. ('13', 'le.')
5-astmelisel ('5', 'astmelisel')
6-astmeline ('6', 'astmeline')
07922 ('07922', '')
2027. ('2027', '.')
000ga ('000', 'ga')
46m ('46', 'm')
18000EEK ('18000', 'EEK')
732708. ('732708', '.')
8800punktise ('8800', 'punktise')
18-Martin ('18', 'Martin')
160-sentimeetriste ('160', 'sentimeetriste')
42-seks ('42', 'seks')
967. ('967', '.')
1800mAh ('1800', 'mAh')
16biti ('16', 'biti')
30minti ('30', 'minti')
1kl ('1', 'kl')
37aastne ('37', 'aastne')
400-kroonilisel ('400', 'kroonilisel')
64-ruutmeetrine ('64', 'ruutmeetrine')
17lk ('17', 'lk')
90-kilost ('90', 'kilost')
4220. ('4220', '.')
60-dollarise ('60', 'dollarise')
0-režiim ('0', 'režiim')
14aastaseid ('14', 'aastaseid')
880-ruutmeetrises ('880', 'ruutmeetrises')
16-sekundilist ('16', 'sekundilist')
23PL ('23', 'PL')
14-Vicente ('14', 'Vicente')
6676. ('6676', '.')
5199. ('5199', '.')
18-pealisest ('18', 'pealisest')
50-stele ('50', 'stele'

## Analyze numeric tokens

In [6]:
import re
def analyze_number(token):
    m = re.match('-?(\d+)-?(\D*)$', token)
    if not m:
        return []
    m.group(0), 
    number = m.group(1) 
    ending = m.group(2)
    result = []
    for number_re, analyses in rules.items():
        if re.match(number_re, number):
            for analysis in analyses[ending]:
                a = {'lemma':number, 'root':number, 'root_tokens':[number], 'clitic':''}
                a.update(analysis)
                result.append(a)
    return result

analyze_number('11eteta')

[{'clitic': '',
  'ending': 'teta',
  'form': 'pl ab',
  'lemma': '11',
  'partofspeech': 'N',
  'root': '11',
  'root_tokens': ['11']}]

In [7]:
analyze_number('11')

[{'clitic': '',
  'ending': '0',
  'form': '?',
  'lemma': '11',
  'partofspeech': 'N',
  'root': '11',
  'root_tokens': ['11']}]

In [8]:
analyze_number('11iks')

[{'clitic': '',
  'ending': 'iks',
  'form': 'pl tr',
  'lemma': '11',
  'partofspeech': 'N',
  'root': '11',
  'root_tokens': ['11']},
 {'clitic': '',
  'ending': 'iks',
  'form': 'pl tr',
  'lemma': '11',
  'partofspeech': 'O',
  'root': '11',
  'root_tokens': ['11']}]