# Postmorph analysis for numeric tokens

## Create a rules file for analyzing numeric tokens

In [1]:
from pandas import DataFrame
from estnltk.vabamorf.morf import synthesize, analyze
from collections import defaultdict

number_words = {
                '0$': {'words': ('null', 'nullis'),
                       'roots': ('null',)},
                '1$|[2-9]1$|([1-9][0-9]*[02-9])1$': {'words': ('üks', 'esimene'),
                       'roots': ('üh', 'esime')},
                '2$|[2-9]2$|([1-9][0-9]*[02-9])2$': {'words': ('kaks', 'teine'),
                       'roots': ('kah', 'teis')},
                '3$|[2-9]3$|([1-9][0-9]*[02-9])3$': {'words': ('kolm', 'kolmas'),
                       'roots': ('kolma', 'kolm')},
                '4$|[2-9]4$|([1-9][0-9]*[02-9])4$': {'words': ('neli', 'neljas'),
                       'roots': ('nel',)},
                '5$|[2-9]5$|([1-9][0-9]*[02-9])5$': {'words': ('viis', 'viies'),
                       'roots': ('vii',)},
                '6$|[2-9]6$|([1-9][0-9]*[02-9])6$': {'words': ('kuus', 'kuues'),
                       'roots': ('kuu',)},
                '7$|[2-9]7$|([1-9][0-9]*[02-9])7$': {'words': ('seitse', 'seitsmes'),
                       'roots': ('seits',)},
                '8$|[2-9]8$|([1-9][0-9]*[02-9])8$': {'words': ('kaheksa', 'kaheksas'),
                       'roots': ('kaheks',)},
                '9$|[2-9]9$|([1-9][0-9]*[02-9])9$': {'words': ('üheksa', 'üheksas'),
                       'roots': ('üheks',)},
                '([1-9][0-9]*)?1[1-9]$': {'words': ('kolmteist', 'kolmeteistkümnes'),
                       'roots': ('kolmeteistküm',)},
                '([1-9][0-9]*)?[1-9]0$': {'words': ('kolmkümmend', 'kolmekümnes'),
                       'roots': ('kolmeküm',)},
                '([1-9][0-9]*)?[1-9]00$': {'words': ('sada', 'sajas'),
                       'roots': ('sad', 'sa')},
                '([1-9][0-9]*)?[1-9]0{3,5}$': {'words': ('tuhat', 'tuhandes'),
                       'roots': ('tuhan','tuha')},
                '([1-9][0-9]*)?[1-9]0{6,8}(0{6}0*)?$': {'words': ('miljon', 'miljones'),
                                               'roots': ('miljon',)},
                '([1-9][0-9]*)?[1-9]0{9,11}$': {'words': ('miljard', 'miljardes'),
                       'roots': ('miljard',)}
               }

forms =  ['sg n', 'pl n', 'sg g', 'pl g', 'sg p', 'pl p', 'sg ill', 'pl ill', 'adt', 'sg in', 'pl in',
          'sg el', 'pl el', 'sg all', 'pl all', 'sg ad', 'pl ad', 'sg abl', 'pl abl', 'sg tr',
          'pl tr', 'sg ter', 'pl ter', 'sg es', 'pl es', 'sg ab', 'pl ab', 'sg kom', 'pl kom']

for number, data in number_words.items():
    data['analyses'] = {}
    for form in forms:
        for number_word in data['words']:
            for synt in synthesize(number_word, form):
                analysis = analyze([synt], disambiguate=False, guess=False, propername=False)[0]['analysis']
                for a in analysis:
                    if a['partofspeech'] in {'N', 'O'}:
                        data['analyses'].setdefault(synt, set()).add((a['partofspeech'], a['form'], a['ending']))

for number, data in number_words.items():
    data['suffixes'] = {}
    for word_form in data['analyses']:
        for root in data['roots']:
            ending = word_form.partition(root)[-1]
            if ending:
                for i in range(len(ending)):
                    data['suffixes'][ending[i:]] = set()
                break
    for ending, analyses in data['suffixes'].items():
        for word_form, a in data['analyses'].items():
            if word_form.endswith(ending):
                for pos, form, suffix in a:
                    if pos!='N' or form!='sg n':
                        analyses.add((pos, form, suffix))
    data['suffixes'][''] = {('N', '?', '0')}
    
    
table = []
for number, data in number_words.items():
    for ending, analyses in data['suffixes'].items():
        for pos, form, normal_ending in analyses:
            table.append({
                        'number': number,
                        'suffix': ending,
                        'pos': pos,
                        'form': form,
                        'ending': normal_ending
            })

df = DataFrame.from_records(table, columns=['number', 'suffix', 'pos', 'form', 'ending'])
df = df.sort_values(['number', 'pos', 'form', 'ending'])

df.to_csv('results/number_analysis_rules.csv', index=False)

print(len(df), 'lines')
df[:10]

5873 lines


Unnamed: 0,number,suffix,pos,form,ending
3564,([1-9][0-9]*)?1[1-9]$,,N,?,0
3569,([1-9][0-9]*)?1[1-9]$,ta,N,pl ab,teta
3603,([1-9][0-9]*)?1[1-9]$,eta,N,pl ab,teta
3674,([1-9][0-9]*)?1[1-9]$,eteta,N,pl ab,teta
3705,([1-9][0-9]*)?1[1-9]$,a,N,pl ab,teta
3820,([1-9][0-9]*)?1[1-9]$,teta,N,pl ab,teta
3834,([1-9][0-9]*)?1[1-9]$,neteta,N,pl ab,teta
3628,([1-9][0-9]*)?1[1-9]$,t,N,pl abl,ilt
3789,([1-9][0-9]*)?1[1-9]$,lt,N,pl abl,ilt
3974,([1-9][0-9]*)?1[1-9]$,neilt,N,pl abl,ilt


In the rules file [results/number_analysis_rules.csv](results/number_analysis_rules.csv) **pos**, **form** and **ending** cells contain the morphological analysis for the tokens that match **number** with **suffix**.

## Load the rules from the file

In [2]:
from pandas import read_csv

def load_number_analysis_rules(file):
    df = read_csv(file, na_filter=False)
    rules = defaultdict(dict)
    for _, r in df.iterrows():
        if r.suffix not in rules[r.number]:
            rules[r.number][r.suffix] = []
        rules[r.number][r.suffix].append({'partofspeech': r.pos, 'form': r.form, 'ending':r.ending})
    return rules

rules = load_number_analysis_rules('results/number_analysis_rules.csv')

## Analyze numeric tokens

In [3]:
import re
def analyze_number(token):
    m = re.match('-?(\d+)-?(\D*)', token)
    if not m:
        return []
    m.group(0), 
    number = m.group(1) 
    ending = m.group(2)
    result = []
    for number_re, analyses in rules.items():
        if re.match(number_re, number):
            for analysis in analyses[ending]:
                a = {'lemma':number, 'root':number, 'root_tokens':[number], 'clitic':''}
                a.update(analysis)
                result.append(a)
    return result

analyze_number('11eteta')

[{'clitic': '',
  'ending': 'teta',
  'form': 'pl ab',
  'lemma': '11',
  'partofspeech': 'N',
  'root': '11',
  'root_tokens': ['11']}]

In [4]:
analyze_number('1na')

[{'clitic': '',
  'ending': 'dena',
  'form': 'pl es',
  'lemma': '1',
  'partofspeech': 'N',
  'root': '1',
  'root_tokens': ['1']},
 {'clitic': '',
  'ending': 'na',
  'form': 'sg es',
  'lemma': '1',
  'partofspeech': 'N',
  'root': '1',
  'root_tokens': ['1']},
 {'clitic': '',
  'ending': 'tena',
  'form': 'pl es',
  'lemma': '1',
  'partofspeech': 'O',
  'root': '1',
  'root_tokens': ['1']},
 {'clitic': '',
  'ending': 'na',
  'form': 'sg es',
  'lemma': '1',
  'partofspeech': 'O',
  'root': '1',
  'root_tokens': ['1']}]

In [5]:
analyze_number('11iks')

[{'clitic': '',
  'ending': 'iks',
  'form': 'pl tr',
  'lemma': '11',
  'partofspeech': 'N',
  'root': '11',
  'root_tokens': ['11']},
 {'clitic': '',
  'ending': 'iks',
  'form': 'pl tr',
  'lemma': '11',
  'partofspeech': 'O',
  'root': '11',
  'root_tokens': ['11']}]