In [1]:
from pandas import DataFrame
from pandas import read_csv
from estnltk.vabamorf.morf import synthesize, analyze
from collections import defaultdict, Counter
import re
from IPython.display import clear_output, display

# Postmorph analysis for numeric tokens

## Create a rules file for analyzing numeric tokens

In [2]:
number_words = {
                '0$':                                  {'words': ('null', 'nullis'),
                                                        'roots': ('null',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))1$':      {'words': ('üks', 'esimene'),
                                                        'roots': ('üh', 'esime')},
                '(|[2-9]|([1-9][0-9]*[02-9]))2$':      {'words': ('kaks', 'teine'),
                                                        'roots': ('kah', 'tei')},
                '(|[2-9]|([1-9][0-9]*[02-9]))3$':      {'words': ('kolm', 'kolmas'),
                                                        'roots': ('kolm',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))4$':      {'words': ('neli', 'neljas'),
                                                        'roots': ('nel',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))5$':      {'words': ('viis', 'viies'),
                                                        'roots': ('vii',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))6$':      {'words': ('kuus', 'kuues'),
                                                        'roots': ('kuu',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))7$':      {'words': ('seitse', 'seitsmes'),
                                                        'roots': ('seits',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))8$':      {'words': ('kaheksa', 'kaheksas'),
                                                        'roots': ('kaheks',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))9$':      {'words': ('üheksa', 'üheksas'),
                                                        'roots': ('üheks',)},
                '([1-9][0-9]*)?1[1-9]$':               {'words': ('kolmteist', 'kolmeteistkümnes'),
                                                        'roots': ('kolmeteistküm',)},
                '([1-9][0-9]*)?[1-9]0$':               {'words': ('kolmkümmend', 'kolmekümnes'),
                                                        'roots': ('kolmeküm',)},
                '([1-9][0-9]*)?[1-9]00$':              {'words': ('sada', 'sajas'),
                                                        'roots': ('sad', 'sa')},
                '([1-9][0-9]*)?[1-9]0{3,5}$':          {'words': ('tuhat', 'tuhandes'),
                                                        'roots': ('tuha',)},
                '([1-9][0-9]*)?[1-9]0{6,8}(0{6}0*)?$': {'words': ('miljon', 'miljones'),
                                                        'roots': ('miljon',)},
                '([1-9][0-9]*)?[1-9]0{9,11}$':         {'words': ('miljard', 'miljardes'),
                                                        'roots': ('miljard',)}
               }

ordinal_number_words = {
                '0\.$':                                  {'words': ('nullis',),
                                                          'roots': ('null',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))1\.$':      {'words': ('esimene',),
                                                          'roots': ('esime',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))2\.$':      {'words': ('teine',),
                                                          'roots': ('tei',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))3\.$':      {'words': ('kolmas',),
                                                          'roots': ('kolma',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))4\.$':      {'words': ('neljas',),
                                                          'roots': ('nel',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))5\.$':      {'words': ('viies',),
                                                          'roots': ('vii',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))6\.$':      {'words': ('kuues',),
                                                          'roots': ('kuu',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))7\.$':      {'words': ('seitsmes',),
                                                          'roots': ('seits',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))8\.$':      {'words': ('kaheksas',),
                                                          'roots': ('kaheks',)},
                '(|[2-9]|([1-9][0-9]*[02-9]))9\.$':      {'words': ('üheksas',),
                                                          'roots': ('üheks',)},
                '([1-9][0-9]*)?1[1-9]\.$':               {'words': ('kolmeteistkümnes',),
                                                          'roots': ('kolmeteistküm',)},
                '([1-9][0-9]*)?[1-9]0\.$':               {'words': ('kolmekümnes',),
                                                          'roots': ('kolmeküm',)},
                '([1-9][0-9]*)?[1-9]00\.$':              {'words': ('sajas',),
                                                          'roots': ('saja',)},
                '([1-9][0-9]*)?[1-9]0{3,5}\.$':          {'words': ('tuhandes',),
                                                          'roots': ('tuhan',)},
                '([1-9][0-9]*)?[1-9]0{6,8}(0{6}0*)?\.$': {'words': ('miljones',),
                                                          'roots': ('miljon',)},
                '([1-9][0-9]*)?[1-9]0{9,11}\.$':         {'words': ('miljardes',),
                                                          'roots': ('miljard',)}
               }



forms =  ['sg n', 'pl n', 'sg g', 'pl g', 'sg p', 'pl p', 'sg ill', 'pl ill', 'adt', 'sg in', 'pl in',
          'sg el', 'pl el', 'sg all', 'pl all', 'sg ad', 'pl ad', 'sg abl', 'pl abl', 'sg tr',
          'pl tr', 'sg ter', 'pl ter', 'sg es', 'pl es', 'sg ab', 'pl ab', 'sg kom', 'pl kom']

# all numbers
for number, data in number_words.items():
    data['analyses'] = {}
    for form in forms:
        for number_word in data['words']:
            for synt in synthesize(number_word, form):
                analysis = analyze([synt], disambiguate=False, guess=False, propername=False)[0]['analysis']
                for a in analysis:
                    if a['partofspeech'] in {'N', 'O'}:
                        data['analyses'].setdefault(synt, set()).add((a['partofspeech'], a['form'], a['ending']))

for number, data in number_words.items():
    data['suffixes'] = {}
    for word_form in data['analyses']:
        for root in data['roots']:
            ending = word_form.partition(root)[-1]
            if ending:
                for i in range(len(ending)):
                    data['suffixes'][ending[i:]] = set()
                break
    for ending, analyses in data['suffixes'].items():
        for word_form, a in data['analyses'].items():
            if word_form.endswith(ending):
                for pos, form, suffix in a:
                    if pos!='N' or form!='sg n':
                        analyses.add((pos, form, suffix))
    data['suffixes'][''] = {('N', '?', '0')}

table = []
for number, data in number_words.items():
    for ending, analyses in data['suffixes'].items():
        for pos, form, normal_ending in analyses:
            table.append({
                        'number': number,
                        'suffix': ending,
                        'pos': pos,
                        'form': form,
                        'ending': normal_ending
            })

# ordinal numbers only
for number, data in ordinal_number_words.items():
    data['analyses'] = {}
    for form in forms:
        for number_word in data['words']:
            for synt in synthesize(number_word, form):
                analysis = analyze([synt], disambiguate=False, guess=False, propername=False)[0]['analysis']
                for a in analysis:
                    if a['partofspeech'] in {'O'}:
                        data['analyses'].setdefault(synt, set()).add((a['partofspeech'], a['form'], a['ending']))

for number, data in ordinal_number_words.items():
    data['suffixes'] = {}
    for word_form in data['analyses']:
        for root in data['roots']:
            ending = word_form.partition(root)[-1]
            if ending:
                for i in range(len(ending)):
                    data['suffixes'][ending[i:]] = set()
                break
    for ending, analyses in data['suffixes'].items():
        for word_form, a in data['analyses'].items():
            if word_form.endswith(ending):
                for pos, form, suffix in a:
                    if pos!='O' or form!='sg n':
                        analyses.add((pos, form, suffix))
    data['suffixes'][''] = {('O', '?', '0')}


for number, data in ordinal_number_words.items():
    for ending, analyses in data['suffixes'].items():
        for pos, form, normal_ending in analyses:
            table.append({
                        'number': number,
                        'suffix': ending,
                        'pos': pos,
                        'form': form,
                        'ending': normal_ending
            })
            
            
df = DataFrame.from_records(table, columns=['number', 'suffix', 'pos', 'form', 'ending'])
df = df.sort_values(['number', 'pos', 'form', 'ending'])

df.to_csv('results/number_analysis_rules.csv', index=False)

print(len(df), 'lines')
df[:10]

9744 lines


Unnamed: 0,number,suffix,pos,form,ending
4838,([1-9][0-9]*)?1[1-9]$,,N,?,0
4959,([1-9][0-9]*)?1[1-9]$,teta,N,pl ab,teta
4999,([1-9][0-9]*)?1[1-9]$,eta,N,pl ab,teta
5041,([1-9][0-9]*)?1[1-9]$,ta,N,pl ab,teta
5087,([1-9][0-9]*)?1[1-9]$,neteta,N,pl ab,teta
5112,([1-9][0-9]*)?1[1-9]$,eteta,N,pl ab,teta
5151,([1-9][0-9]*)?1[1-9]$,a,N,pl ab,teta
4916,([1-9][0-9]*)?1[1-9]$,ilt,N,pl abl,ilt
4981,([1-9][0-9]*)?1[1-9]$,lt,N,pl abl,ilt
5039,([1-9][0-9]*)?1[1-9]$,neilt,N,pl abl,ilt


In the rules file [results/number_analysis_rules.csv](results/number_analysis_rules.csv) **pos**, **form** and **ending** cells contain the morphological analysis for the tokens that match **number** with **suffix**.

## Load the rules from the file

In [3]:
def load_number_analysis_rules(file):
    df = read_csv(file, na_filter=False)
    rules = defaultdict(dict)
    for _, r in df.iterrows():
        if r.suffix not in rules[r.number]:
            rules[r.number][r.suffix] = []
        rules[r.number][r.suffix].append({'partofspeech': r.pos, 'form': r.form, 'ending':r.ending})
    return rules

rules = load_number_analysis_rules('results/number_analysis_rules.csv')

## From the koondkorpus tokens find examples and support for the rules

In [4]:
token_count = 0
not_alpha_count = 0

numeric = re.compile('-?(\d+\.?)-?(\D*)$')
examples_good = {}
examples_good_support = Counter()
examples_bad = {}
with open('../temp/wordlist', 'r', encoding='utf_8') as in_f:
    for i, line in enumerate(in_f):
        token_count += 1
        token = line.strip()
        if not token.isalpha():
            not_alpha_count += 1
            m = numeric.match(token)
            if m:
                good_token = False
                number = m.group(1) 
                suffix = m.group(2)
                for number_re in rules:
                    if re.match(number_re, number):
                        if suffix in rules[number_re]:
                            good_token = True
                            examples_good[(number_re, suffix)] = token
                            examples_good_support[(number_re, suffix)] += 1
                if not good_token:
                    examples_bad[(number, suffix)] = token
        if i % 10000 == 0:
            clear_output()
            display('{} {}'.format(i, token))
                
clear_output()

print(len(examples_good), 'good examples')
print(len(examples_bad),  'numeric examples not covered by the rules')
print('token_count:', token_count)
print('not_alpha_count:', not_alpha_count)

df = read_csv('results/number_analysis_rules.csv', na_filter=False)
example = []
support = []
for _, r in df.iterrows():
    example.append(examples_good.get((r.number, r.suffix), ''))
    support.append(examples_good_support.get((r.number, r.suffix), ''))
df['example'] = example
df['support'] = support

df.to_csv('results/number_analysis_rules_with_examples.csv', index=False)
df[:10]

830 good examples
45995 numeric examples not covered by the rules
token_count: 5000215
not_alpha_count: 732035


Unnamed: 0,number,suffix,pos,form,ending,example,support
0,([1-9][0-9]*)?1[1-9]$,,N,?,0,9918,2016.0
1,([1-9][0-9]*)?1[1-9]$,teta,N,pl ab,teta,,
2,([1-9][0-9]*)?1[1-9]$,eta,N,pl ab,teta,,
3,([1-9][0-9]*)?1[1-9]$,ta,N,pl ab,teta,,
4,([1-9][0-9]*)?1[1-9]$,neteta,N,pl ab,teta,,
5,([1-9][0-9]*)?1[1-9]$,eteta,N,pl ab,teta,,
6,([1-9][0-9]*)?1[1-9]$,a,N,pl ab,teta,912a,49.0
7,([1-9][0-9]*)?1[1-9]$,ilt,N,pl abl,ilt,,
8,([1-9][0-9]*)?1[1-9]$,lt,N,pl abl,ilt,8016-lt,41.0
9,([1-9][0-9]*)?1[1-9]$,neilt,N,pl abl,ilt,,


See [results/number_analysis_rules_with_examples.csv](results/number_analysis_rules_with_examples.csv) for all examples

## Some examples of the tokens that contain numbers but are not covered by the rules

In [5]:
from random import random
for k, v in examples_bad.items():
    if (random() < 0.001):
        print(v, k)

980-eurose ('980', 'eurose')
153hektarisel ('153', 'hektarisel')
25protsendiga ('25', 'protsendiga')
-273c ('273', 'c')
91aastane ('91', 'aastane')
300-punktilisele ('300', 'punktilisele')
30-klapilist ('30', 'klapilist')
80kiloste ('80', 'kiloste')
82cm ('82', 'cm')
15minutilises ('15', 'minutilises')
1800-mehelise ('1800', 'mehelise')
13kraadine ('13', 'kraadine')
206cc ('206', 'cc')
12-dollarilist ('12', 'dollarilist')
165-kroonine ('165', 'kroonine')
121-meetrise ('121', 'meetrise')
9sentimeetrine ('9', 'sentimeetrine')
607sekundit ('607', 'sekundit')
30-liitrilise ('30', 'liitrilise')
113jj ('113', 'jj')
0-baasiline ('0', 'baasiline')
65meetrine ('65', 'meetrine')
71-kroonine ('71', 'kroonine')
90Sr ('90', 'Sr')
97protsendiline ('97', 'protsendiline')
18-mehelised ('18', 'mehelised')
716-k ('716', 'k')
2Quickstarti ('2', 'Quickstarti')
1-MCP-l ('1', 'MCP-l')
3A ('3', 'A')
6nnetu ('6', 'nnetu')
50naelsterlingise ('50', 'naelsterlingise')
6nnest ('6', 'nnest')
051-le ('051', 'le')
0

## Analyze numeric tokens

In [6]:
import re
def analyze_number(token):
    m = re.match('-?(\d+\.?)-?(\D*)$', token)
    if not m:
        return []
    m.group(0), 
    number = m.group(1)
    ordinal_number = number.rstrip('.') + '.'
    ending = m.group(2)
    result = []
    for number_re, analyses in rules.items():
        if re.match(number_re, number):
            for analysis in analyses[ending]:
                if analysis['partofspeech'] == 'O':
                    a = {'lemma':ordinal_number, 'root':ordinal_number, 'root_tokens':[ordinal_number], 'clitic':''}
                else:
                    a = {'lemma':number, 'root':number, 'root_tokens':[number], 'clitic':''}
                a.update(analysis)
                result.append(a)
            break
    return result

analyze_number('11nes')

[{'clitic': '',
  'ending': 's',
  'form': 'sg in',
  'lemma': '11',
  'partofspeech': 'N',
  'root': '11',
  'root_tokens': ['11']},
 {'clitic': '',
  'ending': '0',
  'form': 'sg n',
  'lemma': '11.',
  'partofspeech': 'O',
  'root': '11.',
  'root_tokens': ['11.']}]

In [7]:
analyze_number('11')

[{'clitic': '',
  'ending': '0',
  'form': '?',
  'lemma': '11',
  'partofspeech': 'N',
  'root': '11',
  'root_tokens': ['11']}]

In [8]:
analyze_number('11iks')

[{'clitic': '',
  'ending': 'iks',
  'form': 'pl tr',
  'lemma': '11',
  'partofspeech': 'N',
  'root': '11',
  'root_tokens': ['11']},
 {'clitic': '',
  'ending': 'iks',
  'form': 'pl tr',
  'lemma': '11.',
  'partofspeech': 'O',
  'root': '11.',
  'root_tokens': ['11.']}]

# Roman numerals
## From the koondkorpus tokens find examples of Roman numerals

In [9]:
upper_case_roman_numeral_match = re.compile('(M*(CM)?(C?D)?C{0,3}(XC)?(X?L)?X{0,3}(IX)?(I?V)?I{0,3})(-(.*))?$').match
lower_case_roman_numeral_match = re.compile('(m*(cm)?(c?d)?c{0,3}(xc)?(x?l)?x{0,3}(ix)?(i?v)?i{0,3})(-(.*))?$').match

numbers = []
simple_numbers = []
with open('../temp/wordlist', 'r', encoding='utf_8') as in_f:
    for i, line in enumerate(in_f):
        token = line.strip()

        m = upper_case_roman_numeral_match(token) or lower_case_roman_numeral_match(token)
        if m:
            number = m.group(1)
            suffix = m.group(9)
            if number:
                if not suffix or suffix.isalpha() and (suffix.islower() or suffix.isupper()):
                    numbers.append(token)
                if token.isalpha():
                    simple_numbers.append(token)

## Simple Roman numerals in the koondkorpus

In [12]:
print(len(simple_numbers), 'tokens')
', '.join(simple_numbers)

288 tokens


'C, CC, CCC, CCCL, CCCXV, CCI, CCV, CCX, CCXII, CD, CDC, CDI, CDL, CDLXXXVIII, CDV, CDX, CI, CII, CIII, CIV, CIX, CL, CLI, CLII, CLV, CLX, CLXIV, CLXXXIX, CM, CMC, CMCI, CMD, CMI, CML, CMV, CMX, CV, CVII, CX, CXC, CXCV, CXII, CXIX, CXL, CXX, CXXIX, CXXVII, CXXX, D, DC, DCC, DCI, DCLXVI, DCX, DI, DII, DIII, DIV, DIX, DIXI, DL, DLV, DLX, DV, DVI, DX, DXC, I, II, III, IV, IVI, IX, IXI, IXIV, L, LI, LII, LIV, LIX, LV, LX, LXI, LXX, LXXIII, LXXVI, LXXX, LXXXIX, LXXXV, M, MC, MCC, MCD, MCI, MCL, MCM, MCMC, MCMXC, MCMXCII, MCMXCVI, MCMXCVII, MCMXIX, MCMXLIX, MCV, MCX, MD, MDC, MDCXXXI, MDCXXXII, MDI, MDL, MDV, MDX, MI, MII, MIV, MIX, MIXI, ML, MLI, MM, MMC, MMCCLXXXIX, MMD, MMI, MMII, MML, MMLV, MMM, MMMM, MMMMM, MMMMMM, MMMMMMMMMM, MMMMMMMMMMM, MMMMMMMMMMMMMMMM, MMX, MMXI, MV, MX, V, VI, VII, VIII, X, XC, XCVIII, XI, XII, XIII, XIV, XIX, XL, XLI, XLII, XLIII, XLVI, XV, XVI, XVII, XVIII, XX, XXI, XXII, XXIII, XXIV, XXIX, XXV, XXVI, XXVII, XXVIII, XXX, XXXI, XXXII, XXXIII, XXXIV, XXXIX, XXXV, 

## Roman numerals with all sorts of suffixes

In [14]:
print(len(numbers), 'tokens. \nSome examples:')
', '.join(numbers[::50])

5774 tokens. 
Some examples:


'C, C-alagruppi, C-grupp, C-kategooriat, C-korvist, C-näitaja, C-samba, C-terminali, C-viirusele, CC-, CD-WR, CD-draivid, CD-keskusega, CD-kopyst, CD-makke, CD-plaadidele, CD-pragu, CD-salvestuse, CD-toorik, CD-ümbristesse, CMD-J, CXII, D-d, D-keelekategooria, D-nelikus, D-suunal, D-vitamiinivarude, DL-määramismeetodite, I, I-grupi, I-punktina, II-e, III-l, IV-st, L-e, L-kujuliselt, L-valiin, M-F, M-id, M-lle, M-päevas, M-viirusega, MD-ga, ML-SS, MM-alagruppide, MM-esitusi, MM-finaalis, MM-hooaeg, MM-kalender, MM-koha, MM-krossidel, MM-linna, MM-medali, MM-nimetusest, MM-pretendendi, MM-pääsunormi, MM-romaanis, MM-sõidust, MM-tiitlil, MM-tsüklile, MM-valikkohtumistes, MM-valikturniirist, MM-võistlusteks, MMM-s, MX, V-ga, V-linkide, V-playeriga, V-universaal, VII-VIII, X-TABEL, X-failid, X-kastiga, X-lehe, X-pressi, X-tabelis, X-windows, XIII-t, XVI-XIX, XXX-large, c-kvark, cd-boxid, cd-kirjutajasse, cd-plaatide, cd-seadmetega, cm-, d-ebüütkogu, d-ta, dl-le, i-bändide, i-hääletust, i-ki