! (web page residue, kept as a comment so the file stays valid lexc)
! Permalink
! Fetching contributors…
! Cannot retrieve contributors at this time
! 697 lines (578 sloc) 14.4 KB
! number.lexc
!
! Released under the terms of the MIT License
! Copyright (c) 2011-2015 Çağrı Çöltekin <cagri@coltekin.net>
!
! Permission is hereby granted, free of charge, to any person obtaining a
! copy of this software and associated documentation files (the "Software"),
! to deal in the Software without restriction, including without limitation
! the rights to use, copy, modify, merge, publish, distribute, sublicense,
! and/or sell copies of the Software, and to permit persons to whom the
! Software is furnished to do so, subject to the following conditions:
!
! The above copyright notice and this permission notice shall be included in
! all copies or substantial portions of the Software.
!
! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
! DEALINGS IN THE SOFTWARE.
!
! Here, we define both numbers written as text and digits, the
! digits part includes support for both Arabic and Roman numerals.
! The definition also covers common number groupings.
! At the moment, there is some rudimentary support for recognizing
! date and time expressions. More, e.g. phone numbers, may be included
! in the future.
!
! The definition here is rather ugly with a lot of replication. There
! should be a cleaner solution, but it is not obvious (to me) with
! lexc/xfst.
! - If done using xfst, it can certainly be a lot neater, but then
! suffixation (e.g., derivations to other word classes) needs to
! be replicated in xfst.
! - We (can) use xfst syntax in lexc below (in <>), but this does not
! allow re-use of automata defined earlier in other places (in
! other brackets or in an external xfst file)
!
! Ugly as it is, the following works; one needs to be careful to
! modify all replicated parts when modifications are needed.
!
! Entry point for numbers: branches into the sub-lexicons for
! spelled-out numbers, digit strings, Roman numerals, date/time
! expressions and fractions.
LEXICON Number
  NumberAlpha;
  NumberNumeric;
  NumberRoman;
  NumDateTime;
  NumFraction;
! Numbers written as text. One continuation per magnitude class;
! each class is listed exactly once (a duplicated `n0;' entry was
! removed -- it added nothing to the accepted language).
LEXICON NumberAlpha
  n0;
  n1;
  n10;
  n100;
  n1k;
  n1m;
  n1g;
  n1t;
  n1K;
! more ??
! Words that go straight to NumStem without combining with the
! other number words in this file.
LEXICON n0
  sıfır     NumStem;
! half & quarter terms are squeezed in here as well
  yarım     NumStem;
  buçuk     NumStem;
  çeyrek    NumStem;
  tek       NumStem;
LEXICON n1 ! 1 to 9
  bir@dB      n1cont;
  iki@dB      n1cont;
  üç@dB       n1cont;
  dör^t@dB    n1cont;
  beş@dB      n1cont;
  altı@dB     n1cont;
  yedi@dB     n1cont;
  sekiz@dB    n1cont;
  dokuz@dB    n1cont;

! After a unit word the number is complete: continue to NumStem or
! to NumHalf (both defined outside this section).
LEXICON n1cont
  NumStem;
  NumHalf;
LEXICON n10 ! 10 to 99: a tens word, optionally followed by a unit
  on@dB       n10cont;
  yirmi@dB    n10cont;
  otuz@dB     n10cont;
  kırk@dB     n10cont;
  elli@dB     n10cont;
  altmış@dB   n10cont;
  yetmiş@dB   n10cont;
  seksen@dB   n10cont;
  doksan@dB   n10cont;

! After a tens word: stop here (NumStem / NumHalf) or add a unit (n1).
LEXICON n10cont
  NumStem;
  NumHalf;
  n1;
LEXICON n100 ! 100 to 999
  prefixN100;   ! iki..dokuz in front of `yüz'
  N100;         ! bare `yüz'

! Multipliers of `yüz'; `bir' is not listed here.
LEXICON prefixN100
  iki@dB      N100;
  üç@dB       N100;
  dör^t@dB    N100;
  beş@dB      N100;
  altı@dB     N100;
  yedi@dB     N100;
  sekiz@dB    N100;
  dokuz@dB    N100;

LEXICON N100
  yüz@dB      n100cont;

! After `yüz': stop here, or continue with tens and/or units.
LEXICON n100cont
  NumStem;
  NumHalf;
  n10;
  n1;
!--------------------
LEXICON n1k ! 1000 to 999,999
  prefixN1k;    ! a multiplier followed by `bin'
  N1k;          ! bare `bin'

! Multipliers allowed directly in front of `bin'. The unit words come
! from prefixN1k2 (iki..dokuz), so `bir' is not offered at this point.
LEXICON prefixN1k
  prefixN1k2;
  prefixN1k10;
  prefixN1k100;

! Unit multipliers including `bir'; referenced from N1k10cont.
LEXICON prefixN1k1
  bir@dB      N1k2cont;
  prefixN1k2;

LEXICON prefixN1k2
  iki@dB      N1k2cont;
  üç@dB       N1k2cont;
  dör^t@dB    N1k2cont;
  beş@dB      N1k2cont;
  altı@dB     N1k2cont;
  yedi@dB     N1k2cont;
  sekiz@dB    N1k2cont;
  dokuz@dB    N1k2cont;

! A unit multiplier must be followed by `bin'.
LEXICON N1k2cont
  N1k;

LEXICON prefixN1k10
  on@dB       N1k10cont;
  yirmi@dB    N1k10cont;
  otuz@dB     N1k10cont;
  kırk@dB     N1k10cont;
  elli@dB     N1k10cont;
  altmış@dB   N1k10cont;
  yetmiş@dB   N1k10cont;
  seksen@dB   N1k10cont;
  doksan@dB   N1k10cont;

! After a tens multiplier: `bin' right away, or a unit word first.
LEXICON N1k10cont
  N1k;
  prefixN1k1;
! Hundreds part of a multiplier of `bin': an optional iki..dokuz,
! then `yüz', then optionally tens and/or units, and finally `bin'.
LEXICON prefixN1k100
  N1k100pre;
  N1k100;

LEXICON N1k100pre
  iki@dB      N1k100preCont;
  üç@dB       N1k100preCont;
  dör^t@dB    N1k100preCont;
  beş@dB      N1k100preCont;
  altı@dB     N1k100preCont;
  yedi@dB     N1k100preCont;
  sekiz@dB    N1k100preCont;
  dokuz@dB    N1k100preCont;

LEXICON N1k100preCont
  N1k100;

LEXICON N1k100
  yüz@dB      N1k100Cont;

! After `yüz': `bin' directly, a tens word, or a unit word.
! The unit continuation was missing, so forms such as `yüz beş bin'
! (105,000) could not be reached; n100cont allows the same shape
! (hundreds + units) for the small numbers.
LEXICON N1k100Cont
  N1k;
  prefixN1k10;
  prefixN1k1;
! The word `bin' itself, followed by whatever may come after it.
LEXICON N1k
  bin@dB      n1kCont;

! After `bin': stop here, or continue with any lower magnitude.
LEXICON n1kCont
  NumStem;
  NumHalf;
  n100;
  n10;
  n1;
!--------------------
LEXICON n1m ! 1,000,000 to 999,999,999
  prefixN1m;      ! a multiplier followed by `milyon'
  N1mNoPrefix;    ! bare `milyon'

! Multipliers allowed directly in front of `milyon'.
LEXICON prefixN1m
  prefixN1m1;
  prefixN1m10;
  prefixN1m100;

LEXICON prefixN1m1
  bir@dB      N1m1cont;
  iki@dB      N1m1cont;
  üç@dB       N1m1cont;
  dör^t@dB    N1m1cont;
  beş@dB      N1m1cont;
  altı@dB     N1m1cont;
  yedi@dB     N1m1cont;
  sekiz@dB    N1m1cont;
  dokuz@dB    N1m1cont;

! A unit multiplier must be followed by `milyon'.
LEXICON N1m1cont
  N1m;

LEXICON prefixN1m10
  on@dB       N1m10cont;
  yirmi@dB    N1m10cont;
  otuz@dB     N1m10cont;
  kırk@dB     N1m10cont;
  elli@dB     N1m10cont;
  altmış@dB   N1m10cont;
  yetmiş@dB   N1m10cont;
  seksen@dB   N1m10cont;
  doksan@dB   N1m10cont;

! After a tens multiplier: `milyon' right away, or a unit word first.
LEXICON N1m10cont
  N1m;
  prefixN1m1;
! Hundreds part of a multiplier of `milyon': an optional iki..dokuz,
! then `yüz', then optionally tens and/or units, and finally `milyon'.
LEXICON prefixN1m100
  N1m100pre;
  N1m100;

LEXICON N1m100pre
  iki@dB      N1m100preCont;
  üç@dB       N1m100preCont;
  dör^t@dB    N1m100preCont;
  beş@dB      N1m100preCont;
  altı@dB     N1m100preCont;
  yedi@dB     N1m100preCont;
  sekiz@dB    N1m100preCont;
  dokuz@dB    N1m100preCont;

LEXICON N1m100preCont
  N1m100;

LEXICON N1m100
  yüz@dB      N1m100Cont;

! After `yüz': `milyon' directly, a tens word, or a unit word.
! The unit continuation was missing, so forms such as
! `yüz beş milyon' (105,000,000) could not be reached.
LEXICON N1m100Cont
  N1m;
  prefixN1m10;
  prefixN1m1;
! `milyon' after a multiplier; lower magnitudes may follow.
LEXICON N1m
  milyon@dB   n1mCont;

! Bare `milyon' (no multiplier) goes straight to NumStem.
LEXICON N1mNoPrefix
  milyon@dB   NumStem;

! After `milyon': stop here, or continue with any lower magnitude.
LEXICON n1mCont
  NumStem;
  NumHalf;
  n1k;
  n100;
  n10;
  n1;
!------------------
LEXICON n1g ! 1,000,000,000 to 999,999,999,999
  prefixN1g;      ! a multiplier followed by `milyar'
  N1gNoPrefix;    ! bare `milyar'

! Multipliers allowed directly in front of `milyar'.
LEXICON prefixN1g
  prefixN1g1;
  prefixN1g10;
  prefixN1g100;

LEXICON prefixN1g1
  bir@dB      N1g1cont;
  iki@dB      N1g1cont;
  üç@dB       N1g1cont;
  dör^t@dB    N1g1cont;
  beş@dB      N1g1cont;
  altı@dB     N1g1cont;
  yedi@dB     N1g1cont;
  sekiz@dB    N1g1cont;
  dokuz@dB    N1g1cont;

! A unit multiplier must be followed by `milyar'.
LEXICON N1g1cont
  N1g;

LEXICON prefixN1g10
  on@dB       N1g10cont;
  yirmi@dB    N1g10cont;
  otuz@dB     N1g10cont;
  kırk@dB     N1g10cont;
  elli@dB     N1g10cont;
  altmış@dB   N1g10cont;
  yetmiş@dB   N1g10cont;
  seksen@dB   N1g10cont;
  doksan@dB   N1g10cont;

! After a tens multiplier: `milyar' right away, or a unit word first.
LEXICON N1g10cont
  N1g;
  prefixN1g1;
! Hundreds part of a multiplier of `milyar': an optional iki..dokuz,
! then `yüz', then optionally tens and/or units, and finally `milyar'.
LEXICON prefixN1g100
  N1g100pre;
  N1g100;

LEXICON N1g100pre
  iki@dB      N1g100preCont;
  üç@dB       N1g100preCont;
  dör^t@dB    N1g100preCont;
  beş@dB      N1g100preCont;
  altı@dB     N1g100preCont;
  yedi@dB     N1g100preCont;
  sekiz@dB    N1g100preCont;
  dokuz@dB    N1g100preCont;

LEXICON N1g100preCont
  N1g100;

LEXICON N1g100
  yüz@dB      N1g100Cont;

! After `yüz': `milyar' directly, a tens word, or a unit word.
! The unit continuation was missing, so forms such as
! `yüz beş milyar' could not be reached.
LEXICON N1g100Cont
  N1g;
  prefixN1g10;
  prefixN1g1;
! `milyar' after a multiplier; lower magnitudes may follow.
LEXICON N1g
  milyar@dB   n1gCont;

! Bare `milyar' (no multiplier) goes straight to NumStem.
LEXICON N1gNoPrefix
  milyar@dB   NumStem;

! After `milyar': stop here, or continue with any lower magnitude.
LEXICON n1gCont
  NumStem;
  NumHalf;
  n1m;
  n1k;
  n100;
  n10;
  n1;
!------------------
LEXICON n1t ! 1,000,000,000,000 to 999,999,999,999,999
  prefixN1t;      ! a multiplier followed by `trilyon'
  N1tNoPrefix;    ! bare `trilyon'

! Multipliers allowed directly in front of `trilyon'.
LEXICON prefixN1t
  prefixN1t1;
  prefixN1t10;
  prefixN1t100;

LEXICON prefixN1t1
  bir@dB      N1t1cont;
  iki@dB      N1t1cont;
  üç@dB       N1t1cont;
  dör^t@dB    N1t1cont;
  beş@dB      N1t1cont;
  altı@dB     N1t1cont;
  yedi@dB     N1t1cont;
  sekiz@dB    N1t1cont;
  dokuz@dB    N1t1cont;

! A unit multiplier must be followed by `trilyon'.
LEXICON N1t1cont
  N1t;

LEXICON prefixN1t10
  on@dB       N1t10cont;
  yirmi@dB    N1t10cont;
  otuz@dB     N1t10cont;
  kırk@dB     N1t10cont;
  elli@dB     N1t10cont;
  altmış@dB   N1t10cont;
  yetmiş@dB   N1t10cont;
  seksen@dB   N1t10cont;
  doksan@dB   N1t10cont;

! After a tens multiplier: `trilyon' right away, or a unit word first.
LEXICON N1t10cont
  N1t;
  prefixN1t1;
! Hundreds part of a multiplier of `trilyon': an optional iki..dokuz,
! then `yüz', then optionally tens and/or units, and finally `trilyon'.
LEXICON prefixN1t100
  N1t100pre;
  N1t100;

LEXICON N1t100pre
  iki@dB      N1t100preCont;
  üç@dB       N1t100preCont;
  dör^t@dB    N1t100preCont;
  beş@dB      N1t100preCont;
  altı@dB     N1t100preCont;
  yedi@dB     N1t100preCont;
  sekiz@dB    N1t100preCont;
  dokuz@dB    N1t100preCont;

LEXICON N1t100preCont
  N1t100;

LEXICON N1t100
  yüz@dB      N1t100Cont;

! After `yüz': `trilyon' directly, a tens word, or a unit word.
! The unit continuation was missing, so forms such as
! `yüz beş trilyon' could not be reached.
LEXICON N1t100Cont
  N1t;
  prefixN1t10;
  prefixN1t1;
! `trilyon' after a multiplier; lower magnitudes may follow.
LEXICON N1t
  trilyon@dB  n1tCont;

! Bare `trilyon' (no multiplier) goes straight to NumStem.
LEXICON N1tNoPrefix
  trilyon@dB  NumStem;

! After `trilyon': stop here, or continue with any lower magnitude.
LEXICON n1tCont
  NumStem;
  NumHalf;
  n1g;
  n1m;
  n1k;
  n100;
  n10;
  n1;
!------------------
LEXICON n1K ! 1,000,000,000,000,000
  prefixN1K;      ! a multiplier followed by `katrilyon'
  N1KNoPrefix;    ! bare `katrilyon'

! Multipliers allowed directly in front of `katrilyon'.
LEXICON prefixN1K
  prefixN1K1;
  prefixN1K10;
  prefixN1K100;

LEXICON prefixN1K1
  bir@dB      N1K1cont;
  iki@dB      N1K1cont;
  üç@dB       N1K1cont;
  dör^t@dB    N1K1cont;
  beş@dB      N1K1cont;
  altı@dB     N1K1cont;
  yedi@dB     N1K1cont;
  sekiz@dB    N1K1cont;
  dokuz@dB    N1K1cont;

! A unit multiplier must be followed by `katrilyon'.
LEXICON N1K1cont
  N1K;

LEXICON prefixN1K10
  on@dB       N1K10cont;
  yirmi@dB    N1K10cont;
  otuz@dB     N1K10cont;
  kırk@dB     N1K10cont;
  elli@dB     N1K10cont;
  altmış@dB   N1K10cont;
  yetmiş@dB   N1K10cont;
  seksen@dB   N1K10cont;
  doksan@dB   N1K10cont;

! After a tens multiplier: `katrilyon' right away, or a unit word first.
LEXICON N1K10cont
  N1K;
  prefixN1K1;
! Hundreds part of a multiplier of `katrilyon': an optional iki..dokuz,
! then `yüz', then optionally tens and/or units, and finally `katrilyon'.
LEXICON prefixN1K100
  N1K100pre;
  N1K100;

LEXICON N1K100pre
  iki@dB      N1K100preCont;
  üç@dB       N1K100preCont;
  dör^t@dB    N1K100preCont;
  beş@dB      N1K100preCont;
  altı@dB     N1K100preCont;
  yedi@dB     N1K100preCont;
  sekiz@dB    N1K100preCont;
  dokuz@dB    N1K100preCont;

LEXICON N1K100preCont
  N1K100;

LEXICON N1K100
  yüz@dB      N1K100Cont;

! After `yüz': `katrilyon' directly, a tens word, or a unit word.
! The unit continuation was missing, so forms such as
! `yüz beş katrilyon' could not be reached.
LEXICON N1K100Cont
  N1K;
  prefixN1K10;
  prefixN1K1;
! `katrilyon' after a multiplier; lower magnitudes may follow.
LEXICON N1K
  katrilyon@dB  n1KCont;

! Bare `katrilyon' (no multiplier) goes straight to NumStem.
LEXICON N1KNoPrefix
  katrilyon@dB  NumStem;

! After `katrilyon': stop here, or continue with any lower magnitude.
LEXICON n1KCont
  NumStem;
  NumHalf;
  n1t;
  n1g;
  n1m;
  n1k;
  n100;
  n10;
  n1;
! Numeric digits: we generate numbers of arbitrary length with
! optional thousand separator and possibly one decimal separator.
! Thousand separator is not allowed after decimal separator. We
! insert symbols @tSep and @dSep here and replace them using xfst. This
! makes it easy to (optionally) accept a number of different
! configurations.
!
! The analysis symbols for separators are the official ones (dot for
! thousand separator, comma for decimal)
!
! We have `silent' vowels and consonants here. They serve for the
! purposes of buffer deletion, vowel harmony and voicing assimilation.
! They are removed from both surface and analysis.
! Start of a number written with digits. A number may begin with a
! digit, be a single superscript/vulgar-fraction character, begin
! with the percent sign, or begin directly with the decimal separator.
LEXICON NumberNumeric
  <%0|1|2|3|4|5|6|7|8|9> FirstNum;
! `³' and `½' are separate alternatives. Written together as `³½'
! (no `|') they form one two-character symbol in the regex, so
! neither character was accepted on its own.
  <¹|²|³|½|¼|¾> NumericStem;
  NumPercent;
  dSep;
! State after the 1st digit of the integer part. While at most three
! digits have been read, a thousand separator is still possible.
LEXICON FirstNum
  < %0|1|2|3|4|5|6|7|8|9 >  SecondNum;
  tSep;
  dSep;
  NumericStem;

! State after the 2nd digit.
LEXICON SecondNum
  < %0|1|2|3|4|5|6|7|8|9 >  ThirdNum;
  tSep;
  dSep;
  NumericStem;

! State after the 3rd digit.
LEXICON ThirdNum
  < %0|1|2|3|4|5|6|7|8|9 >  FourthNum;
  tSep;
  dSep;
  NumericStem;

! Four or more digits without a separator: loops on itself for any
! further digits.
LEXICON FourthNum ! no more thousand separator
  < %0|1|2|3|4|5|6|7|8|9 >  FourthNum;
  dSep;
  NumericStem;
! Decimal separator: `,' on the analysis side, @dSep on the other side.
LEXICON dSep
  ,:@dSep  dSepCont;

! At least one digit must follow the decimal separator.
LEXICON dSepCont ! no more separators
  < %0|1|2|3|4|5|6|7|8|9 >  dSepContCont;

! After a decimal digit: read another digit, or finish the number.
LEXICON dSepContCont
  dSepCont;
  NumericStem;
! Thousand separator: `.' on the analysis side, @tSep on the other side.
LEXICON tSep ! now we have to have tSep after every 3 digits
  .:@tSep  tStep1;

! Exactly three digits must follow each thousand separator.
LEXICON tStep1
  < %0|1|2|3|4|5|6|7|8|9 >  tStep2;

LEXICON tStep2
  < %0|1|2|3|4|5|6|7|8|9 >  tStep3;

LEXICON tStep3
  < %0|1|2|3|4|5|6|7|8|9 >  tStep3cont;

! After a full group of three: another group, the decimal part, or
! the end of the number.
LEXICON tStep3cont
  tSep;
  dSep;
  NumericStem;
!
! One may want to be more restrictive with the percent sign,
! but we accept anything, which makes sense for things like %200.
! It may also be good for generation to restrict the suffixes attached
! after a percentage. On the other hand, for most cases, percent
! sign will be tokenized separately.
!
! Percent sign in front of a number: <perc> on the analysis side,
! `%' followed by @MB on the other side.
LEXICON NumPercent
  %<perc%>:%%@MB  NumPercentCont;

! The percent sign must be followed by a digit; from there on the
! ordinary digit lexicons take over.
LEXICON NumPercentCont
  < %0|1|2|3|4|5|6|7|8|9 >  FirstNum;
!
! Roman numerals
!
! also a bit ugly to do it here, but it is more convenient.
! We just try to recognize them, no trickery to convert them.
!
! We only recognize the modern notation, but we
! additionally accept IIII.
!
! Recognizer for Roman numerals; every entry continues to RomanStem.
! The four multi-line patterns below share one shape and differ only
! in the character set: ASCII uppercase, Unicode uppercase Roman
! numeral characters, ASCII lowercase, Unicode lowercase Roman numeral
! characters. Two final entries list the precomposed Unicode numerals.
LEXICON NumberRoman
< [M|D|C|L|X|V|I]+ .o. ! make sure there is at least one character
M* ! thousands
(C [D|M] | (D) C^{0,3}) ! hundreds
(X [L|C] | (L) X^{0,3}) ! tens
(I [V|X] | (V) I^{0,3} | I^4) > RomanStem;
! the following is exactly the same, but with the special Unicode
! Roman numeral characters (uppercase)
< [Ⅿ|Ⅾ|Ⅽ|Ⅼ|Ⅹ|Ⅴ|Ⅰ]+ .o.
Ⅿ*
(Ⅽ [Ⅾ|Ⅿ] | (Ⅾ) Ⅽ^{0,3})
(Ⅹ [Ⅼ|Ⅽ] | (Ⅼ) Ⅹ^{0,3})
(Ⅰ [Ⅴ|Ⅹ] | (Ⅴ) Ⅰ^{0,3} | Ⅰ^4) > RomanStem;
!
! and the lowercase versions, first with plain ASCII letters,
!
! TODO: maybe we should do this in morph-phon part
< [m|d|c|l|x|v|i]+ .o.
m*
(c [d|m] | (d) c^{0,3})
(x [l|c] | (l) x^{0,3})
(i [v|x] | (v) i^{0,3} | i^4) > RomanStem;
! the following is exactly the same, but with the special Unicode
! lowercase Roman numeral characters
< [ⅿ|ⅾ|ⅽ|ⅼ|ⅹ|ⅴ|ⅰ]+ .o.
ⅿ*
(ⅽ [ⅾ|ⅿ] | (ⅾ) ⅽ^{0,3})
(ⅹ [ⅼ|ⅽ] | (ⅼ) ⅹ^{0,3})
(ⅰ [ⅴ|ⅹ] | (ⅴ) ⅰ^{0,3} | ⅰ^4) > RomanStem;
! precomposed single-character Unicode numerals (1 to 12), uppercase
! and lowercase
< Ⅰ|Ⅱ|Ⅲ|Ⅳ|Ⅴ|Ⅵ|Ⅶ|Ⅷ|Ⅸ|Ⅹ|Ⅺ|Ⅻ > RomanStem;
< ⅰ|ⅱ|ⅲ|ⅳ|ⅴ|ⅵ|ⅶ|ⅷ|ⅸ|ⅹ|ⅺ|ⅻ > RomanStem;
!
! Dates and times.
!
! This matches some common date/time formats.
! 24-hour-like time formats with either `:' or `.' separators are
! accepted.
!
! Dates of (D)D/MM/YYYY format (also with `-' or `.' as
! separators) and YYYY-MM-DD are accepted. No month-first
! (USA style) dates (yet).
!
! Acceptors for common time and date notations.
LEXICON NumDateTime
! Time: H(H) sep MM, optionally followed by sep SS, where sep is
! `:' or `.'. The hour is one digit, or two digits starting with
! 0, 1 or 2. Hour 0 / 00 is now accepted (the first alternative used
! to require a non-zero digit, so `00:30' was rejected). Hours above
! 23 starting with 2 are still accepted, as before.
  < [[(%0) [%0|1|2|3|4|5|6|7|8|9]]|
     [[1|2][%0|1|2|3|4|5|6|7|8|9]] ]
    [%:|%.]
    [%0|1|2|3|4|5] [%0|1|2|3|4|5|6|7|8|9]
    ([%:|%.] [%0|1|2|3|4|5] [%0|1|2|3|4|5|6|7|8|9]) > TimeStem;
! Date, day first: (D)D sep (M)M sep Y{1,4}, where sep is `/', `-'
! or `.'. The two separators are not required to be the same.
  < [[(%0) [1|2|3|4|5|6|7|8|9]]|
     [[1|2] [%0|1|2|3|4|5|6|7|8|9]]|
     [3 [%0|1]]]
    [%/|%-|%.]
    [[(%0) [1|2|3|4|5|6|7|8|9]]|
     [1 [%0|1|2]] ]
    [%/|%-|%.]
    [%0|1|2|3|4|5|6|7|8|9]^{1,4} > DateStem;
! Date, year first: Y{1,4}-(M)M-(D)D, with `-' only.
  < [%0|1|2|3|4|5|6|7|8|9]^{1,4}
    %-
    [[(%0) [1|2|3|4|5|6|7|8|9]]|
     [1 [%0|1|2]] ]
    %-
    [[(%0) [1|2|3|4|5|6|7|8|9]]|
     [[1|2] [%0|1|2|3|4|5|6|7|8|9]]|
     [3 [%0|1]]] > DateStem;
! Trivial acceptor for fraction expressions like 1/4: one or more
! digits, a slash, and one or more digits. The values are not checked.
! (This comment used to start with `%', which is the lexc escape
! character rather than the comment marker `!'.)
LEXICON NumFraction
  < [%0|1|2|3|4|5|6|7|8|9]+
    %/
    [%0|1|2|3|4|5|6|7|8|9]+ > NumericStem;