/
abbreviations.lexc
201 lines (160 loc) · 6.78 KB
/
abbreviations.lexc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
! Divvun & Giellatekno - open source grammars for Sámi and other languages
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament
! http://giellatekno.uit.no & http://divvun.no
!
! This program is free software; you can redistribute and/or modify
! this file under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version. The GNU General Public License
! is found at http://www.gnu.org/licenses/gpl.html. It is
! also available in the file $GTHOME/LICENSE.txt.
!
! Other licensing options are available upon request, please contact
! giellatekno@uit.no or feedback@divvun.no
! ===================================================
!! # Continuation lexicons for Finnish abbreviations
! ===================================================
!! ## Lexica for adding tags and periods
! ----------------------------------
!! ## The sublexica
!! ### Continuation lexicons for abbrs both with and without final period
!LEXICON ab-dot-noun-adj-trab !!= * __@CODE@__
!+N+Sem/Lang: ab-dot-noun-trab ;
!+A+Sem/Lang: ab-dot-adj-trab ;
! Noun abbreviations that may appear with or without a final period.
! Each entry lexicon below adds +ABBR and one Gram/... tag on the
! analysis side and then continues to ab-noun.
LEXICON ab-noun-itrab
  +ABBR+Gram/IAbbr:     ab-noun ;

LEXICON ab-noun-trab
  +ABBR+Gram/TAbbr:     ab-noun ;

LEXICON ab-noun-trnumab
  +ABBR+Gram/TNumAbbr:  ab-noun ;

! Two branches: the period-less path adds no tag, the path with a
! period is tagged +Use/NG.
LEXICON ab-noun !!= * **@CODE@**
            ab-nodot-noun ;
  +Use/NG:  ab-dot-noun   ;
! Adjective abbreviations that may appear with or without a final period.
! The entry lexicons add +ABBR and one Gram/... tag, then continue to ab-adj.
LEXICON ab-adj-itrab
  +ABBR+Gram/IAbbr:  ab-adj ;

LEXICON ab-adj-trab
  +ABBR+Gram/TAbbr:  ab-adj ;

! Period-less path is untagged; the path with a period is tagged +Use/NG.
LEXICON ab-adj !!= * **@CODE@**
            ab-nodot-adj ;
  +Use/NG:  ab-dot-adj   ;
! Adverb abbreviations that may appear with or without a final period.
! Only the IAbbr entry lexicon is active; the other variants are disabled.
LEXICON ab-adv-itrab
  +ABBR+Gram/IAbbr:  ab-adv ;

!LEXICON ab-adv-numnoab +ABBR+Gram/NoAbbr: ab-adv ;
!LEXICON ab-adv-trab +ABBR+Gram/TAbbr: ab-adv ;
!LEXICON ab-adv-trnumab +ABBR+Gram/TNumAbbr: ab-adv ;

! Period-less path is untagged; the path with a period is tagged +Use/NG.
LEXICON ab-adv !!= * **@CODE@**
            ab-nodot-adv ;
  +Use/NG:  ab-dot-adv   ;
! Numeral abbreviations that may appear with or without a final period.
LEXICON ab-num-itrab
  +ABBR+Gram/IAbbr:  ab-num ;

! Period-less path is untagged; the path with a period is tagged +Use/NG.
LEXICON ab-num !!= * **@CODE@**
            ab-nodot-num ;
  +Use/NG:  ab-dot-num   ;
!! ### Lexicons without final period
! Noun abbreviations written without a final period.
! The entry lexicons add +ABBR and one Gram/... tag, then continue to
! ab-nodot-noun.
LEXICON ab-nodot-noun-itrab
  +ABBR+Gram/IAbbr:     ab-nodot-noun ;

LEXICON ab-nodot-noun-trab
  +ABBR+Gram/TAbbr:     ab-nodot-noun ;

LEXICON ab-nodot-noun-trnumab
  +ABBR+Gram/TNumAbbr:  ab-nodot-noun ;

! Continues both to the attr/nom/gen set and to the oblique case endings.
LEXICON ab-nodot-noun !!= * **@CODE@** The bulk
  nodot-attrnomaccgen-infl ;
  nodot-oblique-infl       ;
! Adjective abbreviations written without a final period.
LEXICON ab-nodot-adj-itrab
  +ABBR+Gram/IAbbr:  ab-nodot-adj ;

! Only the attr/nom/gen set is offered; there is no oblique-case branch here.
LEXICON ab-nodot-adj !!= * **@CODE@**
  nodot-attrnomaccgen-infl ;
! Adverb abbreviations written without a final period.
LEXICON ab-nodot-adv-itrab
  +ABBR+Gram/IAbbr:     ab-nodot-adv ;

LEXICON ab-nodot-adv-trnumab
  +ABBR+Gram/TNumAbbr:  ab-nodot-adv ;

! Three ways out: end the word untagged, end it tagged +Attr, or continue
! to RHyph (not defined in this part of the file).
LEXICON ab-nodot-adv !!= * **@CODE@**
           # ;
  +Attr:   # ;
           RHyph ;
! Disabled case readings:
! +Sg+Nom: # ;
! +Sg+Acc: # ;
! +Sg+Gen: # ;
! Numeral abbreviations written without a final period: only the
! nom/gen set, with no +Attr reading.
LEXICON ab-nodot-num !!= * **@CODE@**
  nodot-nomaccgen-infl ;
!! ### Lexicons with final period
! Noun abbreviations that require a final period.
! The entry lexicons add +ABBR and one Gram/... tag, then continue to
! ab-dot-noun.
LEXICON ab-dot-noun-itrab
  +ABBR+Gram/IAbbr:     ab-dot-noun ;

!LEXICON ab-dot-noun-noab +ABBR+Gram/NoAbbr: ab-dot-noun ;

LEXICON ab-dot-noun-trab
  +ABBR+Gram/TAbbr:     ab-dot-noun ;

LEXICON ab-dot-noun-trnumab
  +ABBR+Gram/TNumAbbr:  ab-dot-noun ;

LEXICON ab-dot-noun !!= * **@CODE@** This is the lexicon for abbrs that must have a period.
  dot-attrnomaccgen-infl ;
! Adjective abbreviations that require a final period.
! The -noab and -trnumab variants are disabled.
LEXICON ab-dot-adj-itrab
  +ABBR+Gram/IAbbr:  ab-dot-adj ;

!LEXICON ab-dot-adj-noab +ABBR+Gram/NoAbbr: ab-dot-adj ;

LEXICON ab-dot-adj-trab
  +ABBR+Gram/TAbbr:  ab-dot-adj ;

!LEXICON ab-dot-adj-trnumab +ABBR+Gram/TNumAbbr: ab-dot-adj ;

LEXICON ab-dot-adj !!= * **@CODE@** This is the lexicon for abbrs that must have a period.
  dot-attrnomaccgen-infl ;
! Adverb abbreviations that require a final period.
! Four entry lexicons, one per Gram/... tag, all continuing to ab-dot-adv.
LEXICON ab-dot-adv-itrab
  +ABBR+Gram/IAbbr:      ab-dot-adv ;

LEXICON ab-dot-adv-numnoab
  +ABBR+Gram/NumNoAbbr:  ab-dot-adv ;

LEXICON ab-dot-adv-trab
  +ABBR+Gram/TAbbr:      ab-dot-adv ;

LEXICON ab-dot-adv-trnumab
  +ABBR+Gram/TNumAbbr:   ab-dot-adv ;

LEXICON ab-dot-adv !!= * **@CODE@** This is the lexicon for abbrs that must have a period.
  DOT ;   ! Adv without case.
!LEXICON ab-dot-num-itrab +Num+ABBR+Gram/IAbbr: ab-dot-num ;
! Numeral abbreviations that require a final period.
! Unlike the noun/adj/adv entry lexicons, this one also adds +Num itself.
LEXICON ab-dot-num-trab
  +Num+ABBR+Gram/TAbbr:  ab-dot-num ;

LEXICON ab-dot-num !!= * **@CODE@** This is the lexicon for abbrs that must have a period.
  dot-nomaccgen-infl ;
!LEXICON ab-dot-cc-itrab +CC+ABBR+Gram/IAbbr: ab-dot-cc ;
LEXICON ab-dot-cc-trab
  +CC+ABBR+Gram/TAbbr:  ab-dot-cc ;

! Conjunction abbreviations that require a final period.
! The only continuation is DOT.
LEXICON ab-dot-cc !!= * **@CODE@**
  DOT ;

! Then, as an afterthought, come our two verbs, gč. and vrd.
!LEXICON ab-verb-itrab +V+ABBR+Gram/IAbbr: ab-verb ;
!LEXICON ab-verb-trab +V+ABBR+Gram/TAbbr: ab-verb ;
!LEXICON ab-verb !!= * __@CODE@__ A lexicon for "gč." and perhaps also other abbreviated verbs.
! The two entries below belong to the disabled ab-verb lexicon above.
! They used to be active, and because lexc attaches an entry to the most
! recent LEXICON header they were compiled as part of ab-dot-cc, giving
! conjunction abbreviations a path into ab-dot-verb and ab-nodot-verb.
! They are disabled together with their header; re-enable all three lines
! at once if ab-verb is needed again.
!+Use/NG: ab-dot-verb ;
!ab-nodot-verb ;
! Verb abbreviations. The entry lexicons add +V, +ABBR and one Gram/... tag,
! then continue to ab-dot-verb. The -trnumab variant is disabled.
LEXICON ab-dot-verb-itrab
  +V+ABBR+Gram/IAbbr:  ab-dot-verb ;

LEXICON ab-dot-verb-trab
  +V+ABBR+Gram/TAbbr:  ab-dot-verb ;

!LEXICON ab-dot-verb-trnumab +V+ABBR+Gram/TNumAbbr: ab-dot-verb ;

! Both verb lexicons add +TV+Imprt; they differ only in how the word ends.
LEXICON ab-dot-verb !!= * **@CODE@**
  +TV+Imprt:  DOT ;   ! Period.

LEXICON ab-nodot-verb !!= * **@CODE@**
  +TV+Imprt:  # ;     ! No period.
! riegádan:
! Abbreviated form analysed as +Sg+IV+PrfPrc, always with a period.
! The -trnumab variant is disabled.
LEXICON ab-dot-IVprfprc-trab
  +V+ABBR+Gram/TAbbr:  ab-dot-IVprfprc ;

!LEXICON ab-dot-IVprfprc-trnumab +V+ABBR+Gram/TNumAbbr: ab-dot-IVprfprc ;

LEXICON ab-dot-IVprfprc !!= * **@CODE@**
  +Sg+IV+PrfPrc:  DOT ;   ! Period.
! Inflection for abbreviations without a final period.

! Union of the +Attr reading and the nom/gen set below.
LEXICON nodot-attrnomaccgen-infl !!= * **@CODE@**
  nodot-attr-infl      ;
  nodot-nomaccgen-infl ;

! Adds +Attr on the analysis side only and ends the word.
LEXICON nodot-attr-infl !!= * **@CODE@**
  +Attr:  # ;

! +Sg+Nom adds nothing to the surface form; +Sg+Gen adds a literal colon
! (escaped as %:) followed by n. RHyph is defined outside this part of
! the file.
LEXICON nodot-nomaccgen-infl !!= * **@CODE@**
  +Sg+Nom:      # ;
  +Sg+Gen:%:n   # ;
                RHyph ;
! Oblique and plural case endings for abbreviations without a final period.
! On the surface side every entry starts with %> and a literal colon (%:)
! before the case ending, and each entry ends the word.
LEXICON nodot-oblique-infl
  ! Singular
  +Sg+Ill:%>%:iin    # ;
  +Sg+Ine:%>%:ssa    # ;
  +Sg+Ela:%>%:sta    # ;
  +Sg+Ess:%>%:na     # ;
  ! Plural
  +Pl+Nom:%>%:t      # ;
  +Pl+Gen:%>%:iden   # ;
  +Pl+Ill:%>%:ihin   # ;
  +Pl+Ine:%>%:issa   # ;
  +Pl+Ela:%>%:ista   # ;
  +Pl+Com:%>%:ine    # ;
! Inflection for abbreviations that require a final period. Every reading
! adds only analysis tags and then continues to DOT.

! Union of the nom/acc/gen set and the +Attr reading.
LEXICON dot-attrnomaccgen-infl !!= * **@CODE@**
  dot-nomaccgen-infl ;
  dot-attr           ;

LEXICON dot-attr !!= * **@CODE@**
  +Attr:    DOT ;

LEXICON dot-nomaccgen-infl !!= * **@CODE@**
  +Sg+Nom:  DOT ;
  +Sg+Acc:  DOT ;
  +Sg+Gen:  DOT ;
! Final lexicon for every abbreviation that must end in a period. It offers
! three alternative endings, each terminating the word (#):
!   1. a plain entry tagged +Use/-PMatch with a literal "." on the surface side;
!   2. a regex entry carrying the @P.Pmatch.Loc@ flag, a "." on both sides,
!      and the tags +CLB and +Use/PMatch on the analysis side only;
!   3. a regex entry carrying +Use/PMatch on the analysis side, the
!      @P.Pmatch.Backtrack@ flag, and a "." on the surface side only.
LEXICON DOT !!= * **@CODE@** - Adds the dot to dotted abbreviations.
+Use/-PMatch:%. # ; ! We need the dot here for regular fsts
! Split the abbr + full stop in two segments, but only when using pmatch:
< "@P.Pmatch.Loc@" {.} "+CLB":0 "+Use/PMatch":0 > # ;
! Make a regular ABBR analysis AND backtrack to find alternative analyses:
< "+Use/PMatch":0 "@P.Pmatch.Backtrack@" 0:%. > # ;
! Gives:
!$ echo 'su.' \
!| hfst-tokenise -g tools/tokenisers/tokeniser-gramcheck-gt-desc.pmhfst
!"<su.>"
! "." CLB <W:0.0> "<.>"
! "su" Adv ABBR Gram/NumNoAbbr <W:0.0> "<su>"
! "su" Adv ABBR Gram/NumNoAbbr <W:0.0>
! "." CLB <W:0.0> "<.>"
! "son" Pron Pers Sg3 Gen <W:0.0> "<su>"
! "." CLB <W:0.0> "<.>"
! "son" Pron Pers Sg3 Acc <W:0.0> "<su>"
!:\n
!
! which is exactly what we want.