! tools/tokenisers/tokeniser-gramcheck-gt-desc.pmscript

!! # Grammar checker tokenisation for slh

!! Requires a recent version of HFST (3.10.0 / git revision>=3aecdbc)
!! Then just:
!! ```
!! $ make
!! $ echo "ja, ja" | hfst-tokenise --giella-cg tokeniser-gramcheck-gt-desc.pmhfst
!! ```

!! More usage examples:
!! ```
!! $ echo "Juos gorreválggain lea (dárbbašlaš) deavdit gáibádusa boasttu olmmoš, man mielde lahtuid." | hfst-tokenise --giella-cg tokeniser-gramcheck-gt-desc.pmhfst
!! $ echo "(gáfe) 'ja' ja 3. ja? ц jaja ukjend \"ukjend\"" | hfst-tokenise --giella-cg tokeniser-gramcheck-gt-desc.pmhfst
!! $ echo "márffibiillagáffe" | hfst-tokenise --giella-cg tokeniser-gramcheck-gt-desc.pmhfst
!! ```

!! Pmatch documentation:
!! <https://github.com/hfst/hfst/wiki/HfstPmatch>

! Word boundaries are spelled out in this script through explicit
! LC()/RC() context conditions on the individual token types.
! NOTE(review): presumably this switches off pmatch's own requirement that
! matches be delimited by separators — confirm against the pmatch docs.
set need-separators off

! The lexical analyser, loaded as a prebuilt binary transducer.
Define lexmorph
    @bin"analyser_relabelled-gramcheck-gt-desc.hfst" ;

! Word analyses: the analyser as it is, unioned with its UpCase() variant.
Define morphology
    [ lexmorph
    | UpCase(lexmorph, U)
    ] ;

! A second prebuilt binary transducer, used for URLs.
Define url
    @bin"analyser-url-gt-desc.hfst" ;

! `?` does not match flags, so they have to be listed explicitly. The
! inventory is language dependent and is therefore built dynamically into
! the transducer loaded here:
Define flags
    @bin"filters/gramcheck-tokeniser-flags.hfst" ;

! The two pmatch marker symbols, each paired with epsilon on the input side:
Define inputmark
    [ 0:"@PMATCH_INPUT_MARK@"
    | 0:"@PMATCH_BACKTRACKING@"
    ] ;

!! Characters which have analyses in the lexicon, but can appear without spaces
!! before/after, that is, with no context conditions, and adjacent to words:
!! * Punct contains ASCII punctuation marks
!! * The symbol after the em dash is a soft hyphen, `U+00AD`
!! * The symbol following {•} is a byte-order mark / zero-width no-break space,
!!   `U+FEFF`.
! NB: the soft hyphen and U+FEFF are invisible in most editors; take care
! not to drop them when editing this list.
Define incondform      Punct|
                       {„}|{“}|{”}|{…}|{‚}|{‘}|{’}|
                       {–}|{—}|{­}|{_}|{<}|{>}|{«}|{»}|
                       {@}|{'}|{‹}|{›}|{➤}|{•}|{﻿} ;

!! Whitespace contains ASCII white space, and
!! the list contains some Unicode white space characters:
!! * En Quad U+2000 to Zero-Width Joiner U+200D
!! * Narrow No-Break Space U+202F
!! * Medium Mathematical Space U+205F
!! * Word Joiner U+2060
! Note that incondform is part of blank as well, so those characters also
! count as delimiters wherever blank is used in an LC()/RC() condition.
Define blank           Whitespace | incondform
                       | Lst({           ​‌‍  ⁠})
                       ;

! Symbols with nothing on the input side (0:?), the pmatch marker symbols,
! and the flags.
Define nonprintable
    [ 0:?
    | inputmark
    | flags
    ] ;

! Either an ordinary symbol or one of the nonprintable ones.
Define any
    [ ?
    | nonprintable
    ] ;

! Analyses from the morphology that end in an incondform symbol, optionally
! followed by nonprintable symbols. No left/right context condition is
! imposed on these.
Define incondword
    morphology
    & [ any* incondform:[?*] nonprintable* ] ;

! Known words and URLs are only accepted when they have a blank symbol or
! `#` immediately to their left and to their right.
Define morphoword
    morphology
    LC([ blank | # ])
    RC([ blank | # ]) ;

Define urlword
    url
    LC([ blank | # ])
    RC([ blank | # ]) ;

!! Apart from what's in our morphology, there are
!! 1) unknown word-like forms, and
!! 2) unmatched strings
!! We want to give 1) a match, but let 2) be treated specially by hfst-tokenise -a
! Inventory of symbols that an unknown word-like form may be built from
! (used by alphaword below).
! NOTE(review): "a-z", "A-Z" and "0-9" are presumably pmatch character
! ranges — confirm against the pmatch documentation.
! NB: several of the lists contain combining or otherwise invisible
! characters; edit with care.
Define alphabet "a-z"
               |"A-Z"
               |Lst({àáâãāăȧäảåǎȁȃąạḁæǽǣèéêẽēĕėëẻěȅȇẹȩęḙḛìíîĩīīĭi̇ïỉǐịįȉȋḭɨòóôõōŏȯöỏőǒȍȏơǫọɵøờớỡởợǭộǿœùúûũūŭüủůűǔȕȗưụṳųṷṵừứữửựʉỳýŷỹȳẏÿỷƴỵɏÀÁÂÃĀĂȦÄẢÅǍȀȂĄẠḀÆǼǢÈÉÊẼĒĔĖËẺĚȄȆẸȨĘḘḚÌÍÎĨĪĪĬİÏỈǏỊĮȈȊḬƗÒÓÔÕŌŎȮÖỎŐǑȌȎƠǪỌƟØỜỚỠỞỢǬỘǾŒÙÚÛŨŪŬÜỦŮŰǓȔȖƯỤṲŲṶṴỪỨỮỬỰɄỲÝŶỸȲẎŸỶƳỴɎšžčđðíŋňŧñńŠŽČĐÐÍŊŇŦÑ})
               !! * select extended latin symbols
               | "0-9"
               | Lst({_§°†}) !! * select symbols
               ! Combining diacritics as individual symbols,
               ! to be able to analyse unknown words with
               ! decomposed diacritics. All combining diacritics
               ! U+0300—U+036F, U+20D0—U+20F0. Grouped according
               ! to position relative to base char, then more or
               ! less according to Unicode number.
               ! NB: The following list will look odd in many editors!
               | Lst({̶̴̵̶̷̸⃥⃦⃪⃫⃒⃓⃘⃙⃚̡̢̧̨᷐᷎̛᷹̖̗̘̟̠̣̤̥̦̩̪̫̬̭̮̯̰̱̲̳̹̺̻̼͇͈͉͍͎͙͚͓͔͕͖⃨⃬⃭⃮⃯᷂᷏᷹᷽᷿᷊᷷᷸̀́̂̌̃̄̅̆̇̈⃰̉̊̋̍̎̏̐̑̒̓̔̽̾̿̀́͂̓̈́͆͊͋͌͐͑͒͗͛⃐⃑⃔⃕⃖⃗⃛⃜⃡⃧⃩᷀᷁᷃᷄᷅᷆᷇᷈᷉᷋᷌᷵᷻᷾ͣͤͥͦͧͨͩͪͫͬͭͮͯ᷑᷒ᷓᷔᷕᷖᷗᷘᷙᷚᷛᷜᷝᷞᷟᷠᷡᷢᷣᷤᷥᷦᷧᷨᷩᷪᷫᷬᷭᷮᷯᷰᷱᷲᷳᷴ̕̚͘᷶͜͟͢᷼͝͞͠͡᷍ͅ⃝⃞⃟⃠⃢⃣⃤})
               !! * various symbols from Private area (probably Microsoft),
               !!   so far:
               !!   * U+F0B7 for "x in box"
               | {}
               ;

! Characters allowed between two alphabetic stretches of an unknown word.
! They never start or end an alphaword: foo-bar is one big unknown, but
! -foo (or bar-) is two tokens.
Define alphamiddle {'}|{-}|{:} ;

! One or more alphabetic stretches, joined by single alphamiddle symbols.
! The joined part is repeatable, so that foo-bar-baz becomes one unknown
! just like foo-bar, instead of being cut off after the second part.
Define alphaword alphabet+ [ alphamiddle alphabet+ ]* ;
!! TODO: Could use something like this, but built-in's don't include šžđčŋ:
! Define MixCase(X) [OptCap(X) | UpCase(X)];
! Define alphaword MixCase(alphabet+);


!!! Unknown handling:
! Upper-side projection of alphaword.
Define unknownform
    [ alphaword ].u ;

!! An unknown form simply gets an empty reading.
!! hfst-tokenise --giella-cg treats such empty analyses as unknowns, and
!! removes empty analyses from other readings. Empty readings are legal in
!! CG too (the baseform then defaults to the wordform), but since they carry
!! no tag to check, it is safer to leave them to hfst-tokenise.
Define unknownword
    unknownform 0:% %?%?::280284
    LC([ blank | # ])
    RC([ blank | # ]) ;


! Superblank handling (TODO):
! Define anyExceptEsc [ ? - [ %\ ] ];
! Define unescaped [ anyExceptEsc | %\ ? | # ]+;
! Define anyExceptSuperendEsc [ ? - [ %\ | %] ] ];
! Define notSuperend [ %\ ? | anyExceptSuperendEsc ];
! Define superblank %[ notSuperend* %] EndTag(superblank) LC(unescaped) ;
! Define tokenOrBlank [ token | superblank ];
! Needs hfst-tokenise to output things differently depending on the tag they get

!! Finally, a token is any sequence that makes up one of:
!! * a known word in context
!! * an unknown (OOV) token in context
!! * a sequence of word and punctuation
!! * a URL in context
Define token
    [ morphoword
    | unknownword
    | incondword
    | Ins(urlword)
    ] EndTag(token) ;

! The top-level expression of the tokeniser.
regex token ;