! tools/tokenisers/tokeniser-tts-cggt-desc.pmscript

!! # TTS tokenisation for smj

!! Requires a recent version of HFST (3.10.0 or later / git revision >= 3aecdbc).
!! Then just:
!! ```sh
!! make
!! echo "ja, ja" \
!! | hfst-tokenise --giella-cg tokeniser-tts-cggt-desc.pmhfst
!! ```

!! More usage examples:
!! ```sh
!! echo "Juos gorreválggain lea (dárbbašlaš) deavdit gáibádusa \
!!   boasttu olmmoš, man mielde lahtuid." \
!!   | hfst-tokenise --giella-cg tokeniser-tts-cggt-desc.pmhfst
!! echo "(gáfe) 'ja' ja 3. ja? ц jaja ukjend \"ukjend\"" \
!!   | hfst-tokenise --giella-cg tokeniser-tts-cggt-desc.pmhfst
!! echo "márffibiillagáffe" \
!!   | hfst-tokenise --giella-cg tokeniser-tts-cggt-desc.pmhfst
!! ```

!! Pmatch documentation:
!! <https://kitwiki.csc.fi/twiki/bin/view/KitWiki/HfstPmatch>

! Switch off pmatch's need-separators setting. Word-boundary requirements are
! instead written out rule by rule with LC()/RC() contexts further down.
! NOTE(review): presumably the default (on) only accepts matches that are
! delimited by separators — confirm against the pmatch documentation.
set need-separators off

! Lexical transducers, loaded from precompiled binaries
! (per their file names: one for the input side, one for the output side).
Define inputlexmorph
       @bin"analyser_relabelled-tts-cggt-input.hfst" ;
Define outputlexmorph
       @bin"analyser_relabelled-tts-cggt-output.hfst" ;

! Input side: the input lexicon as is, or run through UpCase(.., U).
Define inmorphology
       [ inputlexmorph
       | UpCase(inputlexmorph, U)
       ] ;

! Output side: the output lexicon, unchanged.
Define outmorphology
       [ outputlexmorph ] ;

! The complete analyser: input side composed with output side.
Define morphology
       [ inmorphology .o. outmorphology ] ;

! Precompiled transducer used by the urlword rule
! (presumably a URL recogniser, judging by the file name).
Define url @bin"analyser-url-gt-desc.hfst" ;

! `?` does not match flags, so they must be listed explicitly. The list is
! language dependent and is therefore built dynamically.
Define flags
       @bin"filters/tts-tokeniser-flags.hfst" ;

! pmatch marker symbols; each is paired with epsilon (0) on the other side.
Define inputmark
       [ 0:"@PMATCH_INPUT_MARK@"
       | 0:"@PMATCH_BACKTRACKING@"
       ] ;

!! Characters which have analyses in the lexicon, but can appear without spaces
!! before/after, that is, with no context conditions, and adjacent to words:
!! * Punct contains ASCII punctuation marks
!! * The symbol after the em dash is a soft hyphen, `U+00AD`
!! * The symbol following {•} is a byte-order mark / zero-width no-break space,
!!   `U+FEFF`.
Define incondform      Punct|
                       {„}|{“}|{”}|{…}|{‚}|{‘}|{’}|
                       {–}|{—}|{­}|{_}|{<}|{>}|{«}|{»}|
                       {@}|{'}|{‹}|{›}|{➤}|{•}|{﻿} ;

!! Whitespace contains ASCII white space, and
!! the Lst(...) list contains some Unicode white space characters:
!! * En Quad U+2000 to Zero-Width Joiner U+200D
!! * Narrow No-Break Space U+202F
!! * Medium Mathematical Space U+205F
!! * Word Joiner U+2060
!! Note that blank also includes everything in incondform.
Define blank           Whitespace | incondform
                       | Lst({           ​‌‍  ⁠})
                       ;

! Symbols that are paired with epsilon on one side: any such symbol (0:?),
! the pmatch markers, and flags.
Define nonprintable
       [ 0:?
       | inputmark
       | flags
       ] ;

! A single arbitrary symbol, or one of the nonprintable ones.
Define any
       [ ?
       | nonprintable
       ] ;

! Analyses from the morphology whose form ends in one of the incondform
! characters (punctuation etc.) – these get no context condition.
Define incondword
       morphology
       & [ any* incondform:[?*] nonprintable* ] ;

! Words known to the morphology. Both the left and the right context must be
! either a blank or the boundary symbol #.
Define morphoword
       Uncompose(morphology, inmorphology, outmorphology)
       LC([ blank | # ])
       RC([ blank | # ]) ;

! Strings accepted by the url transducer, under the same context conditions.
Define urlword
       url
       LC([ blank | # ])
       RC([ blank | # ]) ;

!! Apart from what's in our morphology, there are
!! 1) unknown word-like forms, and
!! 2) unmatched strings
!! We want to give 1) a match, but let 2) be treated specially by hfst-tokenise -a
Define alphabet "a-z"
               |"A-Z"
               |Lst({àáâãāăȧäảåǎȁȃąạḁæǽǣèéêẽēĕėëẻěȅȇẹȩęḙḛìíîĩīīĭi̇ïỉǐịįȉȋḭɨòóôõōŏȯöỏőǒȍȏơǫọɵøờớỡởợǭộǿœùúûũūŭüủůűǔȕȗưụṳųṷṵừứữửựʉỳýŷỹȳẏÿỷƴỵɏÀÁÂÃĀĂȦÄẢÅǍȀȂĄẠḀÆǼǢÈÉÊẼĒĔĖËẺĚȄȆẸȨĘḘḚÌÍÎĨĪĪĬİÏỈǏỊĮȈȊḬƗÒÓÔÕŌŎȮÖỎŐǑȌȎƠǪỌƟØỜỚỠỞỢǬỘǾŒÙÚÛŨŪŬÜỦŮŰǓȔȖƯỤṲŲṶṴỪỨỮỬỰɄỲÝŶỸȲẎŸỶƳỴɎšžčđðíŋňŧñńŠŽČĐÐÍŊŇŦÑ})
               !! * select extended latin symbols
               | "0-9"
               | Lst({_§°†}) !! * select symbols
               ! Combining diacritics as individual symbols,
               ! to be able to analyse unknown words with
               ! decomposed diacritics. All combining diacritics
               ! U+0300—U+036F, U+20D0—U+20F0 (the list also appears
               ! to contain marks from U+1DC0—U+1DFF). Grouped
               ! according to position relative to the base char,
               ! then more or less according to Unicode number.
               ! NB: The following list will look odd in many editors!
               | Lst({̶̴̵̶̷̸⃥⃦⃪⃫⃒⃓⃘⃙⃚̡̢̧̨᷐᷎̛᷹̖̗̘̟̠̣̤̥̦̩̪̫̬̭̮̯̰̱̲̳̹̺̻̼͇͈͉͍͎͙͚͓͔͕͖⃨⃬⃭⃮⃯᷂᷏᷹᷽᷿᷊᷷᷸̀́̂̌̃̄̅̆̇̈⃰̉̊̋̍̎̏̐̑̒̓̔̽̾̿̀́͂̓̈́͆͊͋͌͐͑͒͗͛⃐⃑⃔⃕⃖⃗⃛⃜⃡⃧⃩᷀᷁᷃᷄᷅᷆᷇᷈᷉᷋᷌᷵᷻᷾ͣͤͥͦͧͨͩͪͫͬͭͮͯ᷑᷒ᷓᷔᷕᷖᷗᷘᷙᷚᷛᷜᷝᷞᷟᷠᷡᷢᷣᷤᷥᷦᷧᷨᷩᷪᷫᷬᷭᷮᷯᷰᷱᷲᷳᷴ̕̚͘᷶͜͟͢᷼͝͞͠͡᷍ͅ⃝⃞⃟⃠⃢⃣⃤})
               !! * various symbols from Private area (probably Microsoft),
               !!   so far:
               !!   * U+F0B7 for "x in box"
               | {}
               ;

! An apostrophe is only allowed word-internally: foo'bar is treated as one
! big unknown, whereas 'foo (or bar') becomes two tokens.
Define alphamiddle
       {'} ;

! One or more alphabet symbols, optionally followed by a single alphamiddle
! and one or more further alphabet symbols.
Define alphaword
       alphabet+
       ( alphamiddle alphabet+ ) ;

!! TODO: Could use something like this, but the built-ins don't include šžđčŋ:
! Define MixCase(X) [OptCap(X) | UpCase(X)];
! Define alphaword MixCase(alphabet+);


!!! Unknown handling:
! Word-like strings: the upper projection (.u) of alphaword.
Define unknownform [alphaword].u ;
!! Give unknown word-like forms a reading of their own:
! The rule pairs epsilon with the literal string " ??" (0:% %?%?) after the
! form, and requires a blank or the boundary symbol # on both sides.
! NOTE(review): `::280284` looks like a weight on this reading, presumably
! there to rank it below analyses from the lexicon — confirm.
Define unknownword unknownform 0:% %?%?::280284
                        LC([blank | #]) RC([[blank ] | # ]);
!! NOTE(review): the remarks below describe empty readings, whereas the rule
!! above emits " ??" rather than an empty analysis — confirm they still apply.
!! hfst-tokenise --giella-cg will treat such empty analyses as unknowns, and
!! remove empty analyses from other readings. Empty readings are also
!! legal in CG, they get a default baseform equal to the wordform, but
!! no tag to check, so it's safer to let hfst-tokenise handle them.


!!! Superblank handling (TODO):
! Define anyExceptEsc [ ? - [ %\ ] ];
! Define unescaped [ anyExceptEsc | %\ ? | # ]+;
! Define anyExceptSuperendEsc [ ? - [ %\ | %] ] ];
! Define notSuperend [ %\ ? | anyExceptSuperendEsc ];
! Define superblank %[ notSuperend* %] EndTag(superblank) LC(unescaped) ;
! Define tokenOrBlank [ token | superblank ];
!! Needs hfst-tokenise to output things differently depending on the tag they get


! Alternative definition, currently disabled:
! Define token [ Uncompose(morphoword, inmorphology, outmorphology, phon) | unknownwordEmpty | incondword | Ins(urlword) ] EndTag(token);

! A token is a known word, an unknown word-like form, a form ending in an
! incondform character, or a URL; every match is tagged with EndTag(token).
Define token
       [ morphoword
       | unknownword
       | incondword
       | Ins(urlword)
       ] EndTag(token) ;

! Top-level expression of the script.
regex token ;