Permalink
Fetching contributors…
Cannot retrieve contributors at this time
163 lines (140 sloc) 5.81 KB
! guesser.lexc
!
! Released under the terms of the MIT License
! Copyright (c) 2011-2015 Çağrı Çöltekin <cagri@coltekin.net>
!
! Permission is hereby granted, free of charge, to any person obtaining a
! copy of this software and associated documentation files (the "Software"),
! to deal in the Software without restriction, including without limitation
! the rights to use, copy, modify, merge, publish, distribute, sublicense,
! and/or sell copies of the Software, and to permit persons to whom the
! Software is furnished to do so, subject to the following conditions:
!
! The above copyright notice and this permission notice shall be included in
! all copies or substantial portions of the Software.
!
! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
! DEALINGS IN THE SOFTWARE.
!
!
! The aim of this file is to define lexical items with minimal/reasonable
! restrictions instead of a list of stems. This allows us to guess
! unknown words based on their morphological properties.
!
! This file defines lexicons for all word classes we use in
! morph.lexc.
!
! The file is PREPROCESSED using the C preprocessor before being passed to
! the lexc compiler to avoid some repetition. The following symbols
! are replaced with the corresponding sets of symbols:
!
! - TRUC An uppercase character from the Turkish alphabet
! - TRLC A lowercase character from the Turkish alphabet
! - TRXUC TRUC and the letters Q, W, X. These are not officially
! in Turkish alphabet, but occur frequently in
! some terms and foreign (mostly English) origin words
! (and starting to be used often in some Kurdish words or
! names).
! - TRXLC Lowercase version of TRXUC
! - LATUC A wider range of uppercase characters from European
! languages, and languages written with a Latin-based
! alphabet in general. This is particularly
! useful for catching proper names in other languages.
! - LATLC Lowercase version of LATUC.
!
!
!
#include "options.h"
! Character-class macros. Each one expands to a bare alternation without
! brackets, so every use site has to wrap it in [ ] itself.
!
! Turkish vowels, including the circumflexed letters.
#define TRUC_V E|İ|Ö|Ü|Î|A|I|O|U|Â|Û
#define TRLC_V e|i|ö|ü|î|a|ı|o|u|â|û
! Turkish consonants; every letter is listed exactly once.
#define TRUC_C B|D|C|G|V|Z|J|F|Ğ|L|M|N|R|Y|P|T|Ç|K|S|Ş|H
#define TRLC_C b|d|c|g|v|z|j|f|ğ|l|m|n|r|y|p|t|ç|k|s|ş|h
! Decimal digits. The zero is escaped with % because a bare 0 denotes the
! empty string in lexc/xfst regular expressions.
#define DIGIT %0|1|2|3|4|5|6|7|8|9
! Full Turkish alphabet, then the same set extended with Q, W and X.
#define TRUC TRUC_V|TRUC_C
#define TRLC TRLC_V|TRLC_C
#define TRXUC TRUC|Q|W|X
#define TRXLC TRLC|q|w|x
! Wide Latin-script letter sets. Every alternative must be a single code
! point: adjacent letters in a regular expression form one multicharacter
! symbol, so the ligatures are written as U+0132, U+0133 and U+0149 rather
! than as two-letter sequences.
! NOTE(review): the sharp s in the uppercase list is a lowercase letter; it
! is left in place because the visible patterns always union both sets.
#define LATUC A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|À|Á|Â|Ã|Ä|Å|Æ|Ç|È|É|Ê|Ë|Ì|Í|Î|Ï|Ð|Ñ|Ò|Ó|Ô|Õ|Ö|Ø|Ù|Ú|Û|Ü|Ý|Þ|ß|Ĉ|Ć|Ą|Ā|Ă|Ċ|Č|Ď|Đ|Ē|Ĕ|Ė|Ę|Ě|Ĝ|Ğ|Ġ|Ģ|Ĥ|Ħ|Ĩ|Ī|Ĭ|Į|İ|Ĳ|Ĵ|Ķ|Ĺ|Ļ|Ľ|Ŀ|Ł|Ń|Ņ|Ň|Ŋ|Ō|Ŏ|Ő|Œ|Ŕ|Ŗ|Ř|Ś|Ŝ|Ş|Š|Ţ|Ť|Ŧ|Ũ|Ū|Ŭ|Ů|Ű|Ų|Ŵ|Ŷ|Ÿ|Ź|Ż|Ž
#define LATLC a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|à|á|â|ã|ä|å|æ|ç|è|é|ê|ë|ì|í|î|ï|ð|ñ|ò|ó|ô|õ|ö|ø|ù|ú|û|ü|ý|þ|ÿ|ā|ă|ą|ć|ĉ|ċ|č|ď|đ|ē|ĕ|ė|ę|ě|ĝ|ğ|ġ|ģ|ĥ|ħ|ĩ|ī|ĭ|į|ı|ĳ|ĵ|ķ|ĸ|ĺ|ļ|ľ|ŀ|ł|ń|ņ|ň|ŉ|ŋ|ō|ŏ|ő|œ|ŕ|ŗ|ř|ś|ŝ|ş|š|ţ|ť|ŧ|ũ|ū|ŭ|ů|ű|ų|ŵ|ŷ|ź|ż|ž|ſ
! Repetition bounds for the guesser patterns. The defaults (2 and 9) apply
! unless the corresponding GUESSER_*_LENGTH symbol is already defined at
! this point - presumably by options.h or on the preprocessor command line.
! In patterns that match the first letter separately, the guessed stem is
! one symbol longer than these bounds.
#ifndef GUESSER_MIN_LENGTH
#define MIN 2
#else
#define MIN GUESSER_MIN_LENGTH
#endif
#ifndef GUESSER_MAX_LENGTH
#define MAX 9
#else
#define MAX GUESSER_MAX_LENGTH
#endif
! Guesser entries. Each entry is a regular expression that stands in for a
! stem list and continues to the class named after it. All word classes
! share the single Root lexicon; the per-class lexicon names survive only
! as comment labels.
!
! Local shorthands for the recurring patterns. They are undefined again at
! the end of this lexicon so that they cannot leak into the file included
! afterwards.
!   - Turkish stem: one Turkish letter of either case, then lowercase letters
!   - Turkish name: one uppercase Turkish letter, then lowercase letters
!   - Latin stem:   wide Latin letters, digits, hyphen and plus sign
!   - no guess:     \? is the complement of the any-symbol wildcard, so it
!                   presumably matches nothing and the entry only keeps the
!                   continuation class referenced from Root - TODO confirm
#define GUESS_TR_STEM <[TRUC|TRLC][TRLC]^{MIN,MAX}>
#define GUESS_TR_NAME <[TRUC][TRLC]^{MIN,MAX}>
#define GUESS_LAT_STEM <[LATUC|LATLC|DIGIT|%-|%+]^{MIN,MAX}>
#define GUESS_NONE <\?>
LEXICON Root
! LEXICON NRoot
! Disabled alternative, kept for reference:
!<[TRUC|TRLC][TRLC]^{MIN,MAX}> N;
GUESS_LAT_STEM N;
GUESS_TR_STEM Ncomp;
GUESS_NONE Nfamily;
GUESS_NONE Ntime;
! LEXICON NPRoot
! Disabled alternative, kept for reference:
!<[TRUC][TRLC]^{MIN,MAX}> NP;
GUESS_LAT_STEM NP;
GUESS_TR_NAME NPcomp;
! LEXICON AdjRoot
GUESS_TR_STEM Adj;
GUESS_NONE Adj;
GUESS_NONE AdjNcomp;
! LEXICON AdvRoot
GUESS_TR_STEM Adv;
GUESS_NONE AdvTime;
! LEXICON VerbRoot
! One entry per verb continuation class, all sharing the same stem pattern.
GUESS_TR_STEM V;
GUESS_TR_STEM V_AorAr;
GUESS_TR_STEM V_CausAr_AorAr;
GUESS_TR_STEM V_CausArt_AorAr;
GUESS_TR_STEM V_CausAt_AorAr;
GUESS_TR_STEM V_CausDir;
GUESS_TR_STEM V_CausDir_AorAr;
GUESS_TR_STEM V_CausIr_AorAr;
GUESS_TR_STEM V_CausIrreg;
GUESS_TR_STEM V_CausIrreg_AorAr;
GUESS_TR_STEM V_CausIt_AorAr;
GUESS_TR_STEM V_Rcp;
GUESS_TR_STEM V_Rcp_AorAr;
GUESS_TR_STEM V_Rcp_CausDir;
GUESS_TR_STEM V_Rcp_CausDir_AorAr;
GUESS_TR_STEM V_Rcp_CausIrreg_AorAr;
GUESS_TR_STEM V_Rfl;
GUESS_TR_STEM V_Rfl_AorAr;
GUESS_TR_STEM V_Rfl_CausDir_AorAr;
GUESS_TR_STEM V_Rfl_CausIrreg_AorAr;
GUESS_TR_STEM V_Rfl_Rcp;
GUESS_TR_STEM V_Rfl_Rcp_AorAr;
GUESS_TR_STEM V_Rfl_Rcp_CausDir_AorAr;
GUESS_TR_STEM V_Rfl_Rcp_CausIrreg;
! LEXICON InterjectionRoot
GUESS_TR_STEM Interjection;
! LEXICON OnomRoot
GUESS_TR_STEM Onom;
! LEXICON Abbreviation
! These patterns use a fixed lower bound of 1 instead of the configurable
! minimum.
<[TRUC|TRLC][TRLC]^{1,MAX}> N_unit;
<[TRUC]^{1,MAX}> N_abbr;
<[TRUC|%.]^{1,MAX}> NP_abbr;
!
! Guessing the following is not really necessary since all of these
! are closed-class words.
!
! LEXICON PrnRoot
GUESS_NONE Prn;
GUESS_NONE DemPrn;
GUESS_NONE LocPrn;
GUESS_NONE PersPrn;
! LEXICON DetRoot
GUESS_NONE Det;
! LEXICON CnjRoot
GUESS_NONE CnjCoo;
GUESS_NONE CnjAdv;
GUESS_NONE CnjSub;
#undef GUESS_TR_STEM
#undef GUESS_TR_NAME
#undef GUESS_LAT_STEM
#undef GUESS_NONE
#include "morph.cpp.lexc"