From 59b27c998ce5fc950b5efc5a82627b94192c03cf Mon Sep 17 00:00:00 2001 From: Christophe Rhodes Date: Wed, 5 Aug 2009 14:17:51 +0000 Subject: [PATCH] 1.0.30.36: Hangul syllable character names Provide for the construction of Hangul syllable character names, as required by Unicode ("This character name is a normative property of the character"). At present done at build-time; if the increase in core size is too painful, it can be done algorithmically in CHAR-NAME and NAME-CHAR. --- NEWS | 4 +- tools-for-build/Jamo.txt | 92 ++++++++++++++++++++++++++++++++++++++++ tools-for-build/ucd.lisp | 36 ++++++++++++++++ version.lisp-expr | 2 +- 4 files changed, 132 insertions(+), 2 deletions(-) create mode 100644 tools-for-build/Jamo.txt diff --git a/NEWS b/NEWS index f743ead17..b9d2bb4fb 100644 --- a/NEWS +++ b/NEWS @@ -4,9 +4,11 @@ changes relative to sbcl-1.0.30: inlining cases: failure to stack allocate when equivalent code is manually open coded is now considered a bug. * improvement: the Unicode character database has been upgraded to the - Unicode 5.0.1 standard, giving names and properties to a number of new + Unicode 5.1 standard, giving names and properties to a number of new characters, and providing a few extra characters with case transformations. + * improvement: the system now recognizes and produces names for Unicode + Hangul syllable characters. * new feature: experimental :EMIT-CFASL parameter to COMPILE-FILE can be used to output toplevel compile-time effects into a separate .CFASL file. diff --git a/tools-for-build/Jamo.txt b/tools-for-build/Jamo.txt new file mode 100644 index 000000000..1fde508fb --- /dev/null +++ b/tools-for-build/Jamo.txt @@ -0,0 +1,92 @@ +# Jamo-5.1.0.txt +# Date: 2008-03-20, 17:59:00 PDT [KW] +# +# Unicode Character Database +# Copyright (c) 1991-2008 Unicode, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# For documentation, see UCD.html +# +# This file defines the Jamo Short Name property. 
+# +# See Section 3.12 of The Unicode Standard, Version 5.0 +# for more information. +# +# Each line contains two fields, separated by a semicolon. +# +# The first field gives the code point, in 4-digit hexadecimal +# form, of a combining jamo character that participates in +# the algorithmic determination Hangul syllable character names. +# The second field gives the Jamo Short Name as a one-, two-, +# or three-character ASCII string (or in one case, for U+110B, +# the null string). +# +# ############################################################# + +1100; G # HANGUL CHOSEONG KIYEOK +1101; GG # HANGUL CHOSEONG SSANGKIYEOK +1102; N # HANGUL CHOSEONG NIEUN +1103; D # HANGUL CHOSEONG TIKEUT +1104; DD # HANGUL CHOSEONG SSANGTIKEUT +1105; R # HANGUL CHOSEONG RIEUL +1106; M # HANGUL CHOSEONG MIEUM +1107; B # HANGUL CHOSEONG PIEUP +1108; BB # HANGUL CHOSEONG SSANGPIEUP +1109; S # HANGUL CHOSEONG SIOS +110A; SS # HANGUL CHOSEONG SSANGSIOS +110B; # HANGUL CHOSEONG IEUNG +110C; J # HANGUL CHOSEONG CIEUC +110D; JJ # HANGUL CHOSEONG SSANGCIEUC +110E; C # HANGUL CHOSEONG CHIEUCH +110F; K # HANGUL CHOSEONG KHIEUKH +1110; T # HANGUL CHOSEONG THIEUTH +1111; P # HANGUL CHOSEONG PHIEUPH +1112; H # HANGUL CHOSEONG HIEUH +1161; A # HANGUL JUNGSEONG A +1162; AE # HANGUL JUNGSEONG AE +1163; YA # HANGUL JUNGSEONG YA +1164; YAE # HANGUL JUNGSEONG YAE +1165; EO # HANGUL JUNGSEONG EO +1166; E # HANGUL JUNGSEONG E +1167; YEO # HANGUL JUNGSEONG YEO +1168; YE # HANGUL JUNGSEONG YE +1169; O # HANGUL JUNGSEONG O +116A; WA # HANGUL JUNGSEONG WA +116B; WAE # HANGUL JUNGSEONG WAE +116C; OE # HANGUL JUNGSEONG OE +116D; YO # HANGUL JUNGSEONG YO +116E; U # HANGUL JUNGSEONG U +116F; WEO # HANGUL JUNGSEONG WEO +1170; WE # HANGUL JUNGSEONG WE +1171; WI # HANGUL JUNGSEONG WI +1172; YU # HANGUL JUNGSEONG YU +1173; EU # HANGUL JUNGSEONG EU +1174; YI # HANGUL JUNGSEONG YI +1175; I # HANGUL JUNGSEONG I +11A8; G # HANGUL JONGSEONG KIYEOK +11A9; GG # HANGUL JONGSEONG SSANGKIYEOK +11AA; GS # HANGUL JONGSEONG 
KIYEOK-SIOS
+11AB; N # HANGUL JONGSEONG NIEUN
+11AC; NJ # HANGUL JONGSEONG NIEUN-CIEUC
+11AD; NH # HANGUL JONGSEONG NIEUN-HIEUH
+11AE; D # HANGUL JONGSEONG TIKEUT
+11AF; L # HANGUL JONGSEONG RIEUL
+11B0; LG # HANGUL JONGSEONG RIEUL-KIYEOK
+11B1; LM # HANGUL JONGSEONG RIEUL-MIEUM
+11B2; LB # HANGUL JONGSEONG RIEUL-PIEUP
+11B3; LS # HANGUL JONGSEONG RIEUL-SIOS
+11B4; LT # HANGUL JONGSEONG RIEUL-THIEUTH
+11B5; LP # HANGUL JONGSEONG RIEUL-PHIEUPH
+11B6; LH # HANGUL JONGSEONG RIEUL-HIEUH
+11B7; M # HANGUL JONGSEONG MIEUM
+11B8; B # HANGUL JONGSEONG PIEUP
+11B9; BS # HANGUL JONGSEONG PIEUP-SIOS
+11BA; S # HANGUL JONGSEONG SIOS
+11BB; SS # HANGUL JONGSEONG SSANGSIOS
+11BC; NG # HANGUL JONGSEONG IEUNG
+11BD; J # HANGUL JONGSEONG CIEUC
+11BE; C # HANGUL JONGSEONG CHIEUCH
+11BF; K # HANGUL JONGSEONG KHIEUKH
+11C0; T # HANGUL JONGSEONG THIEUTH
+11C1; P # HANGUL JONGSEONG PHIEUPH
+11C2; H # HANGUL JONGSEONG HIEUH
+
diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp
index 599fda878..7851c367c 100644
--- a/tools-for-build/ucd.lisp
+++ b/tools-for-build/ucd.lisp
@@ -114,8 +114,44 @@
               do (slurp-ucd-line line)))
     (second-pass)
     (build-misc-table)
+    (fixup-hangul-syllables)
     *decompositions*)
 
+(defun fixup-hangul-syllables ()
+  "Store a name in *UNICODE-NAMES* for every Hangul syllable from U+AC00 up."
+  ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12": L, V, T short names.
+  (let* ((sbase #xac00) (lbase #x1100) (vbase #x1161)
+         (tbase #x11a7)                 ; one below the first trailing jamo
+         (lcount 19) (vcount 21) (tcount 28)
+         (ncount (* vcount tcount))
+         (scount (* lcount ncount))     ; 11172 syllables in total
+         (table (make-hash-table)))
+    (with-open-file (*standard-input*
+                     (make-pathname :name "Jamo" :type "txt"
+                                    :defaults *unicode-character-database*))
+      (loop for line = (read-line nil nil)
+            while line
+            ;; Data lines read "CODE; NAME # comment"; skip blanks and # comments.
+            if (and (position #\; line) (char/= (char line 0) #\#))
+              do (add-jamo-information line table)))
+    (dotimes (sindex scount)
+      (let* ((l (+ lbase (floor sindex ncount)))
+             (v (+ vbase (floor (mod sindex ncount) tcount)))
+             (tee (+ tbase (mod sindex tcount)))
+             ;; TEE = TBASE means no trailing consonant: ~:[ then prints nothing.
+             (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
+                           (gethash l table) (gethash v table)
+                           (= tee tbase) (gethash tee table))))
+        (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
+
+(defun add-jamo-information (line table)
+  "Parse LINE, a \"CODE; NAME # comment\" entry, and map CODE to NAME in TABLE."
+  (let* ((split (split-string line #\;))
+         (code (parse-integer (first split) :radix 16))
+         (syllable (string-trim '(#\Space)
+                                (subseq (second split) 0 (position #\# (second split))))))
+    (setf (gethash code table) syllable)))
+
 (defun split-string (line character)
   (loop for prev-position = 0 then (1+ position)
      for position = (position character line :start prev-position)
diff --git a/version.lisp-expr b/version.lisp-expr
index aadb4c29a..379f07c7f 100644
--- a/version.lisp-expr
+++ b/version.lisp-expr
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"1.0.30.35"
+"1.0.30.36"