Skip to content

Commit

Permalink
1.0.32.21: compress most unibyte-external-format definitions
Browse files Browse the repository at this point in the history
All the unibyte-mapper-based external-formats had huge amounts of
cut-and-pasted code, differing only in names of functions.  This is,
oddly enough, a clear case for abstracting away the repeated code into a
macro.

In the process, convert them to the multibyte apparatus, which has support
for the nice restarts, and remove the too-simple unibyte
DEFINE-EXTERNAL-FORMAT (and EXTERNAL-FORMAT-DECODING-ERROR) which are now
unused.

Include a far-from-comprehensive set of tests, which are mostly for
iso-8859-x formats
  • Loading branch information
csrhodes committed Nov 11, 2009
1 parent 314ebe7 commit a18894d
Show file tree
Hide file tree
Showing 10 changed files with 628 additions and 1,892 deletions.
20 changes: 20 additions & 0 deletions OPTIMIZATIONS
Original file line number Diff line number Diff line change
Expand Up @@ -405,3 +405,23 @@ can be transformed into
which allows compiler-macro-expansion for FOO. (Only constant
arguments can be moved inside the new lambda -- otherwise evaluation
order is altered.)

--------------------------------------------------------------------------------
#41

The unibyte external formats are written in a very generic way. Three
optimizations immediately applicable that could be automatically
generated:

(a) if the external format merely permutes the first 256 characters, a
constant-time lookup (rather than a binary search) could be
performed on output. This applies at least to EBCDIC, which
currently has a hand-rolled mapper instead.

(b) if there are no undefined characters corresponding to the 256
codes, then no error checking need be done on input.

(c) if there is a way to use particular bits of the exceptional
characters, constant-time output (rather than binary search) can
still be achieved as used to be done by the latin-9 external
format before 1.0.31.
12 changes: 6 additions & 6 deletions src/code/external-formats/enc-basic.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,14 @@
finally (return (coerce string 'simple-string))))))))
(instantiate-octets-definition define-ascii->string)

(define-external-format (:ascii :us-ascii :ansi_x3.4-1968
:iso-646 :iso-646-us :|646|)
1 t
(define-unibyte-external-format :ascii
(:us-ascii :ansi_x3.4-1968 :iso-646 :iso-646-us :|646|)
(if (>= bits 128)
(external-format-encoding-error stream bits)
(setf (sap-ref-8 sap tail) bits))
(code-char byte)
(if (>= byte 128)
(return-from decode-break-reason 1)
(code-char byte))
ascii->string-aref
string->ascii)

Expand Down Expand Up @@ -101,8 +102,7 @@
;;; Multiple names for the :ISO{,-}8859-* families are needed because on
;;; FreeBSD (and maybe other BSD systems), nl_langinfo("LATIN-1") will
;;; return "ISO8859-1" instead of "ISO-8859-1".
(define-external-format (:latin-1 :latin1 :iso-8859-1 :iso8859-1)
1 t
(define-unibyte-external-format :latin-1 (:latin1 :iso-8859-1 :iso8859-1)
(if (>= bits 256)
(external-format-encoding-error stream bits)
(setf (sap-ref-8 sap tail) bits))
Expand Down
132 changes: 3 additions & 129 deletions src/code/external-formats/enc-cyr.lisp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
(in-package "SB!IMPL")

(define-unibyte-mapper koi8-r->code-mapper code->koi8-r-mapper
(define-unibyte-mapping-external-format :koi8-r (:|koi8-r|)
(#x80 #x2500) ; BOX DRAWINGS LIGHT HORIZONTAL
(#x81 #x2502) ; BOX DRAWINGS LIGHT VERTICAL
(#x82 #x250C) ; BOX DRAWINGS LIGHT DOWN AND RIGHT
Expand Down Expand Up @@ -131,49 +131,7 @@
(#xFF #x042A) ; CYRILLIC CAPITAL LETTER HARD SIGN
)

(declaim (inline get-koi8-r-bytes))
(defun get-koi8-r-bytes (string pos)
(declare (optimize speed (safety 0))
(type simple-string string)
(type array-range pos))
(get-latin-bytes #'code->koi8-r-mapper :koi8-r string pos))

(defun string->koi8-r (string sstart send null-padding)
(declare (optimize speed (safety 0))
(type simple-string string)
(type array-range sstart send))
(values (string->latin% string sstart send #'get-koi8-r-bytes null-padding)))

(defmacro define-koi8-r->string* (accessor type)
(declare (ignore type))
(let ((name (make-od-name 'koi8-r->string* accessor)))
`(progn
(defun ,name (string sstart send array astart aend)
(,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'koi8-r->code-mapper)))))

(instantiate-octets-definition define-koi8-r->string*)

(defmacro define-koi8-r->string (accessor type)
(declare (ignore type))
`(defun ,(make-od-name 'koi8-r->string accessor) (array astart aend)
(,(make-od-name 'latin->string accessor) array astart aend #'koi8-r->code-mapper)))

(instantiate-octets-definition define-koi8-r->string)

(define-external-format (:koi8-r :|koi8-r|)
1 t
(let ((koi8-r-byte (code->koi8-r-mapper bits)))
(if koi8-r-byte
(setf (sap-ref-8 sap tail) koi8-r-byte)
(external-format-encoding-error stream bits)))
(let ((code (koi8-r->code-mapper byte)))
(if code
(code-char code)
(external-format-decoding-error stream byte)))
koi8-r->string-aref
string->koi8-r) ;; TODO -- error check

(define-unibyte-mapper koi8-u->code-mapper code->koi8-u-mapper
(define-unibyte-mapping-external-format :koi8-u (:|koi8-u|)
(#x80 #x2500) ; BOX DRAWINGS LIGHT HORIZONTAL
(#x81 #x2502) ; BOX DRAWINGS LIGHT VERTICAL
(#x82 #x250C) ; BOX DRAWINGS LIGHT DOWN AND RIGHT
Expand Down Expand Up @@ -304,49 +262,7 @@
(#xFF #x042A) ; CYRILLIC CAPITAL LETTER HARD SIGN
)

(declaim (inline get-koi8-u-bytes))
(defun get-koi8-u-bytes (string pos)
(declare (optimize speed (safety 0))
(type simple-string string)
(type array-range pos))
(get-latin-bytes #'code->koi8-u-mapper :koi8-u string pos))

(defun string->koi8-u (string sstart send null-padding)
(declare (optimize speed (safety 0))
(type simple-string string)
(type array-range sstart send))
(values (string->latin% string sstart send #'get-koi8-u-bytes null-padding)))

(defmacro define-koi8-u->string* (accessor type)
(declare (ignore type))
(let ((name (make-od-name 'koi8-u->string* accessor)))
`(progn
(defun ,name (string sstart send array astart aend)
(,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'koi8-u->code-mapper)))))

(instantiate-octets-definition define-koi8-u->string*)

(defmacro define-koi8-u->string (accessor type)
(declare (ignore type))
`(defun ,(make-od-name 'koi8-u->string accessor) (array astart aend)
(,(make-od-name 'latin->string accessor) array astart aend #'koi8-u->code-mapper)))

(instantiate-octets-definition define-koi8-u->string)

(define-external-format (:koi8-u :|koi8-u|)
1 t
(let ((koi8-u-byte (code->koi8-u-mapper bits)))
(if koi8-u-byte
(setf (sap-ref-8 sap tail) koi8-u-byte)
(external-format-encoding-error stream bits)))
(let ((code (koi8-u->code-mapper byte)))
(if code
(code-char code)
(external-format-decoding-error stream byte)))
koi8-u->string-aref
string->koi8-u) ;; TODO -- error check

(define-unibyte-mapper x-mac-cyrillic->code-mapper code->x-mac-cyrillic-mapper
(define-unibyte-mapping-external-format :x-mac-cyrillic (:|x-mac-cyrillic|)
(#x80 #x0410) ; CYRILLIC CAPITAL LETTER A
(#x81 #x0411) ; CYRILLIC CAPITAL LETTER BE
(#x82 #x0412) ; CYRILLIC CAPITAL LETTER VE
Expand Down Expand Up @@ -471,45 +387,3 @@
(#xFE #x044E) ; CYRILLIC SMALL LETTER YU
(#xFF #x00A4) ; CURRENCY SIGN
)

(declaim (inline get-x-mac-cyrillic-bytes))
(defun get-x-mac-cyrillic-bytes (string pos)
(declare (optimize speed (safety 0))
(type simple-string string)
(type array-range pos))
(get-latin-bytes #'code->x-mac-cyrillic-mapper :x-mac-cyrillic string pos))

(defun string->x-mac-cyrillic (string sstart send null-padding)
(declare (optimize speed (safety 0))
(type simple-string string)
(type array-range sstart send))
(values (string->latin% string sstart send #'get-x-mac-cyrillic-bytes null-padding)))

(defmacro define-x-mac-cyrillic->string* (accessor type)
(declare (ignore type))
(let ((name (make-od-name 'x-mac-cyrillic->string* accessor)))
`(progn
(defun ,name (string sstart send array astart aend)
(,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'x-mac-cyrillic->code-mapper)))))

(instantiate-octets-definition define-x-mac-cyrillic->string*)

(defmacro define-x-mac-cyrillic->string (accessor type)
(declare (ignore type))
`(defun ,(make-od-name 'x-mac-cyrillic->string accessor) (array astart aend)
(,(make-od-name 'latin->string accessor) array astart aend #'x-mac-cyrillic->code-mapper)))

(instantiate-octets-definition define-x-mac-cyrillic->string)

(define-external-format (:x-mac-cyrillic :|x-mac-cyrillic|)
1 t
(let ((x-mac-cyrillic-byte (code->x-mac-cyrillic-mapper bits)))
(if x-mac-cyrillic-byte
(setf (sap-ref-8 sap tail) x-mac-cyrillic-byte)
(external-format-encoding-error stream bits)))
(let ((code (x-mac-cyrillic->code-mapper byte)))
(if code
(code-char code)
(external-format-decoding-error stream byte)))
x-mac-cyrillic->string-aref
string->x-mac-cyrillic) ;; TODO -- error check
Loading

0 comments on commit a18894d

Please sign in to comment.