Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100755 468 lines (425 sloc) 17.499 kb
e7be80f New release 1.0.3
edi authored
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
2 ;;; $Header: /usr/local/cvsrep/flexi-streams/length.lisp,v 1.6 2008/05/29 10:25:14 edi Exp $
3
4 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
5
6 ;;; Redistribution and use in source and binary forms, with or without
7 ;;; modification, are permitted provided that the following conditions
8 ;;; are met:
9
10 ;;; * Redistributions of source code must retain the above copyright
11 ;;; notice, this list of conditions and the following disclaimer.
12
13 ;;; * Redistributions in binary form must reproduce the above
14 ;;; copyright notice, this list of conditions and the following
15 ;;; disclaimer in the documentation and/or other materials
16 ;;; provided with the distribution.
17
18 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
19 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
24 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 (in-package :flexi-streams)
31
32 (defgeneric encoding-factor (format)
33 (:documentation "Given an external format FORMAT, returns a factor
34 which denotes the octets to characters ratio to expect when
35 encoding/decoding. If the returned value is an integer, the factor is
36 assumed to be exact. If it is a \(double) float, the factor is
37 supposed to be based on heuristics and usually not exact.
38
39 This factor is used in string.lisp.")
40 (declare #.*standard-optimize-settings*))
41
42 (defmethod encoding-factor ((format flexi-8-bit-format))
43 (declare #.*standard-optimize-settings*)
44 ;; 8-bit encodings map octets to characters in an exact one-to-one
45 ;; fashion
46 1)
47
48 (defmethod encoding-factor ((format flexi-utf-8-format))
49 (declare #.*standard-optimize-settings*)
50 ;; UTF-8 characters can be anything from one to six octets, but we
51 ;; assume that the "overhead" is only about 5 percent - this
52 ;; estimate is obviously very much dependant on the content
53 1.05d0)
54
55 (defmethod encoding-factor ((format flexi-utf-16-format))
56 (declare #.*standard-optimize-settings*)
57 ;; usually one character maps to two octets, but characters with
58 ;; code points above #x10000 map to four octets - we assume that we
59 ;; usually don't see these characters but of course have to return a
60 ;; float
61 2.0d0)
62
63 (defmethod encoding-factor ((format flexi-utf-32-format))
64 (declare #.*standard-optimize-settings*)
65 ;; UTF-32 always matches every character to four octets
66 4)
67
68 (defmethod encoding-factor ((format flexi-crlf-mixin))
69 (declare #.*standard-optimize-settings*)
70 ;; if the sequence #\Return #\Linefeed is the line-end marker, this
71 ;; obviously makes encodings potentially longer and definitely makes
72 ;; the estimate unexact
73 (* 1.02d0 (call-next-method)))
74
75 (defgeneric check-end (format start end i)
76 (declare #.*fixnum-optimize-settings*)
77 (:documentation "Helper function used below to determine if we tried
78 to read past the end of the sequence.")
79 (:method (format start end i)
80 (declare #.*fixnum-optimize-settings*)
81 (declare (ignore start))
82 (declare (fixnum end i))
83 (when (> i end)
84 (signal-encoding-error format "This sequence can't be decoded ~
a63150f Fix typo
hans authored
85 using ~A as it is too short. ~A octet~:P missing at the end."
e7be80f New release 1.0.3
edi authored
86 (external-format-name format)
87 (- i end))))
88 (:method ((format flexi-utf-16-format) start end i)
89 (declare #.*fixnum-optimize-settings*)
90 (declare (fixnum start end i))
91 (declare (ignore i))
92 ;; don't warn twice
93 (when (evenp (- end start))
94 (call-next-method))))
95
96 (defgeneric compute-number-of-chars (format sequence start end)
97 (declare #.*standard-optimize-settings*)
98 (:documentation "Computes the exact number of characters required to
99 decode the sequence of octets in SEQUENCE from START to END using the
100 external format FORMAT."))
101
102 (defmethod compute-number-of-chars :around (format (list list) start end)
103 (declare #.*standard-optimize-settings*)
104 (call-next-method format (coerce list 'vector) start end))
105
106 (defmethod compute-number-of-chars ((format flexi-8-bit-format) sequence start end)
107 (declare #.*fixnum-optimize-settings*)
108 (declare (fixnum start end))
109 (declare (ignore sequence))
110 (- end start))
111
112 (defmethod compute-number-of-chars ((format flexi-crlf-mixin) sequence start end)
113 ;; this method only applies to the 8-bit formats as all other
114 ;; formats with CRLF line endings have their own specialized methods
115 ;; below
116 (declare #.*fixnum-optimize-settings*)
117 (declare (fixnum start end) (vector sequence))
118 (let ((i start)
119 (length (- end start)))
120 (declare (fixnum i length))
121 (loop
122 (when (>= i end)
123 (return))
124 (let ((position (search #.(vector +cr+ +lf+) sequence :start2 i :end2 end :test #'=)))
125 (unless position
126 (return))
127 (setq i (1+ position))
128 (decf length)))
129 length))
130
131 (defmethod compute-number-of-chars ((format flexi-utf-8-format) sequence start end)
132 (declare #.*fixnum-optimize-settings*)
133 (declare (fixnum start end) (vector sequence))
134 (let ((sum 0)
135 (i start))
136 (declare (fixnum i sum))
137 (loop
138 (when (>= i end)
139 (return))
140 (let* ((octet (aref sequence i))
141 ;; note that there are no validity checks here
142 (length (cond ((not (logbitp 7 octet)) 1)
143 ((= #b11000000 (logand* octet #b11100000)) 2)
144 ((= #b11100000 (logand* octet #b11110000)) 3)
145 (t 4))))
146 (declare (fixnum length) (type octet octet))
147 (incf sum)
148 (incf i length)))
149 (check-end format start end i)
150 sum))
151
152 (defmethod compute-number-of-chars ((format flexi-crlf-utf-8-format) sequence start end)
153 (declare #.*fixnum-optimize-settings*)
154 (declare (fixnum start end) (vector sequence))
155 (let ((sum 0)
156 (i start)
157 (last-octet 0))
158 (declare (fixnum i sum) (type octet last-octet))
159 (loop
160 (when (>= i end)
161 (return))
162 (let* ((octet (aref sequence i))
163 ;; note that there are no validity checks here
164 (length (cond ((not (logbitp 7 octet)) 1)
165 ((= #b11000000 (logand* octet #b11100000)) 2)
166 ((= #b11100000 (logand* octet #b11110000)) 3)
167 (t 4))))
168 (declare (fixnum length) (type octet octet))
169 (unless (and (= octet +lf+) (= last-octet +cr+))
170 (incf sum))
171 (incf i length)
172 (setq last-octet octet)))
173 (check-end format start end i)
174 sum))
175
176 (defmethod compute-number-of-chars :before ((format flexi-utf-16-format) sequence start end)
177 (declare #.*fixnum-optimize-settings*)
178 (declare (fixnum start end) (vector sequence))
179 (declare (ignore sequence))
180 (when (oddp (- end start))
181 (signal-encoding-error format "~A octet~:P cannot be decoded ~
182 using UTF-16 as ~:*~A is not even."
183 (- end start))))
184
185 (defmethod compute-number-of-chars ((format flexi-utf-16-le-format) sequence start end)
186 (declare #.*fixnum-optimize-settings*)
187 (declare (fixnum start end))
188 (let ((sum 0)
189 (i start))
190 (declare (fixnum i sum))
191 (decf end 2)
192 (loop
193 (when (> i end)
194 (return))
195 (let* ((high-octet (aref sequence (1+ i)))
196 (length (cond ((<= #xd8 high-octet #xdf) 4)
197 (t 2))))
198 (declare (fixnum length) (type octet high-octet))
199 (incf sum)
200 (incf i length)))
201 (check-end format start (+ end 2) i)
202 sum))
203
204 (defmethod compute-number-of-chars ((format flexi-utf-16-be-format) sequence start end)
205 (declare #.*fixnum-optimize-settings*)
206 (declare (fixnum start end) (vector sequence))
207 (let ((sum 0)
208 (i start))
209 (declare (fixnum i sum))
210 (decf end 2)
211 (loop
212 (when (> i end)
213 (return))
214 (let* ((high-octet (aref sequence i))
215 (length (cond ((<= #xd8 high-octet #xdf) 4)
216 (t 2))))
217 (declare (fixnum length) (type octet high-octet))
218 (incf sum)
219 (incf i length)))
220 (check-end format start (+ end 2) i)
221 sum))
222
223 (defmethod compute-number-of-chars ((format flexi-crlf-utf-16-le-format) sequence start end)
224 (declare #.*fixnum-optimize-settings*)
225 (declare (fixnum start end) (vector sequence))
226 (let ((sum 0)
227 (i start)
228 (last-octet 0))
229 (declare (fixnum i sum) (type octet last-octet))
230 (decf end 2)
231 (loop
232 (when (> i end)
233 (return))
234 (let* ((high-octet (aref sequence (1+ i)))
235 (length (cond ((<= #xd8 high-octet #xdf) 4)
236 (t 2))))
237 (declare (fixnum length) (type octet high-octet))
238 (unless (and (zerop high-octet)
239 (= (the octet (aref sequence i)) +lf+)
240 (= last-octet +cr+))
241 (incf sum))
242 (setq last-octet (if (zerop high-octet)
243 (aref sequence i)
244 0))
245 (incf i length)))
246 (check-end format start (+ end 2) i)
247 sum))
248
249 (defmethod compute-number-of-chars ((format flexi-crlf-utf-16-be-format) sequence start end)
250 (declare #.*fixnum-optimize-settings*)
251 (declare (fixnum start end) (vector sequence))
252 (let ((sum 0)
253 (i start)
254 (last-octet 0))
255 (declare (fixnum i sum) (type octet last-octet))
256 (decf end 2)
257 (loop
258 (when (> i end)
259 (return))
260 (let* ((high-octet (aref sequence i))
261 (length (cond ((<= #xd8 high-octet #xdf) 4)
262 (t 2))))
263 (declare (fixnum length) (type octet high-octet))
264 (unless (and (zerop high-octet)
265 (= (the octet (aref sequence (1+ i))) +lf+)
266 (= last-octet +cr+))
267 (incf sum))
268 (setq last-octet (if (zerop high-octet)
269 (aref sequence (1+ i))
270 0))
271 (incf i length)))
272 (check-end format start (+ end 2) i)
273 sum))
274
275 (defmethod compute-number-of-chars :before ((format flexi-utf-32-format) sequence start end)
276 (declare #.*fixnum-optimize-settings*)
277 (declare (fixnum start end))
278 (declare (ignore sequence))
279 (let ((length (- end start)))
280 (when (plusp (mod length 4))
281 (signal-encoding-error format "~A octet~:P cannot be decoded ~
282 using UTF-32 as ~:*~A is not a multiple-value of four."
283 length))))
284
285 (defmethod compute-number-of-chars ((format flexi-utf-32-format) sequence start end)
286 (declare #.*fixnum-optimize-settings*)
287 (declare (fixnum start end))
288 (declare (ignore sequence))
289 (ceiling (- end start) 4))
290
291 (defmethod compute-number-of-chars ((format flexi-crlf-utf-32-le-format) sequence start end)
292 (declare #.*fixnum-optimize-settings*)
293 (declare (fixnum start end) (vector sequence))
294 (let ((i start)
295 (length (ceiling (- end start) 4)))
296 (decf end 8)
297 (loop
298 (when (> i end)
299 (return))
300 (cond ((loop for j of-type fixnum from i
301 for octet across #.(vector +cr+ 0 0 0 +lf+ 0 0 0)
302 always (= octet (aref sequence j)))
303 (decf length)
304 (incf i 8))
305 (t (incf i 4))))
306 length))
307
308 (defmethod compute-number-of-chars ((format flexi-crlf-utf-32-be-format) sequence start end)
309 (declare #.*fixnum-optimize-settings*)
310 (declare (fixnum start end) (vector sequence))
311 (let ((i start)
312 (length (ceiling (- end start) 4)))
313 (decf end 8)
314 (loop
315 (when (> i end)
316 (return))
317 (cond ((loop for j of-type fixnum from i
318 for octet across #.(vector 0 0 0 +cr+ 0 0 0 +lf+)
319 always (= octet (aref sequence j)))
320 (decf length)
321 (incf i 8))
322 (t (incf i 4))))
323 length))
324
325 (defgeneric compute-number-of-octets (format sequence start end)
326 (declare #.*standard-optimize-settings*)
327 (:documentation "Computes the exact number of octets required to
328 encode the sequence of characters in SEQUENCE from START to END using
329 the external format FORMAT."))
330
331 (defmethod compute-number-of-octets :around (format (list list) start end)
332 (declare #.*standard-optimize-settings*)
333 (call-next-method format (coerce list 'string*) start end))
334
335 (defmethod compute-number-of-octets ((format flexi-8-bit-format) string start end)
336 (declare #.*fixnum-optimize-settings*)
337 (declare (fixnum start end))
338 (declare (ignore string))
339 (- end start))
340
341 (defmethod compute-number-of-octets ((format flexi-utf-8-format) string start end)
342 (declare #.*fixnum-optimize-settings*)
343 (declare (fixnum start end) (string string))
344 (let ((sum 0)
345 (i start))
346 (declare (fixnum i sum))
347 (loop
348 (when (>= i end)
349 (return))
350 (let* ((char-code (char-code (char string i)))
351 (char-length (cond ((< char-code #x80) 1)
352 ((< char-code #x800) 2)
353 ((< char-code #x10000) 3)
354 (t 4))))
355 (declare (fixnum char-length) (type char-code-integer char-code))
356 (incf sum char-length)
357 (incf i)))
358 sum))
359
360 (defmethod compute-number-of-octets ((format flexi-crlf-utf-8-format) string start end)
361 (declare #.*fixnum-optimize-settings*)
362 (declare (fixnum start end) (string string))
363 (let ((sum 0)
364 (i start))
365 (declare (fixnum i sum))
366 (loop
367 (when (>= i end)
368 (return))
369 (let* ((char-code (char-code (char string i)))
370 (char-length (cond ((= char-code #.(char-code #\Newline)) 2)
371 ((< char-code #x80) 1)
372 ((< char-code #x800) 2)
373 ((< char-code #x10000) 3)
374 (t 4))))
375 (declare (fixnum char-length) (type char-code-integer char-code))
376 (incf sum char-length)
377 (incf i)))
378 sum))
379
380 (defmethod compute-number-of-octets ((format flexi-utf-16-format) string start end)
381 (declare #.*fixnum-optimize-settings*)
382 (declare (fixnum start end) (string string))
383 (let ((sum 0)
384 (i start))
385 (declare (fixnum i sum))
386 (loop
387 (when (>= i end)
388 (return))
389 (let* ((char-code (char-code (char string i)))
390 (char-length (cond ((< char-code #x10000) 2)
391 (t 4))))
392 (declare (fixnum char-length) (type char-code-integer char-code))
393 (incf sum char-length)
394 (incf i)))
395 sum))
396
397 (defmethod compute-number-of-octets ((format flexi-crlf-utf-16-le-format) string start end)
398 (declare #.*fixnum-optimize-settings*)
399 (declare (fixnum start end) (string string))
400 (let ((sum 0)
401 (i start))
402 (declare (fixnum i sum))
403 (loop
404 (when (>= i end)
405 (return))
406 (let* ((char-code (char-code (char string i)))
407 (char-length (cond ((= char-code #.(char-code #\Newline)) 4)
408 ((< char-code #x10000) 2)
409 (t 4))))
410 (declare (fixnum char-length) (type char-code-integer char-code))
411 (incf sum char-length)
412 (incf i)))
413 sum))
414
415 (defmethod compute-number-of-octets ((format flexi-crlf-utf-16-be-format) string start end)
416 (declare #.*fixnum-optimize-settings*)
417 (declare (fixnum start end) (string string))
418 (let ((sum 0)
419 (i start))
420 (declare (fixnum i sum))
421 (loop
422 (when (>= i end)
423 (return))
424 (let* ((char-code (char-code (char string i)))
425 (char-length (cond ((= char-code #.(char-code #\Newline)) 4)
426 ((< char-code #x10000) 2)
427 (t 4))))
428 (declare (fixnum char-length) (type char-code-integer char-code))
429 (incf sum char-length)
430 (incf i)))
431 sum))
432
433 (defmethod compute-number-of-octets ((format flexi-utf-32-format) string start end)
434 (declare #.*fixnum-optimize-settings*)
435 (declare (fixnum start end))
436 (declare (ignore string))
437 (* 4 (- end start)))
438
439 (defmethod compute-number-of-octets ((format flexi-crlf-mixin) string start end)
440 (declare #.*fixnum-optimize-settings*)
441 (declare (fixnum start end) (string string))
442 (+ (call-next-method)
443 (* (case (external-format-name format)
444 (:utf-32 4)
445 (otherwise 1))
446 (count #\Newline string :start start :end end :test #'char=))))
447
448 (defgeneric character-length (format char)
449 (declare #.*fixnum-optimize-settings*)
450 (:documentation "Returns the number of octets needed to encode the
451 single character CHAR.")
452 (:method (format char)
453 (compute-number-of-octets format (string char) 0 1)))
454
455 (defmethod character-length :around ((format flexi-crlf-mixin) (char (eql #\Newline)))
456 (declare #.*fixnum-optimize-settings*)
457 (+ (call-next-method format +cr+)
458 (call-next-method format +lf+)))
459
460 (defmethod character-length ((format flexi-8-bit-format) char)
461 (declare #.*fixnum-optimize-settings*)
462 (declare (ignore char))
463 1)
464
465 (defmethod character-length ((format flexi-utf-32-format) char)
466 (declare #.*fixnum-optimize-settings*)
467 (declare (ignore char))
468 4)
Something went wrong with that request. Please try again.