/
utf_conv.bas
259 lines (217 loc) · 8.39 KB
/
utf_conv.bas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#include "fbcunit.bi"
#include once "utf_conv.bi"
#include once "crt/string.bi"
SUITE( fbc_tests.wstring_.utf_conv )
TEST( zstringToUTF )
	'' Round-trip a zstring holding non-ASCII characters through UTF-8 and
	'' check that the text converted back compares equal to the original.
	dim as zstring ptr srcstr = @"ã é ô"
	dim as integer nbytes

	'' Encode with a NULL destination; the length passed is len + 1.
	dim as byte ptr encoded = CharToUTF( UTF_ENCOD_UTF8, srcstr, len( *srcstr ) + 1, NULL, @nbytes )

	'' Decode again, also with a NULL destination.
	dim as zstring ptr newstr = UTFToChar( UTF_ENCOD_UTF8, encoded, NULL, @nbytes )

	CU_ASSERT( *newstr = *srcstr )

	'' Both returned buffers are released by the test.
	deallocate( newstr )
	deallocate( encoded )
END_TEST
TEST( wstringToUTF )
	'' Round-trip a wstring holding non-ASCII characters through UTF-8 and
	'' check that the text converted back compares equal to the original.
	dim as wstring ptr srcstr = @wstr("ã é ô")
	dim as integer nbytes

	'' Encode with a NULL destination; the length passed is len + 1.
	dim as byte ptr encoded = WCharToUTF( UTF_ENCOD_UTF8, srcstr, len( *srcstr ) + 1, NULL, @nbytes )

	'' Decode again, also with a NULL destination.
	dim as wstring ptr newstr = UTFToWChar( UTF_ENCOD_UTF8, encoded, NULL, @nbytes )

	CU_ASSERT( *newstr = *srcstr )

	'' Both returned buffers are released by the test.
	deallocate( newstr )
	deallocate( encoded )
END_TEST
'' When the destination is NULL, every WCharToUTF conversion case is expected
'' to allocate its own result buffer, which the caller must then deallocate.
TEST( wstringToNullDest )
	dim w as wstring * 10 = "123"
	dim as integer outbytes

	'' UTF-8: the result must not be the source buffer itself
	dim as ubyte ptr utf8 = WCharToUTF( UTF_ENCOD_UTF8, w, len( w ), NULL, @outbytes )
	CU_ASSERT( cuint( utf8 ) <> cuint( @w ) )
	deallocate( utf8 )

	'' UTF-16
	dim as ushort ptr utf16 = WCharToUTF( UTF_ENCOD_UTF16, w, len( w ), NULL, @outbytes )
	CU_ASSERT( cuint( utf16 ) <> cuint( @w ) )
	deallocate( utf16 )

	'' UTF-32
	dim as ulong ptr utf32 = WCharToUTF( UTF_ENCOD_UTF32, w, len( w ), NULL, @outbytes )
	CU_ASSERT( cuint( utf32 ) <> cuint( @w ) )
	deallocate( utf32 )
END_TEST
'' When the destination is NULL, every CharToUTF conversion case is expected
'' to allocate its own result buffer, which the caller must then deallocate.
TEST( zstringToNullDest )
	dim z as zstring * 10 = "123"
	dim as integer outbytes

	'' UTF-8: the result must not be the source buffer itself
	dim as ubyte ptr utf8 = CharToUTF( UTF_ENCOD_UTF8, z, len( z ), NULL, @outbytes )
	CU_ASSERT( cuint( utf8 ) <> cuint( @z ) )
	deallocate( utf8 )

	'' UTF-16
	dim as ushort ptr utf16 = CharToUTF( UTF_ENCOD_UTF16, z, len( z ), NULL, @outbytes )
	CU_ASSERT( cuint( utf16 ) <> cuint( @z ) )
	deallocate( utf16 )

	'' UTF-32
	dim as ulong ptr utf32 = CharToUTF( UTF_ENCOD_UTF32, z, len( z ), NULL, @outbytes )
	CU_ASSERT( cuint( utf32 ) <> cuint( @z ) )
	deallocate( utf32 )
END_TEST
'' Exercises WCharToUTF and UTFToWChar with a NULL destination, once per
'' encoding (UTF-8, UTF-16, UTF-32).
'' The rtlib has UTF encoding routines that can allocate the result for us.
'' That version pre-allocates a few chars at a time (currently typically 8),
'' so the test string must be longer than that to check it works correctly.
TEST( UTFAndWCharNullDest )
	const TEXT = "testing with a long, but simple 7-bit ASCII string"

	dim src as wstring * 100 = wstr( TEXT )

	'' Sanity-check the wide copy of the test text first
	CU_ASSERT( len( src ) = len( TEXT ) )
	CU_ASSERT( src = TEXT )

	'' ---- UTF-8 ----
	scope
		'' Encode, passing len + 1 so the null terminator is included
		dim as integer utf8bytes
		dim as ubyte ptr utf8 = WCharToUTF( UTF_ENCOD_UTF8, src, len( src ) + 1, NULL, @utf8bytes )

		CU_ASSERT( utf8bytes = (len( TEXT ) + 1) )
		CU_ASSERT( *cptr( zstring ptr, utf8 ) = TEXT )

		'' Decode back into a wstring
		dim as integer chars
		dim as wstring ptr w = UTFToWChar( UTF_ENCOD_UTF8, utf8, NULL, @chars )

		CU_ASSERT( chars = len( TEXT ) )
		CU_ASSERT( *w = src )

		deallocate( utf8 )
		deallocate( w )
	end scope

	'' ---- UTF-16 ----
	scope
		'' Encode, passing len + 1 so the null terminator is included
		dim as integer utf16bytes
		dim as ushort ptr utf16 = WCharToUTF( UTF_ENCOD_UTF16, src, len( src ) + 1, NULL, @utf16bytes )

		'' Number of 16-bit units reported by the encoder
		dim as integer utf16units = utf16bytes \ 2

		CU_ASSERT( (utf16bytes mod 2) = 0 )
		CU_ASSERT( (utf16bytes \ 2) = (len( TEXT ) + 1) )

		'' Each unit must match the corresponding source character
		for i as integer = 0 to utf16units - 1
			CU_ASSERT( utf16[i] = src[i] )
		next

		'' Decode back into a wstring
		dim as integer chars
		dim as wstring ptr w = UTFToWChar( UTF_ENCOD_UTF16, utf16, NULL, @chars )

		CU_ASSERT( chars = len( TEXT ) )
		CU_ASSERT( *w = src )

		deallocate( utf16 )
		deallocate( w )
	end scope

	'' ---- UTF-32 ----
	scope
		'' Encode, passing len + 1 so the null terminator is included
		dim as integer utf32bytes
		dim as ulong ptr utf32 = WCharToUTF( UTF_ENCOD_UTF32, src, len( src ) + 1, NULL, @utf32bytes )

		'' Number of 32-bit units reported by the encoder
		dim as integer utf32units = utf32bytes \ 4

		CU_ASSERT( (utf32bytes mod 4) = 0 )
		CU_ASSERT( (utf32bytes \ 4) = (len( TEXT ) + 1) )

		'' Each unit must match the corresponding source character
		for i as integer = 0 to utf32units - 1
			CU_ASSERT( utf32[i] = src[i] )
		next

		'' Decode back into a wstring
		dim as integer chars
		dim as wstring ptr w = UTFToWChar( UTF_ENCOD_UTF32, utf32, NULL, @chars )

		CU_ASSERT( chars = len( TEXT ) )
		CU_ASSERT( *w = src )

		deallocate( utf32 )
		deallocate( w )
	end scope
END_TEST
'' Exercises WCharToUTF and UTFToWChar with a caller-supplied destination
'' buffer, once per encoding (UTF-8, UTF-16, UTF-32).
'' Not covered here: a destination buffer too short to hold the output string
'' (which should work, but result in no null terminator).
TEST( UTFAndWCharPreallocDest )
	const TEXT = "testing with a long, but simple 7-bit ASCII string"

	dim src as wstring * 100 = wstr( TEXT )

	'' Buffers are shorter than the input buffer, but longer than the input
	'' string. Both are zero-filled by callocate and shared by all three cases.
	dim as integer buflen = (len( src ) + 2) * 4
	dim as ubyte ptr utfbuf = callocate( buflen )
	dim as ubyte ptr wbuf = callocate( buflen )

	'' ---- UTF-8 ----
	scope
		'' Encode into utfbuf, passing len + 1 to include the null terminator
		dim as integer utf8bytes
		dim as ubyte ptr utf8 = WCharToUTF( UTF_ENCOD_UTF8, src, len( src ) + 1, utfbuf, @utf8bytes )

		'' The supplied buffer itself must be returned
		CU_ASSERT( utf8 = utfbuf )
		CU_ASSERT( utf8bytes = (len( TEXT ) + 1) )
		CU_ASSERT( *cptr( zstring ptr, utf8 ) = TEXT )

		'' Decode back into wbuf; chars carries the input buffer length in
		'' characters on entry.
		'' NOTE(review): buflen is a byte count, so this looks larger than the
		'' number of wstring characters wbuf can hold - confirm it is intended.
		dim as integer chars = buflen
		dim as wstring ptr w = UTFToWChar( UTF_ENCOD_UTF8, utf8, wbuf, @chars )

		CU_ASSERT( w = wbuf )
		CU_ASSERT( chars = len( TEXT ) )
		CU_ASSERT( *w = src )
	end scope

	'' ---- UTF-16 ----
	scope
		'' Clear what the previous case left behind, then encode into utfbuf
		memset( utfbuf, 0, buflen )
		dim as integer utf16bytes
		dim as ushort ptr utf16 = WCharToUTF( UTF_ENCOD_UTF16, src, len( src ) + 1, utfbuf, @utf16bytes )

		'' Number of 16-bit units reported by the encoder
		dim as integer utf16units = utf16bytes \ 2

		CU_ASSERT( utf16 = cptr( ushort ptr, utfbuf ) )
		CU_ASSERT( (utf16bytes mod 2) = 0 )
		CU_ASSERT( (utf16bytes \ 2) = (len( TEXT ) + 1) )

		'' Each unit must match the corresponding source character
		for i as integer = 0 to utf16units - 1
			CU_ASSERT( utf16[i] = src[i] )
		next

		'' Decode back into a cleared wbuf; chars carries the input buffer
		'' length in characters on entry.
		'' NOTE(review): presumably exceeds wbuf's capacity where a wstring
		'' character is wider than 2 bytes - confirm.
		memset( wbuf, 0, buflen )
		dim as integer chars = buflen \ 2
		dim as wstring ptr w = UTFToWChar( UTF_ENCOD_UTF16, utf16, wbuf, @chars )

		CU_ASSERT( w = wbuf )
		CU_ASSERT( chars = len( TEXT ) )
		CU_ASSERT( *w = src )
	end scope

	'' ---- UTF-32 ----
	scope
		'' Clear what the previous case left behind, then encode into utfbuf
		memset( utfbuf, 0, buflen )
		dim as integer utf32bytes
		dim as ulong ptr utf32 = WCharToUTF( UTF_ENCOD_UTF32, src, len( src ) + 1, utfbuf, @utf32bytes )

		'' Number of 32-bit units reported by the encoder
		dim as integer utf32units = utf32bytes \ 4

		CU_ASSERT( utf32 = cptr( ulong ptr, utfbuf ) )
		CU_ASSERT( (utf32bytes mod 4) = 0 )
		CU_ASSERT( (utf32bytes \ 4) = (len( TEXT ) + 1) )

		'' Each unit must match the corresponding source character
		for i as integer = 0 to utf32units - 1
			CU_ASSERT( utf32[i] = src[i] )
		next

		'' Decode back into a cleared wbuf; chars carries the input buffer
		'' length in characters on entry.
		memset( wbuf, 0, buflen )
		dim as integer chars = buflen \ 4
		dim as wstring ptr w = UTFToWChar( UTF_ENCOD_UTF32, utf32, wbuf, @chars )

		CU_ASSERT( w = wbuf )
		CU_ASSERT( chars = len( TEXT ) )
		CU_ASSERT( *w = src )
	end scope

	deallocate( utfbuf )
	deallocate( wbuf )
END_TEST
TEST( buildUtf16SurrogatePair )
	'' Checks that code point U+292B1, which lies outside the BMP, is encoded
	'' as the UTF-16 surrogate pair D864 DEB1.
	''
	'' The conversion that produces UTF-16 differs per platform branch:
	''  - __FB_LINUX__: a one-character wstring is encoded with WCharToUTF
	''  - __FB_WIN32__: a UTF-32 array is decoded with UTFToWChar
	'' On any other target the test body is empty.
	''
	'' Both branches pass a NULL destination, so the returned buffer is
	'' released at the end of the branch, as the other NULL-destination tests
	'' in this suite do.
	#ifdef __FB_LINUX__
		dim utf32 as wstring * 2
		utf32[0] = &h292B1

		'' Encode exactly one source character, no null terminator
		dim utf16bytes as integer
		dim utf16 as ushort ptr = WCharToUTF( UTF_ENCOD_UTF16, @utf32, 1, NULL, @utf16bytes )

		CU_ASSERT( utf16bytes = sizeof( ushort ) * 2 )
		CU_ASSERT( utf16[0] = &hD864 )
		CU_ASSERT( utf16[1] = &hDEB1 )

		'' Release the result buffer (previously leaked)
		deallocate( utf16 )
	#elseif defined( __FB_WIN32__ )
		'' utf32(1) is never assigned and so stays 0
		dim utf32(0 to 1) as ulong
		utf32(0) = &h292B1

		dim utf16chars as integer
		dim utf16 as wstring ptr = UTFToWChar( UTF_ENCOD_UTF32, @utf32(0), NULL, @utf16chars )

		CU_ASSERT( utf16chars = 2 )
		CU_ASSERT( (*utf16)[0] = &hD864 )
		CU_ASSERT( (*utf16)[1] = &hDEB1 )

		'' Release the result buffer (previously leaked)
		deallocate( utf16 )
	#endif
END_TEST
'' When the destination buffer has no room for a complete surrogate pair, FB
'' currently writes only the low surrogate. That is pretty weird, since in an
'' array of 16-bit units the low surrogate normally comes behind (at a higher
'' address than) the high surrogate.
TEST( buildPartialUtf16SurrogatePair )
	#if defined( __FB_WIN32__ )
		'' U+292B1 followed by a 0 element
		dim utf32(0 to 1) as ulong = { &h292B1, 0 }

		'' Destination with a stated capacity of a single character
		dim utf16 as wstring * 2
		dim utf16chars as integer = 1

		UTFToWChar( UTF_ENCOD_UTF32, @utf32(0), @utf16, @utf16chars )

		'' One unit reported, and it is the low surrogate
		CU_ASSERT( utf16chars = 1 )
		CU_ASSERT( utf16[0] = &hDEB1 )
		CU_ASSERT( utf16[1] = 0 )
	#endif
END_TEST
END_SUITE