Skip to content

Commit

Permalink
Klarna fixes - in particular support for ISO 8859-15
Browse files Browse the repository at this point in the history
  • Loading branch information
willemdj committed Sep 2, 2011
1 parent 609ed2d commit 326e53e
Show file tree
Hide file tree
Showing 10 changed files with 3,133 additions and 970 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*.swp
*.beam
config.log
config.status
include.mk
2,376 changes: 1,415 additions & 961 deletions doc/erlsom.htm

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions src/erlsom_lib.erl
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,8 @@ detect_encoding3(Variables) ->
utf8;
'iso-8859-1' ->
iso_8859_1;
'iso-8859-15' ->
iso_8859_15;
_ -> throw({error, "Encoding " ++ Encoding ++ " not supported"})
end;
_ ->
Expand All @@ -601,10 +603,10 @@ encoding_type(Cs) when is_list(Cs) ->
"iso_8859_1" -> 'iso-8859-1';
"iso_8859-1" -> 'iso-8859-1';
"iso8859-1" -> 'iso-8859-1';
"iso-8859-15" -> 'iso-8859-1';
"iso_8859_15" -> 'iso-8859-1';
"iso_8859-15" -> 'iso-8859-1';
"iso8859-15" -> 'iso-8859-1';
"iso-8859-15" -> 'iso-8859-15';
"iso_8859_15" -> 'iso-8859-15';
"iso_8859-15" -> 'iso-8859-15';
"iso8859-15" -> 'iso-8859-15';
"utf-8" -> 'utf-8';
"utf_8" -> 'utf-8';
_ -> false
Expand Down
2 changes: 2 additions & 0 deletions src/erlsom_sax.erl
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ parseDocumentBinary(Encoding, Xml, State) ->
erlsom_sax_latin1:parse(Xml, State);
'iso_8859_1' ->
erlsom_sax_latin1:parse(Xml, State);
'iso_8859_15' ->
erlsom_sax_latin9:parse(Xml, State);
'list' ->
erlsom_sax_list:parse(Xml, State);
_ ->
Expand Down
63 changes: 62 additions & 1 deletion src/erlsom_sax_latin1.erl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
%% this file exists several times, but with different names:
%% erlsom_sax_utf8, erlsom_sax_latin1 etc.
%% The only difference between the contents of these files is the definition below:
%% it can be UTF8, LAT1, U16B or U16L. (The names have been chosen so that the
%% it can be UTF8, LAT1, LAT9, U16B or U16L. (The names have been chosen so that the
%% number of bytes in the file will be the same in either case, so that it is
%% easy to see whether the files are the same, although this check is obviously
%% rather primitive.)
Expand Down Expand Up @@ -129,6 +129,30 @@
-define(BOM3, no_match2).
-endif.

%% Configuration that is active when the file is compiled with the LAT9 macro
%% defined. The module is then named erlsom_sax_latin9 and the string macros
%% below expand to binary patterns with one default (8-bit integer) segment
%% per character.
-ifdef(LAT9).
-module(erlsom_sax_latin9).
%% Flags this variant as working on binaries.
%% NOTE(review): presumably tested with -ifdef(BINARY) elsewhere in the file -
%% not visible in this section, confirm.
-define(BINARY, true).
%% STRn(X1..Xn): a binary consisting of exactly n segments.
-define(STR1(X), <<X>>).
-define(STR2(X1, X2), <<X1, X2>>).
-define(STR3(X1, X2, X3), <<X1, X2, X3>>).
-define(STR4(X1, X2, X3, X4), <<X1, X2, X3, X4>>).
-define(STR5(X1, X2, X3, X4, X5), <<X1, X2, X3, X4, X5>>).
-define(STR6(X1, X2, X3, X4, X5, X6), <<X1, X2, X3, X4, X5, X6>>).
-define(STR7(X1, X2, X3, X4, X5, X6, X7), <<X1, X2, X3, X4, X5, X6, X7>>).
-define(STR8(X1, X2, X3, X4, X5, X6, X7, X8), <<X1, X2, X3, X4, X5, X6, X7, X8>>).
%% DONTCARE_T(Y): one ignored leading segment, with the rest bound to Y.
-define(DONTCARE_T(Y), <<_, Y/binary>>).
%% STRn_T(X1..Xn, Y): n leading segments followed by the remaining binary Y
%% (the "tail").
-define(STR1_T(X, Y), <<X, Y/binary>>).
-define(STR2_T(X1, X2, Y), <<X1, X2, Y/binary>>).
-define(STR3_T(X1, X2, X3, Y), <<X1, X2, X3, Y/binary>>).
-define(STR4_T(X1, X2, X3, X4, Y), <<X1, X2, X3, X4, Y/binary>>).
-define(STR7_T(X1, X2, X3, X4, X5, X6, X7, Y), <<X1, X2, X3, X4, X5, X6, X7, Y/binary>>).
-define(STR8_T(X1, X2, X3, X4, X5, X6, X7, X8, Y), <<X1, X2, X3, X4, X5, X6, X7, X8, Y/binary>>).
-define(STR9_T(X1, X2, X3, X4, X5, X6, X7, X8, X9, Y), <<X1, X2, X3, X4, X5, X6, X7, X8, X9, Y/binary>>).
%% The byte order mark macros expand to terms built from the atoms no_match and
%% no_match2 rather than to binary patterns.
%% NOTE(review): presumably this keeps BOM-handling clauses from ever matching
%% binary input in this single-byte encoding - confirm against the code that
%% uses BOM1/BOM2/BOM3.
-define(BOM1(X), [no_match | X]).
-define(BOM2, no_match).
-define(BOM3, no_match2).
-endif.

-ifdef(LIST).
-module(erlsom_sax_list).
-define(EMPTY, []).
Expand Down Expand Up @@ -352,6 +376,33 @@ decodeChar(Tail, State) ->
end.
-endif.

-ifdef(LAT9).
%% Reads one ISO 8859-15 character from the front of Tail.
%% Returns {CodePoint, Rest, State}, where CodePoint is the byte translated by
%% latin9toUnicode/1 and Rest is the unconsumed input.
%% When Tail matches ?EMPTY the work is delegated to ?CF3, passing this
%% function so that it can be re-entered.
%% NOTE(review): ?CF3 is defined outside this section; presumably it asks the
%% continuation function for more input - confirm.
decodeChar(Tail, State) ->
  case Tail of
    ?EMPTY ->
      ?CF3(Tail, State, fun decodeChar/2);
    ?STR1_T(Byte, Rest) ->
      CodePoint = latin9toUnicode(Byte),
      {CodePoint, Rest, State}
  end.

%% Translates a single ISO 8859-15 (Latin-9) character value to its Unicode
%% code point. Only the eight positions listed below are remapped; every other
%% value is returned unchanged.
latin9toUnicode(Char) ->
  case Char of
    16#A4 -> 16#20AC; % EURO SIGN
    16#A6 -> 16#0160; % LATIN CAPITAL LETTER S WITH CARON
    16#A8 -> 16#0161; % LATIN SMALL LETTER S WITH CARON
    16#B4 -> 16#017D; % LATIN CAPITAL LETTER Z WITH CARON
    16#B8 -> 16#017E; % LATIN SMALL LETTER Z WITH CARON
    16#BC -> 16#0152; % LATIN CAPITAL LIGATURE OE
    16#BD -> 16#0153; % LATIN SMALL LIGATURE OE
    16#BE -> 16#0178; % LATIN CAPITAL LETTER Y WITH DIAERESIS
    _ -> Char
  end.
-endif.

-ifdef(LIST).
decodeChar(Tail, State) ->
case Tail of
Expand Down Expand Up @@ -1078,6 +1129,11 @@ encode(List) ->
list_to_binary(List).
-endif.

-ifdef(LAT9).
%% Turns List into a binary by handing it straight to list_to_binary/1.
%% NOTE(review): list_to_binary/1 only accepts byte values (0..255), so a code
%% point above 255 (for example 16#20AC, which the Latin-9 decoding in this
%% file can produce) would raise badarg here - confirm callers never pass one.
encode(List) ->
  erlang:list_to_binary(List).
-endif.

-ifdef(LIST).
encode(List) ->
List.
Expand Down Expand Up @@ -1319,6 +1375,11 @@ decode(Bin) ->
Value.
-endif.

-ifdef(LAT9).
%% Converts the binary Bin into a list with one element per byte, each byte
%% translated by latin9toUnicode/1.
decode(Bin) ->
  Bytes = binary_to_list(Bin),
  lists:map(fun latin9toUnicode/1, Bytes).
-endif.

-ifdef(U16B).
decode(Bin) ->
{Value, _} = erlsom_ucs:from_utf16be(Bin),
Expand Down
Loading

0 comments on commit 326e53e

Please sign in to comment.