Skip to content

Commit

Permalink
Support all HTML5 entities
Browse files Browse the repository at this point in the history
  • Loading branch information
etrepum committed Oct 15, 2011
1 parent 57f6d12 commit 38875c5
Show file tree
Hide file tree
Showing 3 changed files with 2,194 additions and 272 deletions.
45 changes: 45 additions & 0 deletions scripts/entities.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env escript
%% -*- mode: erlang -*-
-export([main/1]).

%% @doc Script used to generate mochiweb_charref.erl table.

%% @doc escript entry point. Downloads the W3C HTML5 named character
%% reference page, extracts every entity row from the parsed document and
%% prints the generated `entity/1' clauses, sorted, to standard output.
%%
%% The command line arguments are ignored. The function crashes with a
%% `badmatch' when the request fails or when the server answers with any
%% status other than 200, so that an error page is never silently turned
%% into a table containing only the catch-all clause.
main(_) ->
    application:start(inets),
    %% Put ./ebin first on the code path so mochiweb_html can be loaded
    %% (presumably from a local build -- run the script from the project root).
    code:add_patha("ebin"),
    Url = "http://www.w3.org/TR/html5/named-character-references.html",
    {ok, {{_Version, 200, _Reason}, _Headers, HTML}} = httpc:request(Url),
    print(lists:sort(search(mochiweb_html:parse(HTML)))).

%% @doc Writes the generated lookup function to standard output.
%%
%% Every element of `Entities' is rendered with clause/1 and emitted
%% followed by ";\n"; the catch-all clause returning `undefined' is written
%% last so the output forms one complete `entity/1' definition.
%% Returns `ok'.
print(Entities) ->
    lists:foreach(
      fun(Entity) ->
              io:put_chars([clause(Entity), ";\n"])
      end,
      Entities),
    io:put_chars(["entity(_) -> undefined.\n"]),
    ok.

%% @doc Renders one `entity/1' clause (without its terminating `;') as an
%% iolist.
%%
%% The argument is `{Name, HexCodepoints}'. A single code point is emitted
%% as a bare `16#...' integer literal; two or more are emitted as a list of
%% such literals. An empty code point list matches no clause and raises
%% `function_clause'.
clause({Name, [Hex]}) ->
    ["entity(\"", Name, "\") -> 16#", Hex];
clause({Name, [Hex | More]}) ->
    Tail = lists:map(fun(Next) -> [", 16#", Next] end, More),
    ["entity(\"", Name, "\") -> [16#", Hex, Tail, "]"].


%% @doc Collects every `{Name, HexCodepoints}' entity found in the parsed
%% HTML tree `Tree' by running search/2 with an empty accumulator.
search(Tree) ->
    search(Tree, []).

%% @doc Walks a parsed HTML tree and prepends one `{Name, HexCodepoints}'
%% tuple to `Found' for every entity row encountered.
%%
%% An entity row is a `tr' element whose first attribute is an `id'
%% starting with "entity-". Its first cell must hold a `code' element with
%% the entity name followed by `;', and its second cell the text listing
%% the code points in `U+XXXX' notation. A row that does not have this
%% shape crashes with a `badmatch'.
%%
%% Any other `{Tag, Attrs, Children}' element is traversed child by child,
%% left to right; text nodes (binaries) contribute nothing.
search({<<"tr">>, [{<<"id">>, <<"entity-", _/binary>>} | _], Cells}, Found) ->
    %% HTML5 charrefs can have more than one code point(!)
    [{<<"td">>, _, [{<<"code">>, _, [NameWithSemi]}]},
     {<<"td">>, [], [CodepointText]} | _] = Cells,
    %% Drop the trailing `;' while asserting that it is really there.
    NameLen = byte_size(NameWithSemi) - 1,
    <<Name:NameLen/binary, ";">> = NameWithSemi,
    {match, Groups} = re:run(CodepointText, "(?:\\s*U\\+)([a-fA-F0-9]+)",
                             [global, {capture, all_but_first, binary}]),
    [{Name, [Hex || [Hex] <- Groups]} | Found];
search({_Tag, _Attrs, Children}, Found) ->
    lists:foldl(fun search/2, Found, Children);
search(Text, Found) when is_binary(Text) ->
    Found.
Loading

0 comments on commit 38875c5

Please sign in to comment.