Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 140 lines (134 sloc) 8.752 kb
41ee5fa5 »
2011-06-27 HTML parser
1 -module(jerome_html_scanner).
2
3 -export([scan/1]).
4
%% @doc Scan an HTML string (a list of character codes) into a token list.
%% Scanning starts in the `text' state at row 1, column 1 with an empty
%% token accumulator; the result is whatever scan/4 returns.
scan(Input) ->
    StartPos = {1, 1},
    NoTokens = [],
    scan(Input, NoTokens, StartPos, text).
7
%% @doc Scanner state machine: scan(Input, Scanned, Pos, State).
%%
%%   Input   - remaining characters (a string, i.e. list of char codes).
%%   Scanned - tokens emitted so far, newest first. The payloads of `text'
%%             and `open_url' tokens are accumulated in reverse order and
%%             are only put the right way round when the input is exhausted.
%%   Pos     - `{Row, Column}' of the next character to be consumed.
%%   State   - `text' | `in_url' | `in_tag' | `in_double_quote' |
%%             `in_single_quote' | `{in_hex, Value}' | `{in_decimal, Value}'.
%%
%% Returns `{ok, Tokens}' with the tokens in document order. Tokens are
%% `{text, Pos, String}', `{open_url, Pos, Url}', or `{Kind, Pos}' for the
%% formatting markers (open_bold, close_bold, newline, close_url, ...).
%% Tags that are not recognised are skipped without emitting a token.
%%
%% Clause order is significant: specific tags must be tried before the
%% generic "<" clause, and entities before the generic character clause.

%% End of input: reverse the token list and every reversed payload.
scan([], Scanned, _, _) ->
    {ok, lists:reverse(
        lists:map(fun
                ({text, Pos, Text}) ->
                    {text, Pos, lists:reverse(Text)};
                ({open_url, Pos, Value}) ->
                    {open_url, Pos, lists:reverse(Value)};
                (Token) ->
                    Token
            end, Scanned))};

%% Single-letter formatting tags, matched case-insensitively.
scan([$<, B, $> |T], Scanned, {Row, Column} = Pos, text) when B =:= $B; B =:= $b ->
    scan(T, [{open_bold, Pos}|Scanned], {Row, Column + length("<b>")}, text);
scan([$<, $/, B, $> |T], Scanned, {Row, Column} = Pos, text) when B =:= $B; B =:= $b ->
    scan(T, [{close_bold, Pos}|Scanned], {Row, Column + length("</b>")}, text);
scan([$<, I, $> |T], Scanned, {Row, Column} = Pos, text) when I =:= $I; I =:= $i ->
    scan(T, [{open_italic, Pos}|Scanned], {Row, Column + length("<i>")}, text);
scan([$<, $/, I, $> |T], Scanned, {Row, Column} = Pos, text) when I =:= $I; I =:= $i ->
    scan(T, [{close_italic, Pos}|Scanned], {Row, Column + length("</i>")}, text);
scan([$<, U, $> |T], Scanned, {Row, Column} = Pos, text) when U =:= $U; U =:= $u ->
    scan(T, [{open_underline, Pos}|Scanned], {Row, Column + length("<u>")}, text);
scan([$<, $/, U, $> |T], Scanned, {Row, Column} = Pos, text) when U =:= $U; U =:= $u ->
    scan(T, [{close_underline, Pos}|Scanned], {Row, Column + length("</u>")}, text);

%% Multi-letter formatting tags: all-lowercase and all-uppercase spellings only.
scan("<strong>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_bold, Pos}|Scanned], {Row, Column + length("<strong>")}, text);
scan("<STRONG>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_bold, Pos}|Scanned], {Row, Column + length("<strong>")}, text);
scan("</strong>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{close_bold, Pos}|Scanned], {Row, Column + length("</strong>")}, text);
scan("</STRONG>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{close_bold, Pos}|Scanned], {Row, Column + length("</strong>")}, text);
scan("<em>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_italic, Pos}|Scanned], {Row, Column + length("<em>")}, text);
scan("<EM>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_italic, Pos}|Scanned], {Row, Column + length("<em>")}, text);
scan("</em>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{close_italic, Pos}|Scanned], {Row, Column + length("</em>")}, text);
scan("</EM>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{close_italic, Pos}|Scanned], {Row, Column + length("</em>")}, text);
scan("<sup>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_superscript, Pos}|Scanned], {Row, Column + length("<sup>")}, text);
scan("<SUP>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_superscript, Pos}|Scanned], {Row, Column + length("<sup>")}, text);
scan("</sup>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{close_superscript, Pos}|Scanned], {Row, Column + length("</sup>")}, text);
scan("</SUP>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{close_superscript, Pos}|Scanned], {Row, Column + length("</sup>")}, text);
scan("<sub>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_subscript, Pos}|Scanned], {Row, Column + length("<sub>")}, text);
scan("<SUB>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_subscript, Pos}|Scanned], {Row, Column + length("<sub>")}, text);
scan("</sub>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{close_subscript, Pos}|Scanned], {Row, Column + length("</sub>")}, text);
scan("</SUB>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{close_subscript, Pos}|Scanned], {Row, Column + length("</sub>")}, text);

%% Tags that become a `newline' token.
scan("<br>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{newline, Pos}|Scanned], {Row, Column + length("<br>")}, text);
scan("<BR>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{newline, Pos}|Scanned], {Row, Column + length("<br>")}, text);
scan("<div>"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{newline, Pos}|Scanned], {Row, Column + length("<div>")}, text);

%% Links: `<a href="' opens an `open_url' token whose value is collected
%% (in reverse) in the `in_url' state until the closing `">' is seen.
scan("<a href=\""++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_url, Pos, ""}|Scanned], {Row, Column + length("<a href=\"")}, in_url);
scan("<A HREF=\""++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, [{open_url, Pos, ""}|Scanned], {Row, Column + length("<a href=\"")}, in_url);
scan("\">"++T, Scanned, {Row, Column}, in_url) ->
    scan(T, Scanned, {Row, Column + length("\">")}, text);
scan("&amp;"++T, [{open_url, Pos, Value}|Scanned], {Row, Column}, in_url) ->
    scan(T, [{open_url, Pos, [$&|Value]}|Scanned], {Row, Column + length("&amp;")}, in_url);
scan([H|T], [{open_url, Pos, Value}|Scanned], {Row, Column}, in_url) ->
    scan(T, [{open_url, Pos, [H|Value]}|Scanned], {Row, Column + 1}, in_url);
%% The column must move past the consumed "</a>" like every other tag does.
scan([$<, $/, A, $> |T], Scanned, {Row, Column} = Pos, text) when A =:= $A; A =:= $a ->
    scan(T, [{close_url, Pos}|Scanned], {Row, Column + length("</a>")}, text);

%% Line breaks in text collapse to a single space. The text accumulator is
%% reversed, so a leading " " in it means the text already ends in a space;
%% that case has to be tried first or it can never match.
%% NOTE(review): the column restarts at 0 here although scan/1 starts at
%% column 1 - confirm which base the consumers of Pos expect.
scan("\r\n"++T, [{text, _TPos, " "++_}|_] = Scanned, {Row, _Column}, text) ->
    scan(T, Scanned, {Row + 1, 0}, text);
scan("\r\n"++T, [{text, TPos, Text}|Scanned], {Row, _Column}, text) ->
    scan(T, [{text, TPos, [$\s|Text]}|Scanned], {Row + 1, 0}, text);
scan("\r\n"++T, Scanned, {Row, _Column}, text) ->
    scan(T, Scanned, {Row + 1, 0}, text);
scan("\n"++T, [{text, _TPos, " "++_}|_] = Scanned, {Row, _Column}, text) ->
    scan(T, Scanned, {Row + 1, 0}, text);
scan("\n"++T, [{text, TPos, Text}|Scanned], {Row, _Column}, text) ->
    scan(T, [{text, TPos, [$\s|Text]}|Scanned], {Row + 1, 0}, text);
scan("\n"++T, Scanned, {Row, _Column}, text) ->
    scan(T, Scanned, {Row + 1, 0}, text);

%% Any other tag: skip everything up to the closing ">", ignoring ">"
%% characters that appear inside quoted attribute values.
scan("<"++T, Scanned, {Row, Column}, text) ->
    scan(T, Scanned, {Row, Column + 1}, in_tag);
scan("\""++T, Scanned, {Row, Column}, in_tag) ->
    scan(T, Scanned, {Row, Column + 1}, in_double_quote);
scan("\""++T, Scanned, {Row, Column}, in_double_quote) ->
    scan(T, Scanned, {Row, Column + 1}, in_tag);
scan("\'"++T, Scanned, {Row, Column}, in_tag) ->
    scan(T, Scanned, {Row, Column + 1}, in_single_quote);
scan("\'"++T, Scanned, {Row, Column}, in_single_quote) ->
    scan(T, Scanned, {Row, Column + 1}, in_tag);
scan(">"++T, Scanned, {Row, Column}, in_tag) ->
    scan(T, Scanned, {Row, Column + 1}, text);
scan([_H|T], Scanned, {Row, Column}, State) when State =:= in_tag; State =:= in_double_quote; State =:= in_single_quote ->
    scan(T, Scanned, {Row, Column + 1}, State);

%% Named character references.
scan("&amp;"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, append_text(Scanned, Pos, [$&]), {Row, Column + length("&amp;")}, text);
scan("&quot;"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, append_text(Scanned, Pos, [$"]), {Row, Column + length("&quot;")}, text);
scan("&lt;"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, append_text(Scanned, Pos, [$<]), {Row, Column + length("&lt;")}, text);
scan("&gt;"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, append_text(Scanned, Pos, [$>]), {Row, Column + length("&gt;")}, text);
scan("&nbsp;"++T, Scanned, {Row, Column} = Pos, text) ->
    scan(T, append_text(Scanned, Pos, [$\s]), {Row, Column + length("&nbsp;")}, text);

%% Numeric character references: accumulate the code point digit by digit.
scan("&#x"++T, Scanned, {Row, Column}, text) ->
    scan(T, Scanned, {Row, Column + length("&#x")}, {in_hex, 0});
scan("&#"++T, Scanned, {Row, Column}, text) ->
    scan(T, Scanned, {Row, Column + length("&#")}, {in_decimal, 0});
%% Hex letters stand for 10..15, hence the "+ 10".
scan([H|T], Scanned, {Row, Column}, {in_hex, Value}) when H >= $A andalso H =< $F ->
    scan(T, Scanned, {Row, Column + 1}, {in_hex, Value * 16 + (H - $A + 10)});
scan([H|T], Scanned, {Row, Column}, {in_hex, Value}) when H >= $a andalso H =< $f ->
    scan(T, Scanned, {Row, Column + 1}, {in_hex, Value * 16 + (H - $a + 10)});
scan([H|T], Scanned, {Row, Column}, {in_hex, Value}) when H >= $0 andalso H =< $9 ->
    scan(T, Scanned, {Row, Column + 1}, {in_hex, Value * 16 + (H - $0)});
scan([H|T], Scanned, {Row, Column}, {in_decimal, Value}) when H >= $0 andalso H =< $9 ->
    scan(T, Scanned, {Row, Column + 1}, {in_decimal, Value * 10 + (H - $0)});
scan(";"++T, Scanned, {Row, Column} = Pos, {in_hex, Value}) ->
    scan(T, append_text(Scanned, Pos, [Value]), {Row, Column + 1}, text);
scan(";"++T, Scanned, {Row, Column} = Pos, {in_decimal, Value}) ->
    scan(T, append_text(Scanned, Pos, [Value]), {Row, Column + 1}, text);
%% Malformed numeric reference (e.g. "&#T"): the input is untrusted, so
%% rather than crashing with function_clause, drop the partial reference
%% and rescan the offending character as ordinary text.
scan([_|_] = Rest, Scanned, Pos, {in_hex, _Value}) ->
    scan(Rest, Scanned, Pos, text);
scan([_|_] = Rest, Scanned, Pos, {in_decimal, _Value}) ->
    scan(Rest, Scanned, Pos, text);

%% Any other character is plain text.
scan([H|T], Scanned, {Row, Column} = Pos, text) ->
    scan(T, append_text(Scanned, Pos, [H]), {Row, Column + 1}, text).
135
%% @doc Add the characters in Text to the token list Tokens.
%% If the newest token is a `text' token, Text is pushed (reversed) onto its
%% reversed character accumulator and the token keeps its original position.
%% Otherwise a new `text' token positioned at Pos is put in front of Tokens.
append_text(Tokens, Pos, Text) ->
    case Tokens of
        [{text, Start, Acc} | Older] ->
            Extended = lists:reverse(Text, Acc),
            [{text, Start, Extended} | Older];
        _ ->
            Fresh = {text, Pos, lists:reverse(Text, [])},
            [Fresh | Tokens]
    end.
Something went wrong with that request. Please try again.