Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 204 lines (189 sloc) 10.266 kB
feb181a @evanmiller First cut: R/W Textile and BBCode
authored
1 -module(jerome_rtf_consumer).
2
3 -include("jerome.hrl").
4
6d3a263 @evanmiller Support for in-line and external images.
authored
5 -export([consume/2]).
feb181a @evanmiller First cut: R/W Textile and BBCode
authored
6
%% @doc Tell whether Word names an ignorable (`\*') destination that this
%% consumer nevertheless understands and wants prune/2 to keep.
%% Returns `true' for "fldinst" and "shppict", `false' for anything else.
recognize_word(Word) ->
    lists:member(Word, ["fldinst", "shppict"]).
10
6d3a263 @evanmiller Support for in-line and external images.
authored
%% @doc Turn an RTF document held in a binary into a jerome AST.
%%
%% The binary is converted to a character list, scanned into tokens, parsed
%% into a tree, pruned of unrecognized ignorable destinations and finally
%% walked by process_tree/2. ImageFun is handed on to process_tree/2
%% unchanged. A scanner or parser result other than `{ok, _}' crashes with
%% `badmatch'.
consume(Binary, ImageFun) when is_binary(Binary) ->
    Chars = binary_to_list(Binary),
    {ok, Tokens} = jerome_rtf_scanner:scan(Chars),
    {ok, ParseTree} = jerome_rtf_parser:parse(Tokens),
    process_tree(prune(ParseTree), ImageFun).
feb181a @evanmiller First cut: R/W Textile and BBCode
authored
16
6d3a263 @evanmiller Support for in-line and external images.
authored
%% @doc Convert an already pruned parse tree into an AST.
%%
%% Starts process_tree/3 with an empty accumulator and a default context
%% whose `image_fun' field is ImageFun; the context returned at the end of
%% the walk is discarded and only the AST is returned.
process_tree(PrunedTree, ImageFun) ->
    InitialContext = #jerome_ctx{ image_fun = ImageFun },
    {Ast, _FinalContext} = process_tree(PrunedTree, [], InitialContext),
    Ast.
20
%% @doc Walk a (pruned) RTF parse tree and build the AST.
%%
%% Arguments:
%%   Tree    - list of parse-tree nodes still to be handled.
%%   Acc     - AST nodes produced so far, most recent first.
%%   Context - #jerome_ctx{} record holding the current formatting state.
%%
%% Returns `{Ast, Context1}', where Ast is the accumulated nodes put back in
%% document order and passed through jerome:consolidate/1, and Context1 is
%% the context as it stands at the end of the list.
%%
%% A node shape not covered by any clause raises `function_clause'.

%% End of input: restore document order and consolidate.
process_tree([], Acc, Context) ->
    {jerome:consolidate(lists:reverse(Acc)), Context};

%% --- Character set ---------------------------------------------------------
process_tree([{control_word, _, "pc"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ ansi_code_page = 437 });
process_tree([{control_word, _, "pca"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ ansi_code_page = 850 });
process_tree([{control_word, _, "ansicpg", CodePage}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ ansi_code_page = CodePage });

%% --- Character formatting --------------------------------------------------
%% Only the bare form (on) and the explicit 0 parameter (off) are handled;
%% any other parameter falls through to the ignore-all clause at the bottom.
process_tree([{control_word, _, "i"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ italic = true });
process_tree([{control_word, _, "i", 0}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ italic = false });
process_tree([{control_word, _, "b"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ bold = true });
process_tree([{control_word, _, "b", 0}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ bold = false });
process_tree([{control_word, _, "plain"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ italic = false, bold = false, underline = false });
process_tree([{control_word, _, "sub"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ subscript = true, superscript = false });
process_tree([{control_word, _, "super"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ subscript = false, superscript = true });
process_tree([{control_word, _, "nosupersub"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ subscript = false, superscript = false });

%% --- Unicode escapes -------------------------------------------------------
process_tree([{control_word, _, "uc", ByteSize}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ unicode_size = ByteSize });
%% RTF writes \uN as a signed 16-bit number, so values above 32767 arrive
%% negative; bring them back into the 0..65535 range before emitting them.
process_tree([{control_word, Pos, "u", UnicodePoint}|Rest], Acc,
             #jerome_ctx{ unicode_size = 0 } = Context)
  when is_integer(UnicodePoint), UnicodePoint < 0 ->
    process_tree([{control_word, Pos, "u", UnicodePoint + 16#10000}|Rest], Acc, Context);
%% \uN is only emitted when no fallback characters follow it (\uc0). With any
%% other unicode_size the control word reaches the ignore-all clause below and
%% whatever follows it in the tree is processed as ordinary content.
process_tree([{control_word, _, "u", UnicodePoint}|Rest], Acc,
             #jerome_ctx{ unicode_size = 0 } = Context) ->
    process_tree(Rest, [{text, [UnicodePoint], jerome:text_properties(Context)}|Acc], Context);

%% --- Paragraph formatting --------------------------------------------------
process_tree([{control_word, _, "ql"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ paragraph_alignment = left });
process_tree([{control_word, _, "qc"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ paragraph_alignment = center });
process_tree([{control_word, _, "qr"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ paragraph_alignment = right });
process_tree([{control_word, _, "qj"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ paragraph_alignment = justified });
process_tree([{control_word, _, "pard"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ paragraph_alignment = left, table = false });
process_tree([{control_word, _, "intbl"}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context#jerome_ctx{ table = true });

%% --- External image reference ----------------------------------------------
%% The space-stripped text after \NeXTGraphic is handed to the caller-supplied
%% image_fun, which must answer {ok, Image}; anything else is a badmatch.
process_tree([{group, [{control_word, _, "NeXTGraphic"},
                       {text, _, Graphic}|_]}|Rest], Acc, Context) ->
    {ok, Image} = (Context#jerome_ctx.image_fun)(string:strip(Graphic)),
    process_tree(Rest, [{image, Image}|Acc], Context);

%% --- Tables ----------------------------------------------------------------
%% A row directly following a table node is appended to that table (rows are
%% kept in document order); otherwise it opens a new table.
process_tree([{table_row, Tree}|Rest], [{table, Rows}|Acc], Context) ->
    {Ast, Context1} = process_tree(Tree, [], Context),
    process_tree(Rest, [{table, Rows ++ [{table_row, Ast}]}|Acc], Context1);
process_tree([{table_row, Tree}|Rest], Acc, Context) ->
    {Ast, Context1} = process_tree(Tree, [], Context),
    process_tree(Rest, [{table, [{table_row, Ast}]}|Acc], Context1);
process_tree([{table_cell, Tree}|Rest], Acc, Context) ->
    {Ast, Context1} = process_tree(Tree, [], Context),
    process_tree(Rest, [{table_cell, Ast}|Acc], Context1);

%% --- Header groups that contribute nothing to the AST ----------------------
process_tree([{group, [{control_word, _, "info"}|_]}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context);
process_tree([{group, [{control_word, _, "fonttbl"}|_]}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context);
process_tree([{group, [{control_word, _, "colortbl"}|_]}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context);

%% --- Hyperlink field -------------------------------------------------------
%% The field result is processed with `hyperlink' set in the context. The
%% first and last characters of the target are dropped (presumably the
%% surrounding quotes - confirm against the scanner output). The outer
%% context is left untouched.
process_tree([{group, [{control_word, _, "field"},
                       {group, [{control_word, _, "fldinst"},
                                {group, [{text, _, "HYPERLINK "++Hyperlink}]}]},
                       {group, [{control_word, _, "fldrslt"}|Text]}]}|Rest], Acc, Context) ->
    Target = lists:sublist(Hyperlink, 2, length(Hyperlink) - 2),
    {Ast, _} = process_tree(Text, [], Context#jerome_ctx{ hyperlink = Target }),
    process_tree(Rest, lists:reverse(Ast, Acc), Context);

%% --- Inline picture --------------------------------------------------------
%% The image data is taken from the last node of the \pict group, which must
%% be a `bin' token.
process_tree([{group, [{control_word, _, "shppict"},
                       {group, [{control_word, _, "pict"}|Pict]}]}|Rest], Acc, Context) ->
    {bin, _, Image} = lists:last(Pict),
    process_tree(Rest, [{image, Image}|Acc], Context);

%% --- Lists -----------------------------------------------------------------
%% An item directly following a list node is pushed onto the front of that
%% list's items; otherwise it opens a new list.
process_tree([{list_item, _Bullet, Contents}|Rest], [{list, ListItems}|Acc], Context) ->
    {Ast, Context1} = process_tree(Contents, [], Context),
    process_tree(Rest, [{list, [{list_item, Ast}|ListItems]}|Acc], Context1);
process_tree([{list_item, _Bullet, Contents}|Rest], Acc, Context) ->
    {Ast, Context1} = process_tree(Contents, [], Context),
    process_tree(Rest, [{list, [{list_item, Ast}]}|Acc], Context1);

%% --- Generic group ---------------------------------------------------------
%% Formatting changes made inside a group do not leak out: the inner context
%% is discarded and processing continues with the outer one.
process_tree([{group, Tree}|Rest], Acc, Context) ->
    {Ast, _} = process_tree(Tree, [], Context),
    process_tree(Rest, lists:reverse(Ast, Acc), Context);

%% --- Text and breaks -------------------------------------------------------
process_tree([{text, _, Text}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, Text, jerome:text_properties(Context)}|Acc], Context);
process_tree([{new_paragraph, _}|Rest], Acc, Context) ->
    process_tree(Rest, [{paragraph, Context#jerome_ctx.paragraph_alignment}|Acc], Context);
process_tree([{control_word, _, "line"}|Rest], Acc, Context) ->
    process_tree(Rest, [break|Acc], Context);

%% --- Named special characters ----------------------------------------------
%% Written as explicit code points so the source file stays pure ASCII and the
%% characters cannot be lost to an encoding mismatch. They are the same code
%% points to_unicode/1 yields for the corresponding \'hh bytes.
process_tree([{control_word, _, "bullet"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#2022], jerome:text_properties(Context)}|Acc], Context); % BULLET
process_tree([{control_word, _, "lquote"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#2018], jerome:text_properties(Context)}|Acc], Context); % LEFT SINGLE QUOTATION MARK
process_tree([{control_word, _, "rquote"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#2019], jerome:text_properties(Context)}|Acc], Context); % RIGHT SINGLE QUOTATION MARK
process_tree([{control_word, _, "ldblquote"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#201C], jerome:text_properties(Context)}|Acc], Context); % LEFT DOUBLE QUOTATION MARK
process_tree([{control_word, _, "rdblquote"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#201D], jerome:text_properties(Context)}|Acc], Context); % RIGHT DOUBLE QUOTATION MARK
process_tree([{control_word, _, "emdash"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#2014], jerome:text_properties(Context)}|Acc], Context); % EM DASH
process_tree([{control_word, _, "endash"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#2013], jerome:text_properties(Context)}|Acc], Context); % EN DASH
process_tree([{control_word, _, "emspace"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#2003], jerome:text_properties(Context)}|Acc], Context); % EM SPACE
process_tree([{control_word, _, "enspace"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#2002], jerome:text_properties(Context)}|Acc], Context); % EN SPACE
process_tree([{control_word, _, "qmspace"}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [16#2005], jerome:text_properties(Context)}|Acc], Context); % FOUR-PER-EM SPACE

%% --- Control symbols -------------------------------------------------------
process_tree([{control_char, _, '~'}|Rest], Acc, Context) ->
    process_tree(Rest, [{nonbreaking_space, jerome:text_properties(Context)}|Acc], Context);
process_tree([{control_char, _, '-'}|Rest], Acc, Context) ->
    process_tree(Rest, [{optional_hyphen, jerome:text_properties(Context)}|Acc], Context);
process_tree([{control_char, _, '_'}|Rest], Acc, Context) ->
    process_tree(Rest, [{nonbreaking_hyphen, jerome:text_properties(Context)}|Acc], Context);
%% \'hh escape: the byte is translated by to_unicode/1.
process_tree([{control_hex, _, Char}|Rest], Acc, Context) ->
    process_tree(Rest, [{text, [to_unicode(Char)], jerome:text_properties(Context)}|Acc], Context);

%% --- Everything else -------------------------------------------------------
%% Control words without a dedicated clause, with or without a parameter, are
%% skipped.
process_tree([{control_word, _, _}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context);
process_tree([{control_word, _, _, _}|Rest], Acc, Context) ->
    process_tree(Rest, Acc, Context).
139
%% @doc Strip ignorable destinations that this consumer does not understand
%% from an RTF parse tree, and drop groups that end up empty.
prune(ParseTree) ->
    prune(ParseTree, []).

%% Kept holds the surviving nodes of the current level, most recent first.
prune([], Kept) ->
    lists:reverse(Kept);
%% Groups are pruned recursively; a group with nothing left is removed.
prune([{group, Children}|Tail], Kept) ->
    case prune(Children, []) of
        [] -> prune(Tail, Kept);
        PrunedChildren -> prune(Tail, [{group, PrunedChildren}|Kept])
    end;
%% A `*' control symbol followed by a control word (with or without a
%% parameter) marks an ignorable destination.
prune([{control_char, _, '*'}, {control_word, _, Word} = Destination|Tail], Kept) ->
    prune_destination(recognize_word(Word), Destination, Tail, Kept);
prune([{control_char, _, '*'}, {control_word, _, Word, _} = Destination|Tail], Kept) ->
    prune_destination(recognize_word(Word), Destination, Tail, Kept);
%% Any other node is kept as is.
prune([Node|Tail], Kept) ->
    prune(Tail, [Node|Kept]).

%% Recognized destination: drop the `*' marker, keep the control word and go
%% on. Unrecognized destination: return [] for the whole level, discarding
%% what was kept so far, so that the enclosing group is removed by its caller.
prune_destination(true, Destination, Tail, Kept) ->
    prune(Tail, [Destination|Kept]);
prune_destination(false, _Destination, _Tail, _Kept) ->
    [].
166
167
%% @doc Translate one byte of a \'hh escape into a Unicode code point,
%% interpreting the byte as Windows-1252.
%%
%% Bytes 0x80-0x9F that Windows-1252 assigns to a character are mapped to
%% that character. Every other value is returned unchanged: this covers
%% 0x00-0x7F and 0xA0-0xFF, where the two character sets coincide, and also
%% the five bytes Windows-1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90,
%% 0x9D). The undefined bytes become the C1 control code points of the same
%% value, so that one stray byte no longer aborts the conversion of an
%% entire document.
%%
%% NOTE(review): the code page recorded in the context (ansi_code_page) is
%% not consulted here; every document is decoded as Windows-1252.
to_unicode(16#80) -> 16#20AC;  % EURO SIGN
to_unicode(16#82) -> 16#201A;  % SINGLE LOW-9 QUOTATION MARK
to_unicode(16#83) -> 16#0192;  % LATIN SMALL LETTER F WITH HOOK
to_unicode(16#84) -> 16#201E;  % DOUBLE LOW-9 QUOTATION MARK
to_unicode(16#85) -> 16#2026;  % HORIZONTAL ELLIPSIS
to_unicode(16#86) -> 16#2020;  % DAGGER
to_unicode(16#87) -> 16#2021;  % DOUBLE DAGGER
to_unicode(16#88) -> 16#02C6;  % MODIFIER LETTER CIRCUMFLEX ACCENT
to_unicode(16#89) -> 16#2030;  % PER MILLE SIGN
to_unicode(16#8A) -> 16#0160;  % LATIN CAPITAL LETTER S WITH CARON
to_unicode(16#8B) -> 16#2039;  % SINGLE LEFT-POINTING ANGLE QUOTATION MARK
to_unicode(16#8C) -> 16#0152;  % LATIN CAPITAL LIGATURE OE
to_unicode(16#8E) -> 16#017D;  % LATIN CAPITAL LETTER Z WITH CARON
to_unicode(16#91) -> 16#2018;  % LEFT SINGLE QUOTATION MARK
to_unicode(16#92) -> 16#2019;  % RIGHT SINGLE QUOTATION MARK
to_unicode(16#93) -> 16#201C;  % LEFT DOUBLE QUOTATION MARK
to_unicode(16#94) -> 16#201D;  % RIGHT DOUBLE QUOTATION MARK
to_unicode(16#95) -> 16#2022;  % BULLET
to_unicode(16#96) -> 16#2013;  % EN DASH
to_unicode(16#97) -> 16#2014;  % EM DASH
to_unicode(16#98) -> 16#02DC;  % SMALL TILDE
to_unicode(16#99) -> 16#2122;  % TRADE MARK SIGN
to_unicode(16#9A) -> 16#0161;  % LATIN SMALL LETTER S WITH CARON
to_unicode(16#9B) -> 16#203A;  % SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
to_unicode(16#9C) -> 16#0153;  % LATIN SMALL LIGATURE OE
to_unicode(16#9E) -> 16#017E;  % LATIN SMALL LETTER Z WITH CARON
to_unicode(16#9F) -> 16#0178;  % LATIN CAPITAL LETTER Y WITH DIAERESIS
%% Identity for everything else, including the undefined 0x81, 0x8D, 0x8F,
%% 0x90 and 0x9D.
to_unicode(C) -> C.
Something went wrong with that request. Please try again.