Browse files

First commit

  • Loading branch information...
0 parents commit 4af4b7b4fd1fa5c9340086b3c415880314d564c8 @evanmiller committed Jun 5, 2011
Showing with 417 additions and 0 deletions.
  1. +12 −0 Makefile
  2. +10 −0 README.md
  3. BIN rebar
  4. +1 −0 rebar.config
  5. +9 −0 src/jerome.app.src
  6. +18 −0 src/jerome.erl
  7. +29 −0 src/jerome_html.erl
  8. +168 −0 src/jerome_rtf.erl
  9. +42 −0 src/jerome_rtf_parser.yrl
  10. +128 −0 src/jerome_rtf_scanner.erl
12 Makefile
@@ -0,0 +1,12 @@
+# Thin wrapper around the bundled rebar script.
+ERL=erl
+REBAR=./rebar
+
+# None of these targets produces a file of the same name; declaring them
+# phony keeps make from skipping a target when a file or directory called
+# "all", "compile" or "clean" happens to exist.
+.PHONY: all compile clean
+
+all: compile
+
+# Build the application with rebar.
+compile:
+	@$(REBAR) compile
+
+# Remove build products and any crash dump left behind by the VM.
+clean:
+	@$(REBAR) clean
+	rm -fv erl_crash.dump
10 README.md
@@ -0,0 +1,10 @@
+Jerome: Erlang rich-text processor
+==================================
+
+Jerome is designed to read and write many rich-text formats. Right now it only reads RTF and emits HTML. Usage:
+
+ % Read
+ RichText = jerome:read("/path/to/file.rtf", rtf)
+
+ % Write
+ jerome:write(RichText, "/path/to/file.html", html)
BIN rebar
Binary file not shown.
1 rebar.config
@@ -0,0 +1 @@
+%% Options rebar hands to the Erlang compiler: build every module with
+%% debug_info.
+{erl_opts, [debug_info]}.
9 src/jerome.app.src
@@ -0,0 +1,9 @@
+%% -*- mode: erlang -*-
+%% Application resource template for the jerome library application.
+%% There is no {mod, ...} entry and nothing is registered, i.e. the
+%% application starts no processes of its own.
+{application, jerome,
+ [{description, "Rich-text processor"},
+ {vsn, "0.1.0"},
+ %% NOTE(review): the module list is left empty here; presumably the build
+ %% tool fills it in when generating the .app file -- confirm.
+ {modules, [
+ ]},
+ {applications, [kernel, stdlib]},
+ {registered, []}
+ ]}.
18 src/jerome.erl
@@ -0,0 +1,18 @@
+-module(jerome).
+
+-compile(export_all).
+
+% Jerome - a rich-text reader/writer
+
+%% Read the file at Path and parse it according to Format.
+%% The file is read first; a read failure fails the {ok, _} match.
+%% `rtf` is the only format handled, any other value raises case_clause.
+%% Returns whatever jerome_rtf:read/1 returns for the file contents.
+read(Path, Format) ->
+    {ok, Contents} = file:read_file(Path),
+    Parse = case Format of
+                rtf -> fun jerome_rtf:read/1
+            end,
+    Parse(Contents).
+
+%% Render Ast in the given Format and write the result to Path.
+%% `html` is the only format handled, any other value raises case_clause
+%% before the file is touched. Returns the result of file:write_file/2.
+write(Ast, Path, Format) ->
+    file:write_file(Path, case Format of
+                              html -> jerome_html:write(Ast)
+                          end).
29 src/jerome_html.erl
@@ -0,0 +1,29 @@
+-module(jerome_html).
+
+-compile(export_all).
+
+
+%% Render Text wrapped in one HTML element per entry of the property list.
+%% The first property becomes the outermost element. Supported properties
+%% are bold, italic, underline and {hyperlink, Destination}; anything else
+%% raises function_clause. Text and Destination come straight from the
+%% source document, so both are HTML-escaped before being emitted.
+%% Returns an iolist.
+write_text(Text, [bold|Rest]) ->
+    ["<strong>", write_text(Text, Rest), "</strong>"];
+write_text(Text, [italic|Rest]) ->
+    ["<em>", write_text(Text, Rest), "</em>"];
+write_text(Text, [underline|Rest]) ->
+    ["<u>", write_text(Text, Rest), "</u>"];
+write_text(Text, [{hyperlink, Destination}|Rest]) ->
+    ["<a href=\"", escape_html(Destination), "\">", write_text(Text, Rest), "</a>"];
+write_text(Text, []) ->
+    escape_html(Text).
+
+%% Replace the characters that are special in HTML text and in
+%% double-quoted attribute values. Accepts any iolist (nested lists, bytes
+%% and binaries); all other bytes, including UTF-8 sequences, pass through
+%% unchanged.
+escape_html(Data) when is_binary(Data) ->
+    escape_html(binary_to_list(Data));
+escape_html([Head|Tail]) ->
+    [escape_html(Head)|escape_html(Tail)];
+escape_html([]) ->
+    [];
+escape_html($&) -> "&amp;";
+escape_html($<) -> "&lt;";
+escape_html($>) -> "&gt;";
+escape_html($") -> "&quot;";
+escape_html(Char) when is_integer(Char) ->
+    Char.
+
+%% Render a document AST as HTML. Returns an iolist.
+write(Ast) ->
+    write(Ast, []).
+
+%% write(Nodes, Acc): Acc holds the output produced so far, newest first,
+%% and is reversed once every node has been rendered. A node of an unknown
+%% shape raises function_clause.
+write([], Acc) ->
+    lists:reverse(Acc);
+write([{text, Text, Properties}|Rest], Acc) ->
+    write(Rest, [write_text(Text, Properties)|Acc]);
+%% lists:reverse/2 pushes the pieces onto Acc so that they come out in
+%% reading order after the final reverse.
+write([{table, Rows}|Rest], Acc) ->
+    write(Rest, lists:reverse(["<table>", write(Rows, []), "</table>"], Acc));
+write([{table_row, Ast}|Rest], Acc) ->
+    write(Rest, lists:reverse(["<tr>", write(Ast, []), "</tr>"], Acc));
+write([{table_cell, Ast}|Rest], Acc) ->
+    write(Rest, lists:reverse(["<td>", write(Ast, []), "</td>"], Acc));
+write([{paragraph, _}|Rest], Acc) ->
+    write(Rest, ["<br>"|Acc]);
+%% The nodes below are produced by jerome_rtf for \line, \~, \- and \_.
+%% They are emitted as bare entities; their text properties are not
+%% applied.
+write([break|Rest], Acc) ->
+    write(Rest, ["<br>"|Acc]);
+write([{nonbreaking_space, _}|Rest], Acc) ->
+    write(Rest, ["&nbsp;"|Acc]);
+write([{optional_hyphen, _}|Rest], Acc) ->
+    write(Rest, ["&shy;"|Acc]);
+write([{nonbreaking_hyphen, _}|Rest], Acc) ->
+    write(Rest, ["&#8209;"|Acc]).
168 src/jerome_rtf.erl
@@ -0,0 +1,168 @@
+-module(jerome_rtf).
+
+-compile(export_all).
+
+%% Formatting state threaded through process_tree/3.
+%%   italic, bold        - switched by \i, \i0, \b, \b0; cleared by \plain
+%%   underline           - cleared by \plain
+%%   paragraph_alignment - left | center | right | justified, set by
+%%                         \ql, \qc, \qr, \qj; reset to left by \pard
+%%   ansi_code_page      - set by \pc (437), \pca (850) or \ansicpgN and
+%%                         passed to to_unicode/2
+%%   hyperlink           - raw HYPERLINK field argument while the field
+%%                         result is being processed, undefined otherwise
+%%   table               - set by \intbl, cleared by \pard
+%%   unicode_size        - parameter of the last \ucN, consulted when a \u
+%%                         control word is handled
+-record(rtf_context, {
+ italic = false,
+ bold = false,
+ underline = false,
+ paragraph_alignment = left,
+ ansi_code_page = undefined,
+ hyperlink = undefined,
+ table = false,
+ unicode_size = 1
+ }).
+
+%% True for the control words that prune/2 keeps when they follow the
+%% \* marker; "fldinst" is the only one.
+recognize_word(Word) ->
+    Word =:= "fldinst".
+
+%% List the text properties that are active in the given context, always in
+%% the order italic, bold, underline, {hyperlink, Target}. Anything that is
+%% not an #rtf_context{} record yields [].
+text_properties(#rtf_context{ italic = Italic, bold = Bold,
+                              underline = Underline, hyperlink = Link }) ->
+    [italic || Italic =:= true] ++
+    [bold || Bold =:= true] ++
+    [underline || Underline =:= true] ++
+    [{hyperlink, hyperlink_target(Link)} || Link =/= undefined];
+text_properties(_) -> [].
+
+%% Extract the link target from the argument of a HYPERLINK field.
+%% A quoted argument yields the text between the first pair of double
+%% quotes; an unquoted one is returned with surrounding blanks removed.
+%% (Blindly dropping the first and last character crashed on arguments
+%% shorter than two characters and mangled unquoted or padded targets.)
+hyperlink_target(Link) ->
+    IsBlank = fun(C) -> C =:= $\s orelse C =:= $\t end,
+    case lists:dropwhile(IsBlank, Link) of
+        [$" | Quoted] ->
+            lists:takewhile(fun(C) -> C =/= $" end, Quoted);
+        Bare ->
+            lists:reverse(lists:dropwhile(IsBlank, lists:reverse(Bare)))
+    end.
+
+%% Parse an RTF document held in a binary and return its AST.
+%% The bytes are scanned into tokens, parsed into a tree, pruned and
+%% finally converted by process_tree/1. A scanner or parser result other
+%% than {ok, _} fails the corresponding match.
+read(Binary) when is_binary(Binary) ->
+    Chars = binary_to_list(Binary),
+    {ok, Tokens} = jerome_rtf_scanner:scan(Chars),
+    {ok, ParseTree} = jerome_rtf_parser:parse(Tokens),
+    process_tree(prune(ParseTree)).
+
+%% Convert a pruned parse tree into the document AST, in document order.
+process_tree(PrunedTree) ->
+    {Ast, _Context} = process_tree(PrunedTree, [], #rtf_context{}),
+    lists:reverse(Ast).
+
+%% process_tree(Tree, Acc, Context) -> {Acc1, Context1}
+%% Walks Tree left to right. Acc collects AST nodes newest first (the
+%% arity-1 wrapper reverses the top level); Context carries the formatting
+%% state that control words switch on and off. Clause order matters: the
+%% specific control words and groups must precede the generic fallbacks.
+process_tree([], Acc, Context) ->
+    {Acc, Context};
+%% Character set declarations.
+process_tree([{control_word, _, "pc"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ ansi_code_page = 437 });
+process_tree([{control_word, _, "pca"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ ansi_code_page = 850 });
+process_tree([{control_word, _, "ansicpg", CodePage}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ ansi_code_page = CodePage });
+%% Character formatting.
+process_tree([{control_word, _, "i"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ italic = true });
+process_tree([{control_word, _, "i", 0}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ italic = false });
+process_tree([{control_word, _, "b"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ bold = true });
+process_tree([{control_word, _, "b", 0}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ bold = false });
+process_tree([{control_word, _, "plain"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ italic = false, bold = false, underline = false });
+%% Unicode escapes. \ucN announces how many fallback characters follow
+%% each \u; the escape itself is always turned into text (previously it
+%% was honoured only after \uc0, so the default of 1 dropped every
+%% escaped character). Only \'hh fallbacks are skipped here: the scanner
+%% already swallows a plain fallback character that directly follows the
+%% parameter.
+process_tree([{control_word, _, "uc", ByteSize}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ unicode_size = ByteSize });
+process_tree([{control_word, _, "u", UnicodePoint}|Rest], Acc, Context) ->
+    Rest1 = skip_unicode_fallback(Rest, Context#rtf_context.unicode_size),
+    process_tree(Rest1, [{text, unicode_text(UnicodePoint), text_properties(Context)}|Acc], Context);
+%% Paragraph formatting.
+process_tree([{control_word, _, "ql"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ paragraph_alignment = left });
+process_tree([{control_word, _, "qc"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ paragraph_alignment = center });
+process_tree([{control_word, _, "qr"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ paragraph_alignment = right });
+process_tree([{control_word, _, "qj"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ paragraph_alignment = justified });
+process_tree([{control_word, _, "pard"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ paragraph_alignment = left, table = false });
+process_tree([{control_word, _, "intbl"}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context#rtf_context{ table = true });
+%% Tables. A row that directly follows a table node is appended to that
+%% table, otherwise it opens a new one. The nested results come back
+%% newest first and are reversed so that cells, and the content of each
+%% cell, are stored in document order.
+process_tree([{table_row, Tree}|Rest], [{table, Rows}|Acc], Context) ->
+    {Cells, Context1} = process_tree(Tree, [], Context),
+    process_tree(Rest, [{table, Rows ++ [{table_row, lists:reverse(Cells)}]}|Acc], Context1);
+process_tree([{table_row, Tree}|Rest], Acc, Context) ->
+    {Cells, Context1} = process_tree(Tree, [], Context),
+    process_tree(Rest, [{table, [{table_row, lists:reverse(Cells)}]}|Acc], Context1);
+process_tree([{table_cell, Tree}|Rest], Acc, Context) ->
+    {Content, Context1} = process_tree(Tree, [], Context),
+    process_tree(Rest, [{table_cell, lists:reverse(Content)}|Acc], Context1);
+%% Groups that carry no document text are skipped entirely.
+process_tree([{group, [{control_word, _, "info"}|_]}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context);
+process_tree([{group, [{control_word, _, "fonttbl"}|_]}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context);
+process_tree([{group, [{control_word, _, "colortbl"}|_]}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context);
+%% A HYPERLINK field: the field result is processed with the link recorded
+%% in the context so that its text nodes carry a hyperlink property.
+process_tree([{group, [{control_word, _, "field"},
+                       {group, [{control_word, _, "fldinst"},
+                                {group, [{text, _, "HYPERLINK "++Hyperlink}]}]},
+                       {group, [{control_word, _, "fldrslt"}|Text]}]}|Rest], Acc, Context) ->
+    {Ast, _} = process_tree(Text, [], Context#rtf_context{ hyperlink = Hyperlink }),
+    process_tree(Rest, Ast ++ Acc, Context);
+%% Any other group: formatting changes made inside it are discarded when
+%% the group ends. Ast is newest first, like Acc, so it is simply prepended.
+process_tree([{group, Tree}|Rest], Acc, Context) ->
+    {Ast, _} = process_tree(Tree, [], Context),
+    process_tree(Rest, Ast ++ Acc, Context);
+process_tree([{text, _, Text}|Rest], Acc, Context) ->
+    process_tree(Rest, [{text, Text, text_properties(Context)}|Acc], Context);
+process_tree([{control_word, _, "par"}|Rest], Acc, Context) ->
+    process_tree(Rest, [{paragraph, Context#rtf_context.paragraph_alignment}|Acc], Context);
+process_tree([{control_word, _, "line"}|Rest], Acc, Context) ->
+    process_tree(Rest, [break|Acc], Context);
+process_tree([{control_char, _, '~'}|Rest], Acc, Context) ->
+    process_tree(Rest, [{nonbreaking_space, text_properties(Context)}|Acc], Context);
+process_tree([{control_char, _, '-'}|Rest], Acc, Context) ->
+    process_tree(Rest, [{optional_hyphen, text_properties(Context)}|Acc], Context);
+process_tree([{control_char, _, '_'}|Rest], Acc, Context) ->
+    process_tree(Rest, [{nonbreaking_hyphen, text_properties(Context)}|Acc], Context);
+process_tree([{control_hex, _, Char}|Rest], Acc, Context) ->
+    process_tree(Rest, [{text, [to_unicode(Char, Context#rtf_context.ansi_code_page)], text_properties(Context)}|Acc], Context);
+%% Remaining parameterless control words: the named typographic characters
+%% become UTF-8 text (they used to produce empty strings), everything else
+%% is ignored.
+process_tree([{control_word, _, Word}|Rest], Acc, Context) ->
+    case special_character(Word) of
+        undefined ->
+            process_tree(Rest, Acc, Context);
+        CodePoint ->
+            Text = unicode:characters_to_binary([CodePoint], unicode, utf8),
+            process_tree(Rest, [{text, Text, text_properties(Context)}|Acc], Context)
+    end;
+process_tree([{control_word, _, _, _}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context);
+%% Tokens the scanner can produce but that carry nothing for the AST
+%% (\: \| a stray \* and \bin data) are skipped instead of crashing with
+%% function_clause.
+process_tree([{control_char, _, _}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context);
+process_tree([{control_bin, _, _}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context);
+process_tree([{bin, _, _}|Rest], Acc, Context) ->
+    process_tree(Rest, Acc, Context).
+
+%% Unicode code point of a named special-character control word, or
+%% undefined when the word is not one of them.
+special_character("bullet") -> 16#2022;
+special_character("lquote") -> 16#2018;
+special_character("rquote") -> 16#2019;
+special_character("ldblquote") -> 16#201C;
+special_character("rdblquote") -> 16#201D;
+special_character("emdash") -> 16#2014;
+special_character("endash") -> 16#2013;
+special_character("emspace") -> 16#2003;
+special_character("enspace") -> 16#2002;
+special_character("qmspace") -> 16#2005;
+special_character(_) -> undefined.
+
+%% UTF-8 text for the parameter of a \u control word. RTF writes values
+%% above 32767 as negative 16-bit numbers, so negative parameters are
+%% wrapped around first. A value that is not a valid code point yields
+%% empty text.
+unicode_text(CodePoint) when CodePoint < 0 ->
+    unicode_text(CodePoint + 65536);
+unicode_text(CodePoint) ->
+    case unicode:characters_to_binary([CodePoint], unicode, utf8) of
+        Utf8 when is_binary(Utf8) -> Utf8;
+        _Invalid -> <<>>
+    end.
+
+%% Drop up to Count \'hh tokens from the front of Tree: they are the
+%% fallback representation of the preceding \u escape.
+skip_unicode_fallback([{control_hex, _, _}|Rest], Count) when is_integer(Count), Count > 0 ->
+    skip_unicode_fallback(Rest, Count - 1);
+skip_unicode_fallback(Tree, _Count) ->
+    Tree.
+
+
+%% Translate a byte written as \'hh into a character, given the document's
+%% code page. Bytes below 128 are returned unchanged: the code pages this
+%% module sets by name (437, 850) are ASCII in that range, and
+%% NOTE(review) the same is assumed for any \ansicpg value.
+%% TODO: bytes from 128 up still need a real code-page table; until then
+%% they are replaced by a full stop.
+to_unicode(Char, _CodePage) when is_integer(Char), Char >= 0, Char < 128 ->
+    Char;
+to_unicode(_, _) ->
+    $..
+
+%% Remove ignorable destinations and empty groups from a parse tree.
+prune(ParseTree) ->
+    prune(ParseTree, []).
+
+%% prune(Tree, Kept): Kept holds the surviving tokens, newest first.
+prune([], Kept) ->
+    lists:reverse(Kept);
+%% A group is pruned recursively and dropped if nothing is left of it.
+prune([{group, Children}|Rest], Kept) ->
+    case prune(Children, []) of
+        [] -> prune(Rest, Kept);
+        Pruned -> prune(Rest, [{group, Pruned}|Kept])
+    end;
+%% \* followed by a control word, with or without a parameter.
+prune([{control_char, _, '*'}, {control_word, _, Word} = Token|Rest], Kept) ->
+    prune_destination(Word, Token, Rest, Kept);
+prune([{control_char, _, '*'}, {control_word, _, Word, _} = Token|Rest], Kept) ->
+    prune_destination(Word, Token, Rest, Kept);
+prune([Token|Rest], Kept) ->
+    prune(Rest, [Token|Kept]).
+
+%% A recognized word is kept (without its \* marker) and pruning goes on.
+%% An unrecognized one empties the whole list being pruned, including what
+%% was kept so far, so that the enclosing group disappears.
+prune_destination(Word, Token, Rest, Kept) ->
+    case recognize_word(Word) of
+        true -> prune(Rest, [Token|Kept]);
+        false -> []
+    end.
42 src/jerome_rtf_parser.yrl
@@ -0,0 +1,42 @@
+%% RTF parser. Doesn't do much besides putting control codes into groups.
+%% Input is the token list produced by jerome_rtf_scanner:scan/1; the
+%% result is a list of elements in which braces have become {group, ...}
+%% nodes and row/cell markers have become {table_row, ...} and
+%% {table_cell, ...} nodes.
+
+Nonterminals
+ Elements
+ Group
+ TableRow
+ Cells
+ Cell.
+
+%% begin_row, end_row, last_row and end_cell are what the scanner makes of
+%% the control words trowd, row, lastrow and cell.
+Terminals
+ text
+ bin
+ control_word
+ control_char
+ control_bin
+ control_hex
+ open_brace
+ close_brace
+ begin_row
+ end_row
+ last_row
+ end_cell.
+
+Rootsymbol
+ Elements.
+
+%% A possibly empty sequence of elements, kept in source order. Terminal
+%% tokens are passed through unchanged.
+Elements -> '$empty' : [].
+Elements -> Elements Group : '$1' ++ ['$2'].
+Elements -> Elements TableRow : '$1' ++ ['$2'].
+Elements -> Elements text : '$1' ++ ['$2'].
+Elements -> Elements bin : '$1' ++ ['$2'].
+Elements -> Elements control_word : '$1' ++ ['$2'].
+Elements -> Elements control_bin : '$1' ++ ['$2'].
+Elements -> Elements control_char : '$1' ++ ['$2'].
+Elements -> Elements control_hex : '$1' ++ ['$2'].
+
+%% The brace and row marker tokens themselves are dropped; only the
+%% enclosed content is kept.
+Group -> open_brace Elements close_brace : {group, '$2'}.
+TableRow -> begin_row Cells end_row : {table_row, '$2'}.
+TableRow -> begin_row Cells last_row end_row : {table_row, '$2'}.
+Cells -> '$empty' : [].
+Cells -> Cells Cell : '$1' ++ ['$2'].
+%% A cell is everything up to the next end_cell token.
+Cell -> Elements end_cell : {table_cell, '$1'}.
128 src/jerome_rtf_scanner.erl
@@ -0,0 +1,128 @@
+-module(jerome_rtf_scanner).
+
+-compile(export_all).
+
+
+%% Tokenize an RTF document given as a list of bytes (a string).
+%% Returns {ok, Tokens}; the second element of every token is the
+%% {Row, Column} position (both starting at 1) where the token began.
+scan(RTF) ->
+    scan(RTF, [], {1, 1}, in_text).
+
+%% scan(Input, Scanned, Position, State)
+%% Scanned holds the tokens seen so far, newest first. The payload of the
+%% token under construction (text, binary data, control word name) is
+%% accumulated in reverse and straightened out by finish_token/1.
+%% State is one of
+%%   in_text             - plain text
+%%   in_word             - reading the name of a control word
+%%   in_word_param       - reading the numeric parameter of a control word
+%%   in_bin_word         - reading the byte count of a \bin word
+%%   {in_bin, BytesLeft} - copying the raw data that follows \binN
+%% Clause order matters: earlier clauses take priority.
+
+%% End of input. Accepted in every state, so a document that ends right
+%% after a control word still scans (only in_text used to be accepted).
+scan([], Scanned, _, _State) ->
+    {ok, lists:reverse(lists:map(fun finish_token/1, Scanned))};
+
+%% \binN: N is read digit by digit, a space ends it, then exactly N raw
+%% bytes are copied into a bin token.
+scan([$\\, $b, $i, $n, D | T], Scanned, {Row, Column} = Pos, in_text) when D>=$0, D=<$9 ->
+    scan(T, [{control_bin, Pos, D-$0}|Scanned], {Row, Column + length("\\binX")}, in_bin_word);
+
+scan([H|T], [{control_bin, CPos, Len}|Scanned], {Row, Column}, in_bin_word) when H>=$0, H=<$9 ->
+    scan(T, [{control_bin, CPos, Len * 10 + (H-$0)}|Scanned], {Row, Column + 1}, in_bin_word);
+
+scan(" " ++ T, [{control_bin, CPos, Len}|Scanned], {Row, Column} = Pos, in_bin_word) ->
+    scan(T, [{bin, Pos, []}, {control_bin, CPos, Len}|Scanned], {Row, Column + 1}, bin_state(Len));
+
+%% Copy one raw byte; the state returns to in_text after the last one
+%% (previously one byte too many was copied).
+scan([H|T], [{bin, BPos, Binary}|Scanned], {Row, Column}, {in_bin, BytesLeft}) ->
+    scan(T, [{bin, BPos, [H|Binary]}|Scanned], {Row, Column + 1}, bin_state(BytesLeft - 1));
+
+scan("{" ++ T, Scanned, {Row, Column} = Pos, _State) ->
+    scan(T, [{open_brace, Pos, '{'}|Scanned], {Row, Column + 1}, in_text);
+scan("}" ++ T, Scanned, {Row, Column} = Pos, _State) ->
+    scan(T, [{close_brace, Pos, '}'}|Scanned], {Row, Column + 1}, in_text);
+
+%% Control symbols.
+scan("\\*" ++ T, Scanned, {Row, Column} = Pos, _State) ->
+    scan(T, [{control_char, Pos, '*'}|Scanned], {Row, Column + 2}, in_text);
+scan("\\~" ++ T, Scanned, {Row, Column} = Pos, _State) ->
+    scan(T, [{control_char, Pos, '~'}|Scanned], {Row, Column + 2}, in_text);
+scan("\\-" ++ T, Scanned, {Row, Column} = Pos, _State) ->
+    scan(T, [{control_char, Pos, '-'}|Scanned], {Row, Column + 2}, in_text);
+scan("\\_" ++ T, Scanned, {Row, Column} = Pos, _State) ->
+    scan(T, [{control_char, Pos, '_'}|Scanned], {Row, Column + 2}, in_text);
+scan("\\:" ++ T, Scanned, {Row, Column} = Pos, _State) ->
+    scan(T, [{control_char, Pos, ':'}|Scanned], {Row, Column + 2}, in_text);
+scan("\\|" ++ T, Scanned, {Row, Column} = Pos, _State) ->
+    scan(T, [{control_char, Pos, '|'}|Scanned], {Row, Column + 2}, in_text);
+%% \'hh: a byte given as two hexadecimal digits, in either case (only
+%% upper-case letters used to be accepted, and A-F were converted wrongly).
+scan([$\\, $', H1, H2 | T], Scanned, {Row, Column} = Pos, _State)
+        when ((H1>=$0 andalso H1=<$9) orelse
+              (H1>=$a andalso H1=<$f) orelse
+              (H1>=$A andalso H1=<$F)) andalso
+             ((H2>=$0 andalso H2=<$9) orelse
+              (H2>=$a andalso H2=<$f) orelse
+              (H2>=$A andalso H2=<$F)) ->
+    scan(T, [{control_hex, Pos, list_to_integer([H1, H2], 16)}|Scanned], {Row, Column + 4}, in_text);
+
+%% A literal tab counts as \tab and a backslash before a line break as
+%% \par. Word names are stored reversed, like every word under
+%% construction.
+scan([$\t | T], Scanned, {Row, Column}, _State) ->
+    scan(T, [{control_word, {Row, Column}, lists:reverse("tab")}|Scanned], {Row, Column + 1}, in_text);
+scan([$\\, $\r, $\n |T], Scanned, {Row, Column}, _State) ->
+    scan(T, [{control_word, {Row, Column}, lists:reverse("par")}|Scanned], {Row + 1, 1}, in_text);
+scan([$\\, $\r |T], Scanned, {Row, Column}, _State) ->
+    scan(T, [{control_word, {Row, Column}, lists:reverse("par")}|Scanned], {Row + 1, 1}, in_text);
+scan([$\\, $\n |T], Scanned, {Row, Column}, _State) ->
+    scan(T, [{control_word, {Row, Column}, lists:reverse("par")}|Scanned], {Row + 1, 1}, in_text);
+%% Escaped brace or backslash: literal text, appended to the text token in
+%% progress if there is one. The second clause applies in any state, so an
+%% escape that directly follows a control word is not mistaken for the
+%% start of another control word.
+scan([$\\, H|T], [{text, TPos, Text}|Scanned], {Row, Column}, in_text) when H=:=${; H=:=$}; H=:=$\\ ->
+    scan(T, [{text, TPos, [H|Text]}|Scanned], {Row, Column + 2}, in_text);
+scan([$\\, H|T], Scanned, {Row, Column}, _State) when H=:=${; H=:=$}; H=:=$\\ ->
+    scan(T, [{text, {Row, Column}, [H]}|Scanned], {Row, Column + 2}, in_text);
+%% Any other backslash starts a control word.
+scan([$\\|T], Scanned, {Row, Column}, _State) ->
+    scan(T, [{control_word, {Row, Column}, ""}|Scanned], {Row, Column + 1}, in_word);
+
+%% Unescaped line breaks are not part of the text.
+scan([$\r, $\n |T], Scanned, {Row, _Column}, _State) ->
+    scan(T, Scanned, {Row + 1, 1}, in_text);
+scan([$\r |T], Scanned, {Row, _Column}, _State) ->
+    scan(T, Scanned, {Row + 1, 1}, in_text);
+scan([$\n |T], Scanned, {Row, _Column}, _State) ->
+    scan(T, Scanned, {Row + 1, 1}, in_text);
+
+%% First digit of a parameter, optionally negative.
+scan([$-, H|T], [{control_word, Pos, Code}|Scanned], {Row, Column}, in_word) when H>=$0, H=<$9 ->
+    scan(T, [{control_word, Pos, Code, -(H-$0)}|Scanned], {Row, Column + 2}, in_word_param);
+
+scan([H|T], [{control_word, Pos, Code}|Scanned], {Row, Column}, in_word) when H>=$0, H=<$9 ->
+    scan(T, [{control_word, Pos, Code, H-$0}|Scanned], {Row, Column + 1}, in_word_param);
+
+scan([H|T], [{control_word, Pos, Code}|Scanned], {Row, Column}, in_word) when H>=$a, H=<$z; H>=$A, H=<$Z ->
+    scan(T, [{control_word, Pos, [H|Code]}|Scanned], {Row, Column + 1}, in_word);
+
+%% Further parameter digits. A parameter that is 0 so far counts as
+%% non-negative, so digits after a leading zero are no longer dropped.
+scan([H|T], [{control_word, Pos, Code, Param}|Scanned], {Row, Column}, in_word_param) when H>=$0, H=<$9, Param >= 0 ->
+    scan(T, [{control_word, Pos, Code, Param * 10 + (H-$0)}|Scanned], {Row, Column + 1}, in_word_param);
+scan([H|T], [{control_word, Pos, Code, Param}|Scanned], {Row, Column}, in_word_param) when H>=$0, H=<$9, Param < 0 ->
+    scan(T, [{control_word, Pos, Code, Param * 10 - (H-$0)}|Scanned], {Row, Column + 1}, in_word_param);
+
+%% A space ends a control word and is not part of the text.
+scan([$\s |T], Scanned, {Row, Column}, State) when State =:= in_word; State =:= in_word_param ->
+    scan(T, Scanned, {Row, Column + 1}, in_text);
+
+%% NOTE(review): any other character that ends a control word is swallowed
+%% as well, e.g. the full stop in "\i0.". Left as is, because the fallback
+%% character that directly follows a \uN escape disappears the same way.
+scan([_H|T], Scanned, {Row, Column}, State) when State =:= in_word; State =:= in_word_param ->
+    scan(T, Scanned, {Row, Column + 1}, in_text);
+
+%% Plain text: extend the text token in progress or start a new one.
+scan([H|T], [{text, TPos, Text}|Scanned], {Row, Column}, in_text) ->
+    scan(T, [{text, TPos, [H|Text]}|Scanned], {Row, Column + 1}, in_text);
+
+scan([H|T], Scanned, {Row, Column} = Pos, in_text) ->
+    scan(T, [{text, Pos, [H]}|Scanned], {Row, Column+1}, in_text).
+
+%% Put a token into its final form: un-reverse accumulated payloads and
+%% turn the table control words into dedicated tokens for the parser.
+finish_token({bin, Pos, Binary}) ->
+    {bin, Pos, lists:reverse(Binary)};
+finish_token({text, Pos, Text}) ->
+    {text, Pos, lists:reverse(Text)};
+finish_token({control_word, Pos, RevCode}) ->
+    case lists:reverse(RevCode) of
+        "trowd" -> {begin_row, Pos};
+        "row" -> {end_row, Pos};
+        "lastrow" -> {last_row, Pos};
+        "cell" -> {end_cell, Pos};
+        Code -> {control_word, Pos, Code}
+    end;
+finish_token({control_word, Pos, Code, Param}) ->
+    {control_word, Pos, lists:reverse(Code), Param};
+finish_token(Token) ->
+    Token.
+
+%% Scanner state while BytesLeft bytes of \bin data remain to be copied.
+bin_state(0) -> in_text;
+bin_state(BytesLeft) -> {in_bin, BytesLeft}.
+
+
+
+%% Value (0..15) of a single hexadecimal digit, in either case.
+%% A-F previously came out as 0..5 because the offset of 10 was missing.
+%% Characters that are not hexadecimal digits raise function_clause.
+hexchar_to_int(C) when C>=$0, C=<$9 ->
+    C-$0;
+hexchar_to_int(C) when C>=$A, C=<$F ->
+    C-$A+10;
+hexchar_to_int(C) when C>=$a, C=<$f ->
+    C-$a+10.

0 comments on commit 4af4b7b

Please sign in to comment.