Skip to content

Commit

Permalink
Fixed the disarming/rearming of HTML tags in the online revision.
Browse files Browse the repository at this point in the history
  • Loading branch information
Luca de Alfaro committed Aug 21, 2008
1 parent 07dcd63 commit cb95b03
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 54 deletions.
44 changes: 22 additions & 22 deletions batch/analysis/chdiff.ml
Original file line number Diff line number Diff line change
Expand Up @@ -856,15 +856,15 @@ if false then begin
Array.make l 0
in

let ta1 = Text.split_into_words (Vec.singleton ts1) in
let ta2 = Text.split_into_words (Vec.singleton ts2) in
let ta3 = Text.split_into_words (Vec.singleton ts3) in
let ta4 = Text.split_into_words (Vec.singleton ts4) in
let ta5 = Text.split_into_words (Vec.singleton ts5) in
let ta6 = Text.split_into_words (Vec.singleton ts6) in
let ta7 = Text.split_into_words (Vec.singleton ts7) in
let ta8 = Text.split_into_words (Vec.singleton ts8) in
let ta9 = Text.split_into_words (Vec.singleton ts9) in
let ta1 = Text.split_into_words false (Vec.singleton ts1) in
let ta2 = Text.split_into_words false (Vec.singleton ts2) in
let ta3 = Text.split_into_words false (Vec.singleton ts3) in
let ta4 = Text.split_into_words false (Vec.singleton ts4) in
let ta5 = Text.split_into_words false (Vec.singleton ts5) in
let ta6 = Text.split_into_words false (Vec.singleton ts6) in
let ta7 = Text.split_into_words false (Vec.singleton ts7) in
let ta8 = Text.split_into_words false (Vec.singleton ts8) in
let ta9 = Text.split_into_words false (Vec.singleton ts9) in

let ia1 = make_init_attr ta1 in

Expand Down Expand Up @@ -977,7 +977,7 @@ if false then begin
let ts10 = "In generale, il bene comune non coincide con quello individuale, dato che le persone non badano a quello comune se non quando gli fa comodo." in

let tsa = [| ts1; ts2; ts3; ts4; ts5; ts6; ts7; ts8; ts9; ts10 |] in
let taa = Array.map (function x -> Text.split_into_words (Vec.singleton x)) tsa in
let taa = Array.map (function x -> Text.split_into_words false (Vec.singleton x)) tsa in
let (c, l) = text_tracking [| taa.(0) |] taa.(1) in
Text.print_words taa.(0);
Text.print_words taa.(1);
Expand Down Expand Up @@ -1013,8 +1013,8 @@ if false then
let text3b = "dopo che mi sono svegliato, a me piace bere il mio caffe'" in

let test_edit_diff t1 t2 =
let w1 = Text.split_into_words (Vec.singleton t1) in
let w2 = Text.split_into_words (Vec.singleton t2) in
let w1 = Text.split_into_words false (Vec.singleton t1) in
let w2 = Text.split_into_words false (Vec.singleton t2) in
let i2 = make_index_diff w2 in
let e = edit_diff w1 w2 i2 in
Text.print_words w1;
Expand All @@ -1040,16 +1040,16 @@ if false then begin
let ts9 = ts7 in
let ts10 = "In generale, il bene comune non coincide con quello individuale. Questo e' causato dal fatto che le persone badano al loro bene privato, piuttosto che al bene comune." in

let ta1 = Text.split_into_words (Vec.singleton ts1) in
let ta2 = Text.split_into_words (Vec.singleton ts2) in
let ta3 = Text.split_into_words (Vec.singleton ts3) in
let ta4 = Text.split_into_words (Vec.singleton ts4) in
let ta5 = Text.split_into_words (Vec.singleton ts5) in
let ta6 = Text.split_into_words (Vec.singleton ts6) in
let ta7 = Text.split_into_words (Vec.singleton ts7) in
let ta8 = Text.split_into_words (Vec.singleton ts8) in
let ta9 = Text.split_into_words (Vec.singleton ts9) in
let ta10 = Text.split_into_words (Vec.singleton ts10) in
let ta1 = Text.split_into_words false (Vec.singleton ts1) in
let ta2 = Text.split_into_words false (Vec.singleton ts2) in
let ta3 = Text.split_into_words false (Vec.singleton ts3) in
let ta4 = Text.split_into_words false (Vec.singleton ts4) in
let ta5 = Text.split_into_words false (Vec.singleton ts5) in
let ta6 = Text.split_into_words false (Vec.singleton ts6) in
let ta7 = Text.split_into_words false (Vec.singleton ts7) in
let ta8 = Text.split_into_words false (Vec.singleton ts8) in
let ta9 = Text.split_into_words false (Vec.singleton ts9) in
let ta10 = Text.split_into_words false (Vec.singleton ts10) in

let t = [|ta1; ta2; ta3; ta4; ta5; ta6; ta7; ta8; ta9; ta10|] in
let len = Array.length (t) in
Expand Down
22 changes: 11 additions & 11 deletions batch/analysis/compute_edlist.ml
Original file line number Diff line number Diff line change
Expand Up @@ -376,17 +376,17 @@ if false then begin
let ts9 = ts7 in
let ts10 = "In generale, il bene comune non coincide con quello individuale. Questo e' causato dal fatto che le persone badano al loro bene privato, piuttosto che al bene comune." in

let ta0 = Text.split_into_words (Vec.singleton ts0) in
let ta1 = Text.split_into_words (Vec.singleton ts1) in
let ta2 = Text.split_into_words (Vec.singleton ts2) in
let ta3 = Text.split_into_words (Vec.singleton ts3) in
let ta4 = Text.split_into_words (Vec.singleton ts4) in
let ta5 = Text.split_into_words (Vec.singleton ts5) in
let ta6 = Text.split_into_words (Vec.singleton ts6) in
let ta7 = Text.split_into_words (Vec.singleton ts7) in
let ta8 = Text.split_into_words (Vec.singleton ts8) in
let ta9 = Text.split_into_words (Vec.singleton ts9) in
let ta10 = Text.split_into_words (Vec.singleton ts10) in
let ta0 = Text.split_into_words false (Vec.singleton ts0) in
let ta1 = Text.split_into_words false (Vec.singleton ts1) in
let ta2 = Text.split_into_words false (Vec.singleton ts2) in
let ta3 = Text.split_into_words false (Vec.singleton ts3) in
let ta4 = Text.split_into_words false (Vec.singleton ts4) in
let ta5 = Text.split_into_words false (Vec.singleton ts5) in
let ta6 = Text.split_into_words false (Vec.singleton ts6) in
let ta7 = Text.split_into_words false (Vec.singleton ts7) in
let ta8 = Text.split_into_words false (Vec.singleton ts8) in
let ta9 = Text.split_into_words false (Vec.singleton ts9) in
let ta10 = Text.split_into_words false (Vec.singleton ts10) in

let w = [|ts0; ts1; ts2; ts3; ts4; ts5; ts6; ts7; ts8; ts9; ts10|] in
let t = [|ta0; ta1; ta2; ta3; ta4; ta5; ta6; ta7; ta8; ta9; ta10|] in
Expand Down
4 changes: 2 additions & 2 deletions batch/analysis/revision.ml
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ class plain_revision
object (self)
inherit revision id page_id timestamp time contributor user_id ip_addr username is_minor comment text_init

val words : word array = Text.split_into_words text_init
val words : word array = Text.split_into_words true text_init

method get_words : word array = words
method get_n_words : int = Array.length words
Expand Down Expand Up @@ -408,7 +408,7 @@ class trust_revision
(comment: string)
(text_init: string Vec.t) (* Text of the revision, still to be split into words *)
=
let (t, _, _, swi, s) = Text.split_into_words_seps_and_info text_init in
let (t, _, _, swi, s) = Text.split_into_words_seps_and_info true text_init in

object (self)
inherit revision id page_id timestamp time contributor user_id ip_addr username is_minor comment text_init
Expand Down
31 changes: 18 additions & 13 deletions batch/analysis/text.ml
Original file line number Diff line number Diff line change
Expand Up @@ -675,11 +675,13 @@ let lt_r = Str.regexp "<"
let gt_r = Str.regexp ">"
(* This function splits the whitespace.  If arm is true, it also takes
care of the substitution of < and > with &lt; and &gt; *)
let separate_whitespace (v: piece_t Vec.t) : piece_t Vec.t =
let separate_whitespace (arm: bool) (v: piece_t Vec.t) : piece_t Vec.t =
(* The function rearm re-arms < and > into &lt; and &gt; when arm is true; otherwise it returns the string unchanged *)
let rearm (s: string) =
let s' = Str.global_replace lt_r "&lt;" s in
Str.global_replace gt_r "&gt;" s'
if arm then begin
let s' = Str.global_replace lt_r "&lt;" s in
Str.global_replace gt_r "&gt;" s'
end else s
in
(* The function f is folded over v *)
let f (d: piece_t) (piece_v: piece_t Vec.t) : piece_t Vec.t =
Expand Down Expand Up @@ -718,10 +720,13 @@ let separate_whitespace (v: piece_t Vec.t) : piece_t Vec.t =
let a_lt_r = Str.regexp "&lt;"
let a_gt_r = Str.regexp "&gt;"
(* This function splits a string respecting the Wiki markup language. *)
let split_string_preserving_markup (text: string) : piece_t Vec.t =
(* First, I replace &lt; and &gt; with < and >, otherwise, it's just too hard *)
let text1 = Str.global_replace a_lt_r "<" text in
let text2 = Str.global_replace a_gt_r ">" text1 in
let split_string_preserving_markup (arm: bool) (text: string) : piece_t Vec.t =
(* First, I replace &lt; and &gt; with < and > if requested *)
let text2 =
if arm
then Str.global_replace a_gt_r ">" (Str.global_replace a_lt_r "<" text)
else text
in
let text3 = remove_html_comments text2 in
(* Makes sure the string begins with \n, to find markup at the beginning of a line *)
if String.length text3 = 0
Expand All @@ -732,7 +737,7 @@ let split_string_preserving_markup (text: string) : piece_t Vec.t =
(* Now does the splitting *)
let p = Vec.singleton (TXT_splittable text') in
let split =
separate_whitespace (
separate_whitespace arm (
separate_table_tags (
separate_line_tags (
separate_titles (
Expand Down Expand Up @@ -784,10 +789,10 @@ let normalize_ws (s: string) : string =
- the array of seps, where words, etc, have their position in the word array
annotated.
*)
let split_into_words_seps_and_info (text_v: string Vec.t)
let split_into_words_seps_and_info (arm: bool) (text_v: string Vec.t)
: (word array) * (float array) * (int array) * (int array) * (sep_t array) =
(* First, uses a visitor to construct a piece_t Vec.t called piece_v *)
let vn l d r = Vec.concat (Vec.concat l (split_string_preserving_markup d)) r in
let vn l d r = Vec.concat (Vec.concat l (split_string_preserving_markup arm d)) r in
let piece_v = Vec.visit_post Vec.empty vn text_v in


Expand Down Expand Up @@ -969,9 +974,9 @@ let split_into_words_seps_and_info (text_v: string Vec.t)
entirely compatible with the one used for trust analysis, and it is
consequently slower. *)

let split_into_words (text_v: string Vec.t) : word array =
let split_into_words (arm: bool) (text_v: string Vec.t) : word array =
(* We run the full analysis, and we keep only the word array *)
let (word_a, _, _, _, _) = split_into_words_seps_and_info text_v in
let (word_a, _, _, _, _) = split_into_words_seps_and_info arm text_v in
word_a;;


Expand Down Expand Up @@ -1006,7 +1011,7 @@ if false then begin
let f x =
Printf.printf "Original:\n%S\n" x;
let x_v = Vec.singleton x in
let (word_v, trust_v, orig_v, _, sep_v) = split_into_words_seps_and_info x_v in
let (word_v, trust_v, orig_v, _, sep_v) = split_into_words_seps_and_info true x_v in
print_string "Words:\n";
let g0 s = Printf.printf "%S " s in
Array.iter g0 word_v;
Expand Down
10 changes: 6 additions & 4 deletions batch/analysis/text.mli
Original file line number Diff line number Diff line change
Expand Up @@ -70,19 +70,21 @@ type sep_t =
| Redirect of string * int
(** redirection tag, along with the position in the word array *)

val split_into_words : string Vec.t -> word array
(** [split_into_words sv] splits a Vec of strings [sv] into an array of words.
val split_into_words : bool -> string Vec.t -> word array
(** [split_into_words arm sv] splits a Vec of strings [sv] into an array of words.
[arm] denotes whether &lt; and &gt; have to be replaced by < and > before splitting, and < and > then rearmed into &lt; and &gt;.
Used for reputation analysis. *)

val split_into_words_seps_and_info :
string Vec.t -> ((word array) * (float array) * (int array) * (int array) * (sep_t array))
(** [split_into_words_and_seps sv] splits a Vec of strings [sv] into:
bool -> string Vec.t -> ((word array) * (float array) * (int array) * (int array) * (sep_t array))
(** [split_into_words_seps_and_info arm sv] splits a Vec of strings [sv] into:
- an array of words (excluding separators, such as white space, etc)
- an array of trust values of words (float)
- an array of origins of words (int)
- an array giving, for each word, its place in the sep array (int)
- the array of seps, where words, etc, have their position in the word array
annotated.
[arm] denotes whether &lt; and &gt; have to be replaced by < and > before splitting, and < and > then rearmed into &lt; and &gt;.
*)

val print_words : word array -> unit
Expand Down
4 changes: 2 additions & 2 deletions online/analysis/online_revision.ml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class revision
method read_text : unit =
try
let text_vec = Vec.singleton (db#read_rev_text text_id) in
let (w, t, o, s_idx, s) = Text.split_into_words_seps_and_info text_vec in
let (w, t, o, s_idx, s) = Text.split_into_words_seps_and_info false text_vec in
words <- w;
seps <- s;
sep_word_idx <- s_idx;
Expand All @@ -127,7 +127,7 @@ class revision
we let the error pop up, so that the caller knows that the revision
needs to be colored. *)
let text_vec = Vec.singleton (db#read_colored_markup rev_id) in
let (w, t, o, s_idx, s) = Text.split_into_words_seps_and_info text_vec in
let (w, t, o, s_idx, s) = Text.split_into_words_seps_and_info false text_vec in
words <- w;
trust <- t;
origin <- o;
Expand Down

0 comments on commit cb95b03

Please sign in to comment.