Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

sexplib: added Sexplib.Conv to multiple files

+ plus whitespace fixes
  • Loading branch information...
commit daeb42ce5f60c9754932fa72033ce1f7ba298067 1 parent c3f421b
Bo Adler authored
Showing with 161 additions and 155 deletions.
  1. +27 −26 analysis/author_sig.ml
  2. +71 −68 analysis/editlist.ml
  3. +63 −61 analysis/online_types.ml
View
53 analysis/author_sig.ml
@@ -36,11 +36,12 @@ POSSIBILITY OF SUCH DAMAGE.
TYPE_CONV_PATH "UCSC_WIKI_RESEARCH"
open Eval_defs
+open Sexplib.Conv
(** Type of author signature *)
type packed_author_signature_t = int with sexp
type unpacked_author_signature_t = int * int * int
-type author_signature_t = int
+type author_signature_t = int
let mask = 0o1777
let offset = 10
@@ -54,42 +55,42 @@ let empty_sigs = 0
let sexp_of_sigs = Sexplib.Conv.sexp_of_int
let sigs_of_sexp = Sexplib.Conv.int_of_sexp
*)
-let sexp_of_sigs x =
- let s = Printf.sprintf "%x" x in
+let sexp_of_sigs x =
+ let s = Printf.sprintf "%x" x in
Sexplib.Conv.sexp_of_string s
-let sigs_of_sexp x =
- let s = Sexplib.Conv.string_of_sexp x in
- let get_sig y = y in
- Scanf.sscanf s "%x" get_sig
+let sigs_of_sexp x =
+ let s = Sexplib.Conv.string_of_sexp x in
+ let get_sig y = y in
+ Scanf.sscanf s "%x" get_sig
-let pack (a0: int) (a1: int) (a2: int) : packed_author_signature_t =
+let pack (a0: int) (a1: int) (a2: int) : packed_author_signature_t =
a0 lor ((a1 lor (a2 lsl offset)) lsl offset)
-let unpack (p: packed_author_signature_t) : unpacked_author_signature_t =
+let unpack (p: packed_author_signature_t) : unpacked_author_signature_t =
let a0 = p land mask in
- let b1 = p lsr offset in
- let a1 = b1 land mask in
- let b2 = b1 lsr offset in
- let a2 = b2 land mask in
+ let b1 = p lsr offset in
+ let a1 = b1 land mask in
+ let b2 = b1 lsr offset in
+ let a2 = b2 land mask in
(a0, a1, a2)
-(** [is_author_in_sigs id w sigs] returns [true] if author [id] is in the signatures [sigs] of
+(** [is_author_in_sigs id w sigs] returns [true] if author [id] is in the signatures [sigs] of
word [w], and returns [false] otherwise. *)
-let is_author_in_sigs (id: int) (w: string) (sigs: packed_author_signature_t) : bool =
- if is_anonymous id then true
- else
- let (a0, a1, a2) = unpack sigs in
- let h = hash (id, w) in
+let is_author_in_sigs (id: int) (w: string) (sigs: packed_author_signature_t) : bool =
+ if is_anonymous id then true
+ else
+ let (a0, a1, a2) = unpack sigs in
+ let h = hash (id, w) in
(h = a0 || h = a1 || h = a2)
-(** [add_author id word sigs] adds author id to the signatures [sigs] for word [word],
- and returns the new signature. It assumes that the author was not already in the
+(** [add_author id word sigs] adds author id to the signatures [sigs] for word [word],
+ and returns the new signature. It assumes that the author was not already in the
list. *)
-let add_author (id: int) (w: string) (sigs: packed_author_signature_t) : packed_author_signature_t =
- if is_anonymous id then sigs
- else
- let (a0, a1, a2) = unpack sigs in
- let h = hash (id, w) in
+let add_author (id: int) (w: string) (sigs: packed_author_signature_t) : packed_author_signature_t =
+ if is_anonymous id then sigs
+ else
+ let (a0, a1, a2) = unpack sigs in
+ let h = hash (id, w) in
pack h a0 a1
View
139 analysis/editlist.ml
@@ -33,49 +33,52 @@ POSSIBILITY OF SUCH DAMAGE.
*)
+
TYPE_CONV_PATH "UCSC_WIKI_RESEARCH"
+open Sexplib.Conv
+
(* editlist.ml : this file contains the types related to edit lists *)
-type edit =
+type edit =
Ins of int * int (* Ins (i, l) means add l words at position i *)
| Del of int * int (* Del (i, l) means delete l words from position i *)
(* Mov (i, j, l) means move l words from pos i to pos j *)
- | Mov of int * int * int
-with sexp
+ | Mov of int * int * int
+with sexp
(* same as edit, but for the case when the lhs and rhs are lists of chunks *)
-type medit =
+type medit =
(* Mins (i, l) means insert l words at pos i of chunk 0 *)
- Mins of int * int
+ Mins of int * int
(* Mdel (i, k, l) means del l words at pos i of chunk k *)
- | Mdel of int * int * int
+ | Mdel of int * int * int
(* Mmov (i, k, j, n, l) means mov l words from pos i of chunk k
to pos j of chunk n *)
- | Mmov of int * int * int * int * int
+ | Mmov of int * int * int * int * int
(** Useful for debugging purposes *)
-let rec diff_to_string l : string =
- match l with
+let rec diff_to_string l : string =
+ match l with
d :: l' ->
begin
- let s = match d with
- Ins (i, l) -> Printf.sprintf "Ins(%d, %d) " i l
- | Del (i, l) -> Printf.sprintf "Del(%d, %d) " i l
- | Mov (i, j, l) -> Printf.sprintf "Mov(%d, %d, %d) " i j l
+ let s = match d with
+ Ins (i, l) -> Printf.sprintf "Ins(%d, %d) " i l
+ | Del (i, l) -> Printf.sprintf "Del(%d, %d) " i l
+ | Mov (i, j, l) -> Printf.sprintf "Mov(%d, %d, %d) " i j l
in s ^ diff_to_string l'
end
| [] -> "";;
-let rec mdiff_to_string l : string =
- match l with
+let rec mdiff_to_string l : string =
+ match l with
d :: l' ->
- begin
- let s = match d with
- Mins (i, l) -> Printf.sprintf "Ins(%d, 0) %d " i l
- | Mdel (i, k, l) -> Printf.sprintf "Del(%d, %d) %d " i k l
- | Mmov (i, k, j, n, l) -> Printf.sprintf "Mov(%d, %d) (%d, %d) %d " i k j n l
+ begin
+ let s = match d with
+ Mins (i, l) -> Printf.sprintf "Ins(%d, 0) %d " i l
+ | Mdel (i, k, l) -> Printf.sprintf "Del(%d, %d) %d " i k l
+ | Mmov (i, k, j, n, l) -> Printf.sprintf "Mov(%d, %d) (%d, %d) %d " i k j n l
in s ^ mdiff_to_string l'
end
| [] -> "";;
@@ -103,7 +106,7 @@ let create_index (g: graph) : index =
let h : index = Hashtbl.create 10 in
(* Function f is iterated on the elements of the graph. *)
let f (i: int) = function
- Del (n, k) -> begin
+ Del (n, k) -> begin
Hashtbl.add h (false, n) i;
Hashtbl.add h (false, n + k - 1) i
end
@@ -127,7 +130,7 @@ let compute_edges (g: graph) (h: index) (len_lhs: int) (len_rhs: int) : unit =
let add_edge i j = begin
g_edges.(i) <- j :: g_edges.(i);
g_edges.(j) <- i :: g_edges.(j)
- end in
+ end in
(* The function f is iterated on the elements of the graph,
and connects them to other elements, using the index
to find them. *)
@@ -139,7 +142,7 @@ let compute_edges (g: graph) (h: index) (len_lhs: int) (len_rhs: int) : unit =
the other side. *)
if Hashtbl.mem h (true, 0) then begin
let j = Hashtbl.find h (true, 0) in
- match g_elements.(j) with
+ match g_elements.(j) with
Ins (0, _) -> add_edge i j;
| _ -> ()
end
@@ -149,7 +152,7 @@ let compute_edges (g: graph) (h: index) (len_lhs: int) (len_rhs: int) : unit =
let j = Hashtbl.find h (false, n - 1) in
match g_elements.(j) with
Del _ -> add_edge i j;
- | Mov (n', m', k') ->
+ | Mov (n', m', k') ->
(* Finds what ends at the bottom of the Mov on the right. *)
if m' + k' < len_rhs then begin
if Hashtbl.mem h (true, m' + k') then begin
@@ -192,7 +195,7 @@ let compute_edges (g: graph) (h: index) (len_lhs: int) (len_rhs: int) : unit =
end
end
end
- | Ins (n, _) ->
+ | Ins (n, _) ->
if n > 0 then begin
if Hashtbl.mem h (true, n - 1) then begin
let j = Hashtbl.find h (true, n - 1) in
@@ -269,101 +272,101 @@ let contribution_insdel edits : float =
(* Measures the contribution. *)
measure_insdel g h
-
+
let contribution_mov edits l : float =
let rec filter_mov (el: edit list) : (int * int * int) list =
- match el with
+ match el with
[] -> []
| e :: l -> begin
- match e with
+ match e with
Mov (i, j, m) -> (i, j, m) :: (filter_mov l)
- | Ins (i, len)
+ | Ins (i, len)
| Del (i, len) -> filter_mov l
end
in
- let mov_l = filter_mov edits in
+ let mov_l = filter_mov edits in
(* Computes the contribution of movs *)
(* Makes an array of the moves *)
- let a = Array.of_list mov_l in
+ let a = Array.of_list mov_l in
(* comparison for sorting *)
- let cmp m1 m2 =
- let (i1, j1, l1) = m1 in
- let (i2, j2, l2) = m2 in
- let d = i1 - i2 in
+ let cmp m1 m2 =
+ let (i1, j1, l1) = m1 in
+ let (i2, j2, l2) = m2 in
+ let d = i1 - i2 in
if d > 0 then 1
- else if d < 0 then -1
+ else if d < 0 then -1
else 0
in
(* sorts the array *)
Array.sort cmp a;
- (* now we sort it wrt the move destination,
+ (* now we sort it wrt the move destination,
adding contributions as we go along *)
- let tot_mov = ref 0 in
+ let tot_mov = ref 0 in
(* sorts between lower_b and upper_b *)
- let lower_b = ref 0 in
- let upper_b = ref ((Array.length a) - 1) in
- while !upper_b > !lower_b do
- begin
+ let lower_b = ref 0 in
+ let upper_b = ref ((Array.length a) - 1) in
+ while !upper_b > !lower_b do
+ begin
(* first, we go up *)
- let change = ref 0 in
- for i = !lower_b to !upper_b - 1 do
+ let change = ref 0 in
+ for i = !lower_b to !upper_b - 1 do
begin
- let (i1, j1, l1) = a.(i) in
- let (i2, j2, l2) = a.(i+1) in
- if j2 < j1 then
+ let (i1, j1, l1) = a.(i) in
+ let (i2, j2, l2) = a.(i+1) in
+ if j2 < j1 then
begin
(* swaps, and takes cost into consideration *)
- let m = a.(i) in
+ let m = a.(i) in
a.(i) <- a.(i+1);
- a.(i+1) <- m;
- tot_mov := !tot_mov + l1 * l2;
+ a.(i+1) <- m;
+ tot_mov := !tot_mov + l1 * l2;
(* keeps track of the upper change in sort order *)
change := i
end
end
- done;
- upper_b := !change;
+ done;
+ upper_b := !change;
(* then we go down *)
- change := !upper_b;
- for i = !upper_b downto !lower_b + 1 do
- begin
- let (i2, j2, l2) = a.(i) in
- let (i1, j1, l1) = a.(i-1) in
- if j2 < j1 then
+ change := !upper_b;
+ for i = !upper_b downto !lower_b + 1 do
+ begin
+ let (i2, j2, l2) = a.(i) in
+ let (i1, j1, l1) = a.(i-1) in
+ if j2 < j1 then
begin
(* swaps, and takes cost into consideration *)
- let m = a.(i) in
+ let m = a.(i) in
a.(i) <- a.(i-1);
- a.(i-1) <- m;
- tot_mov := !tot_mov + l1 * l2;
+ a.(i-1) <- m;
+ tot_mov := !tot_mov + l1 * l2;
(* keeps track of the lower change in sort order *)
change := i
end
end
- done;
- lower_b := !change;
+ done;
+ lower_b := !change;
end
done;
(* computes the distance *)
- let mov' = float_of_int !tot_mov in
- let len' = (if l = 0 then 1.0 else float_of_int l) in
+ let mov' = float_of_int !tot_mov in
+ let len' = (if l = 0 then 1.0 else float_of_int l) in
(mov' /. len');;
-let edit_distance (edits: edit list) (l: int) : float =
+let edit_distance (edits: edit list) (l: int) : float =
let id_contr = contribution_insdel edits in
let mov_contr = contribution_mov edits l in
id_contr +. mov_contr
(** Unit test for edit distance *)
-if false then begin
+if false then begin
let e = [Mov (0, 0, 2); Mov (6, 4, 3); Del (2, 4); Ins (2, 2)] in
print_string (string_of_float (edit_distance e 20));
print_newline ();
let e = [Del (0, 2); Mov (2, 0, 3); Ins (3, 3)] in
print_string (string_of_float (edit_distance e 20));
print_newline ();
- let e = [Del (0, 2); Del (4, 2); Del (6, 1); Del (11, 3); Mov (2, 0, 2); Mov (7, 2, 2);
+ let e = [Del (0, 2); Del (4, 2); Del (6, 1); Del (11, 3); Mov (2, 0, 2); Mov (7, 2, 2);
Mov (9, 9, 2); Ins(4, 2); Ins (6, 3); Ins (11, 3)] in
print_string (string_of_float (edit_distance e 20));
print_newline ();
View
124 analysis/online_types.ml
@@ -34,37 +34,39 @@ POSSIBILITY OF SUCH DAMAGE.
*)
-(** This file contains types that are used by several modules of the
+(** This file contains types that are used by several modules of the
online WikiTrust implementation. It belongs to the batch implementation
- because the batch trust analysis needs to be able to prepare data
+ because the batch trust analysis needs to be able to prepare data
in the format used by the online analysis. *)
-TYPE_CONV_PATH "UCSC_WIKI_RESEARCH"
+TYPE_CONV_PATH "UCSC_WIKI_RESEARCH"
-(** Type of an author in the annotated text. Choose int if you wish to
- annotate text with author ids, and string if you wish to annotate
+open Sexplib.Conv
+
+(** Type of an author in the annotated text. Choose int if you wish to
+ annotate text with author ids, and string if you wish to annotate
with author names. *)
type author_t = string with sexp
-(** A chunk is a portion of text that used to be part of an article, but that
+(** A chunk is a portion of text that used to be part of an article, but that
has since been deleted. We associate a chunk list with each page. *)
type chunk_t = {
- (** The timestamp is the time at which the chunk was deleted from the page.
- This is to make it possible to delete chunks that have been deleted for
+ (** The timestamp is the time at which the chunk was deleted from the page.
+ This is to make it possible to delete chunks that have been deleted for
very long (otherwise, they could accumulate). *)
- mutable timestamp: float;
- (** Number of revisions for which a chunk has been deleted.
+ mutable timestamp: float;
+ (** Number of revisions for which a chunk has been deleted.
The purpose is similar to above *)
- mutable n_del_revisions: int;
- (** This is the array of words. Note that we store the words, not the
- seps. This because all we need to know of the deleted chunks is if they
+ mutable n_del_revisions: int;
+ (** This is the array of words. Note that we store the words, not the
+ seps. This because all we need to know of the deleted chunks is if they
are re-inserted, via text comparison, which is based on words. *)
- text: string array;
+ text: string array;
(** This is the trust of the text that has been deleted. *)
trust: float array;
(** These are the author signatures for the trust *)
sigs: Author_sig.packed_author_signature_t array;
- (** This is the revision_id where each word of the text of these
+ (** This is the revision_id where each word of the text of these
deleted chunks was first introduced. *)
origin: int array;
(** This is the author of each word *)
@@ -86,61 +88,61 @@ type page_sig_disk_t = (int * sig_t) list with sexp
(** These are the coefficients used for the evaluation. *)
type trust_coeff_t = {
(** Number of revision to use for trust computation *)
- mutable n_revs_to_consider : int;
+ mutable n_revs_to_consider : int;
(** Length of list of previous high reputation versions of the page *)
- mutable len_hi_rep_revs: int;
+ mutable len_hi_rep_revs: int;
(** Length of list of previous high trust versions of the page *)
- mutable len_hi_trust_revs: int;
+ mutable len_hi_trust_revs: int;
(** Threshold of reputation for an author to be included in a
hi-reputation list *)
mutable hi_rep_list_threshold: float;
(** Max time a chunk can be deleted before it is discarded *)
- mutable max_del_time_chunk : float;
+ mutable max_del_time_chunk : float;
(** max n. of revisions for which a chunk can be deleted before
being discarded *)
- mutable max_del_revs_chunk : int;
+ mutable max_del_revs_chunk : int;
(** Max n. of words in a deleted chunk (if longer, it is truncated) *)
mutable max_dead_chunk_len : int;
(** how much reputation is lent as trust for new text *)
- mutable lends_rep : float;
- (** how much the text of revised articles raises in trust towards the
+ mutable lends_rep : float;
+ (** how much the text of revised articles raises in trust towards the
reputation of the editor *)
- mutable read_all : float;
+ mutable read_all : float;
(** how much the text of revised articles, in the portion of article
directly edited, raises in trust towards the reputation of the
editor *)
- mutable read_part: float;
+ mutable read_part: float;
(** how much the trust of text is lost when text is deleted *)
- mutable kill_decrease: float;
+ mutable kill_decrease: float;
(** how much trust propagates from the edges of block moves *)
- mutable cut_rep_radius: float;
+ mutable cut_rep_radius: float;
(** The text of revised articles that is local to an edit increases
more in trust when revised (see read_part). This coefficient
says how fast this "locality" effect decays at the border of a
local area, into the non-local area. A value of 0 is perfectly
fine. *)
- mutable local_decay: float;
+ mutable local_decay: float;
(** scaling for reputation increments *)
- mutable rep_scaling: float;
+ mutable rep_scaling: float;
(** a function which returns a value based on how mature the page is. *)
mutable dynamic_rep_scaling: int -> int -> float;
(** maximum reputation *)
mutable max_rep: float;
(** Whether to equate anonymous users, regardless of their IP. *)
- mutable equate_anons: bool;
+ mutable equate_anons: bool;
(** Interval of time for nixing *)
- mutable nix_interval: float;
+ mutable nix_interval: float;
(** Negative quality below which nixing happens *)
mutable nix_threshold: float;
- (** The high-median of the reputations is used for the white value.
+ (** The high-median of the reputations is used for the white value.
We choose it so that 90% of work is done below that value. *)
mutable hi_median_perc: float;
- (** This is a similar median, but is used to renormalize the weights of
+ (** This is a similar median, but is used to renormalize the weights of
authors during the initial phase of a wiki *)
mutable hi_median_perc_boost: float;
(** This is the characteristic time, in seconds, of an edit.
If the time is much shorter than this, the effect on text trust is
- proportionately diminished; if the characteristic time is greater,
+ proportionately diminished; if the characteristic time is greater,
the effect on text trust is the full effect. *)
mutable edit_time_constant: float
};;
@@ -151,8 +153,8 @@ let n_past_revs = 8;;
(* We compute the reputation scaling dynamically taking care of the
size of the recent_revision list and the union of the recent
revision list, high reputation list and high trust list *)
-let default_dynamic_rep_scaling n_recent_revs max_n_recent_revs =
- let n_revs_judged = max 1 (min (n_recent_revs - 2) (max_n_recent_revs / 2)) in
+let default_dynamic_rep_scaling n_recent_revs max_n_recent_revs =
+ let n_revs_judged = max 1 (min (n_recent_revs - 2) (max_n_recent_revs / 2)) in
1. /. (float_of_int n_revs_judged)
let default_trust_coeff = {
@@ -168,8 +170,8 @@ let default_trust_coeff = {
read_part = 0.18;
kill_decrease = (log 2.0) /. 9.0;
cut_rep_radius = 2.0;
- local_decay = 0.5 ** (1. /. 10.);
- (* The reputation scaling is 73.24 when we use n_revs_to_consider = 12,
+ local_decay = 0.5 ** (1. /. 10.);
+ (* The reputation scaling is 73.24 when we use n_revs_to_consider = 12,
and varies quadratically with n_revs_to_consider - 1. *)
rep_scaling = 1. /. (73.24 *. ( ((float_of_int n_past_revs) -. 1.) /. 11.) ** 2.);
dynamic_rep_scaling = default_dynamic_rep_scaling;
@@ -181,19 +183,19 @@ let default_trust_coeff = {
hi_median_perc_boost = 0.7;
edit_time_constant = 24. *. 60. *. 60.; (* 1 day *)
};;
-
+
let get_default_coeff : trust_coeff_t = default_trust_coeff ;;
(** This is the quality information we store with revisions *)
type qual_info_t = {
(** Number of times the revision has been judged *)
- mutable n_edit_judges: int;
+ mutable n_edit_judges: int;
(** Total weight of the judges for the revision. *)
mutable judge_weight: float;
(** Total edit quality: the average is given by dividing by judge_weight *)
mutable total_edit_quality: float;
(** Minimum edit quality of all judgements *)
- mutable min_edit_quality: float;
+ mutable min_edit_quality: float;
(** Nix bit (see the techrep) *)
mutable nix_bit: bool;
(** Delta, or the amount of change done *)
@@ -213,25 +215,25 @@ let coeff_spam_reduction = 0.05
(** This is the type of an edit list, annotated *)
type edit_list_t = {
(** version of text analysis algo *)
- split_version : string;
+ split_version : string;
(** to which version *)
- to_version : int;
+ to_version : int;
(** the edit list proper *)
- editlist : Editlist.edit list
+ editlist : Editlist.edit list
} with sexp
type edit_lists_of_rev_t = edit_list_t list with sexp
(** This is the information associated with a page *)
-type page_info_t = {
- (** List of revision by hi rep authors:
+type page_info_t = {
+ (** List of revision by hi rep authors:
list of (rev_id, user_id, author_rep) triples *)
mutable past_hi_rep_revs : (int * int * float) list;
(** List of revision with high trust: list of (rev_id, rev_trust) pairs *)
- mutable past_hi_trust_revs : (int * float) list;
-} with sexp
+ mutable past_hi_trust_revs : (int * float) list;
+} with sexp
-let page_info_default = {
+let page_info_default = {
past_hi_rep_revs = [];
past_hi_trust_revs = [];
}
@@ -280,13 +282,13 @@ type request_type_t = Vote | Coloring
type wiki_page_t = {
page_id : int;
page_namespace : int;
- page_title : string;
+ page_title : string;
page_restrictions : string;
page_counter : int;
page_is_redirect : bool;
page_is_new : bool;
page_random : float;
- page_touched : string;
+ page_touched : string;
page_latest : int;
page_len : int
} with sexp
@@ -308,25 +310,25 @@ type wiki_revision_t = {
(** High-m%-Median of an array *)
let compute_hi_median (a: float array) (m: float) =
- let total = Array.fold_left (+.) 0. a in
- let mass_below = ref (total *. m) in
- let median = ref 0. in
- let i = ref 0 in
- while (!mass_below > 0.) && (!i < Eval_defs.max_rep_val) do begin
- if a.(!i) > !mass_below then begin
+ let total = Array.fold_left (+.) 0. a in
+ let mass_below = ref (total *. m) in
+ let median = ref 0. in
+ let i = ref 0 in
+ while (!mass_below > 0.) && (!i < Eval_defs.max_rep_val) do begin
+ if a.(!i) > !mass_below then begin
(* Median is in this column *)
median := !median +. !mass_below /. a.(!i);
- mass_below := 0.;
- end else begin
+ mass_below := 0.;
+ end else begin
(* Median is above this column *)
- mass_below := !mass_below -. a.(!i);
+ mass_below := !mass_below -. a.(!i);
i := !i + 1;
- median := !median +. 1.
+ median := !median +. 1.
end
end done;
!median
-let compute_reputation_median a =
+let compute_reputation_median a =
compute_hi_median a default_trust_coeff.hi_median_perc
(* Decides whether a user is a robot, checking against a hashtable,
Please sign in to comment.
Something went wrong with that request. Please try again.