Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
796 lines (741 sloc) 31.1 KB
extern crate docopt;
extern crate rustc_serialize;
extern crate petgraph;
extern crate capngraph;
extern crate rand;
extern crate rayon;
#[macro_use]
extern crate slog;
extern crate slog_stream;
extern crate slog_term;
extern crate slog_json;
extern crate bit_set;
extern crate itertools;
extern crate rusty_machine;
#[macro_use]
extern crate ndarray;
extern crate openblas_src;
extern crate num_traits;
extern crate mvdist;
#[macro_use]
extern crate serde_derive;
extern crate serde_pickle;
extern crate serde_json;
extern crate nlopt;
#[cfg(test)]
#[macro_use]
extern crate quickcheck;
#[macro_use]
extern crate lazy_static;
mod permute;
mod arest;
mod objective;
mod ris;
mod cover;
mod im_simple;
mod stats;
mod kempe;
mod eps;
mod proper_im;
mod greedy;
mod ssa;
mod imm;
mod imm_obj;
use petgraph::prelude::*;
use slog::{Logger, DrainExt};
use docopt::Docopt;
use std::fs::File;
use bit_set::BitSet;
use std::collections::BinaryHeap;
use std::iter::FromIterator;
use rayon::prelude::*;
use itertools::join;
use capngraph::load_graph;
use ndarray::prelude::*;
use ndarray::stack;
use stats::{L2Norm, Covariance};
use eps::*;
use serde_pickle::ser as pickle;
use serde_json::to_string as json_string;
use arest::*;
use objective::*;
use im_simple::{generic_im, simple_im};
use cover::MaxCover;
use kempe::PseudoKempeIC;
use greedy::*;
use proper_im::RobustIM;
/// Crate name, baked in from Cargo metadata; shown by `--version` (see `main`).
const NAME: &'static str = env!("CARGO_PKG_NAME");
/// Crate version, baked in from Cargo metadata; shown by `--version`.
const VERSION: &'static str = env!("CARGO_PKG_VERSION");
/// docopt usage string. docopt derives both the CLI grammar and the `Args`
/// field names (`arg_*`, `flag_*`, `cmd_*`) from this text, so edit with care.
//
// Fixes: "conditiono" typo in the <beta> description, and the <problem>
// description previously referred to a nonexistent `<type>` placeholder.
#[cfg_attr(rustfmt, rustfmt_skip)]
const USAGE: &'static str = "
Curvature estimation tool.

Usage:
    curv sequences <problem> <graph> [options]
    curv solve-ratio <problem> <graph> <k> <alpha> <beta> <delta> [--diff-sequence] [--bias-steps <s>] [--vary-eps] [options]
    curv (-h | --help)
    curv --version

Arguments:
    <problem>           The problem/objective type to use.
    <graph>             Problem data to load.
    <alpha>             Relative change stopping condition for ε.
    <beta>              Relative change stopping condition for Γ.
    <delta>             Probability of error margin holding.
    <k>                 Solution size.

Ratio Solver Options:
    --diff-sequence     Output a sequence of ratios by varying the number of assumed differences
                        between the optimal and greedy solutions.
    --bias-steps <s>    Vary the (in)dependency bias in s steps from 0.0 to 1.0. When unset,
                        the bias is set to |D(x)|/|X| for each x in X. When combined
                        with --diff-sequence, outputs a matrix of ratios with the diff sequences as
                        the rows.
    --vary-eps          Solve with varying ε values. Note that this takes *significantly longer*.

Options:
    --threads <count>   Number of threads to use.
    --log <out>         Location to output the log file.
    --quiet             Print only errors and warnings to the terminal.
    -h --help           Show this screen.
    --version           Show version information.
";
/// Objective selected by the `<problem>` CLI argument. docopt decodes the
/// variant directly from its spelled-out name, hence the non-camel-case
/// variants. Dispatch happens in `do_sequences` / `do_solve_ratio`.
#[derive(Debug, RustcDecodable)]
#[allow(non_camel_case_types)]
enum ProblemType {
    /// AReST objective with random data (`AReSTObjective::new_random`).
    AReST,
    /// Simplified influence maximization (`im_simple` module).
    Simple_IM,
    /// Influence maximization via `proper_im::InfMax` (`ris::IC` sampler).
    Proper_IM,
    /// SSA-sampled influence maximization, `ris::IC` sampler.
    SSA_IC,
    /// SSA-sampled influence maximization, `ris::LT` sampler.
    SSA_LT,
    /// IMM-sampled influence maximization, `ris::IC` sampler.
    IMM_IC,
    /// IMM-sampled influence maximization, `ris::LT` sampler.
    IMM_LT,
    /// Maximum coverage on a JSON `cover::Instance`.
    MaxCover,
    /// `kempe::PseudoKempeIC` objective.
    Kempe_IC,
    /// Robust IM (`proper_im::RobustIM`) over both IC and LT samples.
    Robust_IC_LT,
}
/// Command-line arguments, decoded by docopt from `USAGE`.
#[derive(Debug, RustcDecodable)]
struct Args {
    // Positional arguments shared by both subcommands.
    arg_problem: ProblemType,
    arg_graph: String,
    // Positional arguments of `solve-ratio` only, hence `Option`.
    arg_alpha: Option<f64>,
    arg_beta: Option<f64>,
    arg_delta: Option<f64>,
    arg_k: Option<usize>,
    // Flags (see the Options sections of `USAGE`).
    flag_log: Option<String>,
    flag_diff_sequence: bool,
    flag_bias_steps: Option<usize>,
    flag_quiet: bool,
    flag_vary_eps: bool,
    flag_threads: Option<usize>,
    // Subcommand selectors.
    cmd_sequences: bool,
    cmd_solve_ratio: bool,
}
/// Unwrap an `Option`, early-returning `None` from the *enclosing function*
/// when the value is absent -- an `Option`-flavoured `try!`.
macro_rules! oh {
    ($e:expr) => {
        match $e {
            Some(x) => x,
            None => return None,
        }
    }
}
/// Compute the curvature sequence for a node `u` in radius 2 in a greedy
/// fashion. This is *not* the complete computation -- it relies on the curvatures of future
/// selections not getting significantly larger (yet).
///
/// Returns the `nabla` weights in greedy selection order. Errors from
/// `obj.depends` / `obj.insert_mut` propagate to the caller (e.g.
/// `CurvError::NoCandidates` for sink nodes -- see `estimate_distributions`).
fn rooted_greedy_curvature<O: Objective + Sync>(obj: &O,
                                                u: O::Element,
                                                log: &Logger)
                                                -> Result<Vec<f32>, CurvError<O>>
    where O::Element: Send + Sync,
          O::State: Send + Sync
{
    // Start from an empty solution and the objective's default state.
    let sol = BitSet::new();
    let mut state = O::State::default();
    let candidates = obj.depends(u, &state)?;

    // Weight every candidate by its initial nabla value, in parallel, then
    // heapify so we can repeatedly pull the maximum.
    let mut vec: Vec<WeightedNode<O>> = Vec::with_capacity(candidates.len());
    candidates.par_iter()
        .map(|&v| {
            WeightedNode {
                weight: obj.nabla(u, v, &sol, &state).unwrap(),
                node: v,
            }
        })
        .collect_into(&mut vec);
    let mut heap: BinaryHeap<WeightedNode<O>> = vec.into();
    let mut result = Vec::with_capacity(candidates.len());
    let mut sol = sol;
    // Greedy sweep: take the max-weight candidate, fold it into the solution
    // and state, then re-weigh every remaining candidate against the grown
    // solution before the next pick.
    while let Some(top) = heap.pop() {
        result.push(top.weight);
        sol.insert(top.node.into());
        obj.insert_mut(top.node, &mut state)?;
        // Recompute remaining weights in parallel; candidates whose nabla
        // errors out are logged and dropped rather than aborting the sweep.
        let mut vec: Vec<Option<WeightedNode<O>>> = Vec::with_capacity(candidates.len());
        heap.par_iter().map(|&ref wn| {
            let res = obj.nabla(u, wn.node, &sol, &state);
            if let Ok(nabla) = res {
                Some(WeightedNode {
                    weight: nabla,
                    node: wn.node
                })
            } else {
                warn!(log, "skipping node in reheap"; "node" => wn.node.into(), "reason" => String::from(res.unwrap_err()));
                None
            }
        }).collect_into(&mut vec);
        // TODO: doing the filtering here may throw away all the benefits of parallelism from the above.
        heap = vec.into_iter().filter_map(|x| x).collect();
    }
    Ok(result)
}
/// Represents a sequence of curvature values.
///
/// Ordered by `total` alone -- see the `PartialOrd`/`Ord` impls below.
#[derive(Debug, PartialEq, Clone)]
struct CurvSeq {
    /// The product of curvature values in `seq`
    total: f32,
    /// The sequence of curvatures *in reverse* (for fast appends)
    seq: Vec<(NodeIndex, f32)>,
}
// `total` is an f32, which is only `PartialEq`; `Eq` is asserted here so the
// type can live in ordered collections. This is unsound if `total` is NaN.
impl Eq for CurvSeq {
    // lies and damned lies...
}

// Comparison looks only at the curvature product; `seq` is ignored.
impl PartialOrd for CurvSeq {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        self.total.partial_cmp(&other.total)
    }
}

impl Ord for CurvSeq {
    // Panics (via the unwrap) if either `total` is NaN.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.partial_cmp(other).unwrap()
    }
}
/// Estimate the greedy curvature sequence for every element of `obj` in
/// parallel, logging each node's curvature product and full distribution.
fn estimate_distributions<O: Objective + Sync>(obj: O, log: &Logger)
    where O::Element: Send + Sync,
          O::State: Send + Sync
{
    obj.elements().into_par_iter().for_each(|u| {
        info!(log, "estimating distribution"; "node" => u.into());
        // Sink nodes have no candidates to select; warn and skip them.
        let curvatures = match rooted_greedy_curvature(&obj, u, &log) {
            Ok(seq) => seq,
            Err(CurvError::NoCandidates(v)) => {
                assert!(u == v);
                warn!(log, "skipping sink node"; "node" => v.into());
                return;
            }
            other => panic!("{:?}", other),
        };
        let product: f32 = curvatures.iter().cloned().product();
        info!(log, "curvature_product"; "node" => u.into(), "product" => product, "distribution" => join(curvatures, ","));
    });
}
/// Estimate the curvature bound Γ (per-step mean over sampled gamma
/// sequences) together with its error margin ε.
///
/// Samples gamma sequences for uniformly random elements outside `sol`,
/// roughly doubling the sample pool each round, until both relative-change
/// criteria hold: `a <= alpha` for ε and `b <= beta` for the mean Γ.
/// `delta` is the error-probability budget passed to `estimate_eps`; `bias`
/// and `vary` are forwarded to the samplers (see the CLI help).
///
/// Side effect: dumps the raw sample matrix to `samples.mat` (pickle).
fn estimate_gamma<O: Objective + Sync>(obj: &O,
                                       sol: &BitSet,
                                       state: &O::State,
                                       k: usize,
                                       alpha: f64,
                                       beta: f64,
                                       delta: f64,
                                       bias: Option<f32>,
                                       vary: bool,
                                       log: &Logger)
                                       -> Result<(Vec<f32>, Vec<f64>), CurvError<O>>
    where O::Element: Send + Sync,
          O::State: Sync
{
    use rand::{thread_rng, Rng};
    // One row per sampled gamma sequence, k columns (one per solution step).
    let mut samples: Array<f64, _> = Array::zeros((0, k));
    let mut next_samples = 5_000;
    let mut rng = thread_rng();
    let mut i = 0;
    // (ε, mean Γ) from the previous round; drives the convergence check.
    let mut prev_value = None;
    // Sampling pool: every element not already in the greedy solution.
    let elements = obj.elements()
        .into_iter()
        .filter(|&x| !sol.contains(x.into()))
        .collect::<Vec<_>>();
    info!(log, "beginning sampling for Γ estimation..."; "bias" => bias.map(|b| b.to_string()).unwrap_or("none".to_string()), "varying ε" => vary);
    let sampling_log = log.new(o!("section" => "gamma-estimation"));
    loop {
        info!(sampling_log, "beginning iteration"; "iter" => i, "next samples" => next_samples);
        // Draw `next_samples` random roots and sample a gamma sequence for
        // each in parallel. Sink roots (NoCandidates) contribute an all-ones
        // row instead of failing the round.
        let mut v: Vec<Result<_, _>> = Vec::with_capacity(next_samples);
        (0..next_samples)
            .map(|_| rng.choose(&elements).unwrap()) // unwrap because elements.len() != 0
            .collect::<Vec<_>>()
            .into_par_iter()
            .map(|&u| {
                match obj.sample_sequence(u, k, bias, sol, state) {
                    Ok(seq) => obj.gamma_seq(u, k, sol.clone(), seq, state.clone()),
                    Err(CurvError::NoCandidates(_)) => Ok(vec![1.0; k]),
                    Err(e) => Err(e),
                }
            })
            .collect_into(&mut v);
        debug!(sampling_log, "sampling for iteration complete"; "iter" => i);
        // Collect into a single Result: abort on the first sampling error.
        let flat: Result<Vec<Vec<f32>>, _> = v.into_iter().map(|val| val).collect();
        let flat = flat?;
        assert!(flat.len() == next_samples);
        let ar = Array::from_shape_vec((next_samples, k),
                                       flat.into_iter().flat_map(|x| x).collect::<Vec<f32>>())
            .unwrap();
        // Append the new rows to the accumulated sample matrix.
        samples = stack(Axis(0), &[samples.view(), ar.mapv(|f| f as f64).view()]).unwrap();
        let means = samples.mean(Axis(0));
        let covariance = samples.cov(); // always gives centered covariance
        info!(sampling_log, "covariance"; "cov" => json_string(&covariance).unwrap());
        let num_nans = samples.iter().filter(|f| f.is_nan()).count();
        if num_nans > 0 {
            crit!(sampling_log, "samples contain NaN values"; "count" => num_nans);
            panic!("samples contain NaN values");
        }
        let epsilon = if covariance != Array::zeros((k, k)) {
            info!(sampling_log, "estimating ε");
            // NOTE(review): `samples.len()` on a 2-D array is the *total*
            // element count (rows × k), while the other call sites use
            // `samples.len_of(Axis(0))` for the row count -- confirm which
            // one `estimate_eps` actually expects here.
            Array1::<f64>::from_vec(estimate_eps(&covariance,
                                                 means.len(),
                                                 samples.len(),
                                                 delta,
                                                 if let Some((ref eps, _)) = prev_value {
                                                     Some(eps)
                                                 } else {
                                                     None
                                                 },
                                                 vary,
                                                 log.new(o!("section" => "ε estimation")))
                .map_err(|s| CurvError::Other(s))?
                .1)
        } else {
            assert!(bias == Some(0.0));
            // no variance, this happens at the endpoint bias == 0.0
            Array1::<f64>::zeros(k)
        };
        debug!(sampling_log, "estimated bound"; "estimate" => json_string(&epsilon).unwrap(), "δ" => delta);
        if let Some((eps_p, gamma_p)) = prev_value {
            // a: relative change of ε; b: relative change of the mean Γ.
            let a: f64 = if epsilon.norm() > 0.0 {
                let upper: Array1<f64> = &epsilon - &eps_p;
                upper.norm() / epsilon.norm()
            } else {
                0.0
            };
            let upper: Array1<f64> = &means - &gamma_p;
            let b: f64 = upper.norm() / means.norm();
            if a <= alpha && b <= beta {
                info!(sampling_log, "bound converged"; "ε estimate" => json_string(&epsilon).unwrap(), "δ" => delta, "α" => a, "β" => b);
                prev_value = Some((epsilon, means));
                break;
            } else {
                info!(sampling_log, "insufficient samples; continuing";
                      "α" => a,
                      "β" => b,
                      "samples" => samples.len_of(Axis(0)),
                      "μ" => json_string(&means).unwrap());
            }
        } else {
            info!(sampling_log, "first pass completed"; "ε estimate" => json_string(&epsilon).unwrap(), "δ" => delta);
        }
        prev_value = Some((epsilon, means));
        i += 1;
        // Request as many new samples as we already hold, i.e. double the pool.
        next_samples = samples.len_of(Axis(0));
    }
    // Persist the raw samples for offline analysis.
    let mut f = File::create("samples.mat").unwrap();
    pickle::to_writer(&mut f, &samples, true).unwrap();
    let (epsilon, _) = prev_value.unwrap();
    let gamma = Vec::from_iter(samples.mean(Axis(0)).into_iter().map(|&f| f as f32));
    info!(log, "final estimated gamma"; "gamma" => json_string(&gamma).unwrap(), "ε" => json_string(&epsilon).unwrap());
    Ok((gamma, epsilon.into_iter().map(|&x| x).collect()))
}
/// Estimate the approximation-ratio sequence for the greedy solution `sol`.
///
/// First estimates the curvature bound Γ and its margin ε via
/// `estimate_gamma`, then accumulates λ over the (capped) per-step bounds and
/// records `1 / (1 + (f2 / f) * λ)` after each step, where `f` is the greedy
/// objective value and `f2` the best remaining single-element gain.
fn estimate_ratio<O: Objective + Sync>(obj: &O,
                                       sol: &BitSet,
                                       state: &O::State,
                                       f: f32,
                                       k: usize,
                                       alpha: f64,
                                       beta: f64,
                                       delta: f64,
                                       bias: Option<f32>,
                                       vary: bool,
                                       log: &Logger)
                                       -> Result<Vec<f32>, CurvError<O>>
    where O::Element: Send + Sync,
          O::State: Sync
{
    let (gamma, est) = estimate_gamma(obj, &sol, &state, k, alpha, beta, delta, bias, vary, &log)?;

    // Best marginal gain over every element outside the solution.
    let best = obj.elements()
        .into_iter()
        .filter(|&u| !sol.contains(u.into()))
        .map(|u| {
            WeightedNode {
                node: u,
                weight: obj.delta(u, &sol, &state).unwrap(), // TODO: don't unwrap
            }
        })
        .max()
        .unwrap();
    let f2 = best.weight;

    // Fold the per-step bounds (Γ_i + ε_i, capped by the objective's maximum
    // admissible gamma) into the running λ, recording the ratio at each step.
    let mut lambda = 1.0;
    let mut ratio_seq = Vec::with_capacity(k);
    for (g, eps) in gamma.into_iter().take(k).zip(est.into_iter()) {
        let bounded = (g + eps as f32).min(obj.max_gamma().unwrap_or(std::f32::INFINITY));
        lambda = lambda + bounded;
        ratio_seq.push(1.0 / (1.0 + (f2 / f) * lambda));
    }
    Ok(ratio_seq)
}
/// Handle the `sequences` subcommand: build the objective selected by
/// `<problem>` and run `estimate_distributions` over it.
///
/// NOTE(review): the `sequences` usage pattern does not accept `<k>`, yet the
/// SSA/Robust/IMM arms below call `args.arg_k.unwrap()` -- those arms panic
/// when reached via `sequences`. Confirm the intended behaviour.
fn do_sequences(args: Args, log: Logger) {
    match args.arg_problem {
        ProblemType::AReST => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            estimate_distributions(AReSTObjective::new_random(g, &log), &log)
        }
        ProblemType::Simple_IM => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // 10_000 simulation samples.
            estimate_distributions(simple_im(&g, 10_000, &log), &log)
        }
        ProblemType::Proper_IM => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            estimate_distributions(proper_im::InfMax::new::<ris::IC<_, _, _>>(&g,
                                                                              10_000,
                                                                              log.new(o!("section" => "proper_im"))),
                                   &log)
        }
        ProblemType::SSA_IC => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // SSA sampling with ε = 0.02 and δ = 1/n; `.1` drops the objective value.
            estimate_distributions(ssa::ssa_unweighted::<ris::IC<_, _, _>>(&g,
                                                                           0.02,
                                                                           1.0 / g.node_count() as f64,
                                                                           args.arg_k.unwrap(),
                                                                           log.new(o!("section" => "im_ssa")))
                                       .unwrap()
                                       .1,
                                   &log)
        }
        ProblemType::SSA_LT => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            estimate_distributions(ssa::ssa_unweighted::<ris::LT<_, _, _>>(&g,
                                                                           0.02,
                                                                           1.0 / g.node_count() as f64,
                                                                           args.arg_k.unwrap(),
                                                                           log.new(o!("section" => "im_ssa")))
                                       .unwrap()
                                       .1,
                                   &log)
        }
        ProblemType::Robust_IC_LT => {
            use std::f32::consts::E;
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // Sample both cascade models; each is weighted by its SSA value
            // scaled through the (1 - 1/e) greedy guarantee.
            let (f_ic, ic) =
                ssa::ssa_unweighted::<ris::IC<_, _, _>>(&g,
                                                        0.02,
                                                        1.0 / g.node_count() as f64,
                                                        args.arg_k.unwrap(),
                                                        log.new(o!("section" => "im_ssa")))
                    .unwrap();
            let (f_lt, lt) =
                ssa::ssa_unweighted::<ris::LT<_, _, _>>(&g,
                                                        0.02,
                                                        1.0 / g.node_count() as f64,
                                                        args.arg_k.unwrap(),
                                                        log.new(o!("section" => "im_ssa")))
                    .unwrap();
            estimate_distributions(RobustIM::new(&g,
                                                 vec![ic, lt],
                                                 vec![f_ic as f32 / (1.0 - 1.0 / E),
                                                      f_lt as f32 / (1.0 - 1.0 / E)],
                                                 log.new(o!("section" => "robust_ssa"))),
                                   &log)
        }
        ProblemType::IMM_IC => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            estimate_distributions(imm::imm_sampling::<ris::IC<_, _, _>>(&g,
                                                                         0.02,
                                                                         1.0,
                                                                         args.arg_k.unwrap(),
                                                                         log.new(o!("section" => "im_imm")))
                                       .unwrap(),
                                   &log)
        }
        ProblemType::IMM_LT => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            estimate_distributions(imm::imm_sampling::<ris::LT<_, _, _>>(&g,
                                                                         0.02,
                                                                         1.0,
                                                                         args.arg_k.unwrap(),
                                                                         log.new(o!("section" => "im_imm")))
                                       .unwrap(),
                                   &log)
        }
        ProblemType::Kempe_IC => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            estimate_distributions(PseudoKempeIC::new(g, 10_000, log.new(o!("section" => "kempe"))),
                                   &log)
        }
        ProblemType::MaxCover => {
            // MaxCover instances are JSON files, not capnp graphs.
            let data: cover::Instance =
                serde_json::from_reader(File::open(args.arg_graph.as_str()).unwrap()).unwrap();
            estimate_distributions(MaxCover::new(data.sets, true, &log), &log)
        }
    };
}
/// Run greedy on `obj`, then compute and log approximation-ratio estimates
/// for the resulting size-`k` solution.
///
/// With `bias == None` a single ratio sequence is produced; otherwise the
/// (in)dependency bias is swept over `bias` linspace steps in [0, 1] and the
/// results form a (steps × k) matrix. `diffseq` selects whether the whole
/// sequence or only the final ratio is logged.
fn compute_ratios<O: Objective + Sync>(obj: &O,
                                       k: usize,
                                       alpha: f64,
                                       beta: f64,
                                       delta: f64,
                                       bias: Option<usize>,
                                       diffseq: bool,
                                       vary: bool,
                                       log: &Logger)
    where O::Element: Sync + Send,
          O::State: Sync
{
    let ratio_log = log.new(o!("k" => k, "α" => alpha, "β" => beta, "δ" => delta));
    // Greedy solution of size k; `f` is its objective value.
    let (f, sol_vec, state) = greedy(obj, k, &log).unwrap();
    let sol = BitSet::from_iter(sol_vec.into_iter().map(|i| i.into()));
    // Ratio estimation for a single bias value.
    let ratios = |bias| {
        estimate_ratio(obj,
                       &sol,
                       &state,
                       f,
                       k,
                       alpha,
                       beta,
                       delta,
                       bias,
                       vary,
                       &ratio_log)
            .unwrap()
    };
    if bias.is_none() {
        let ratios = ratios(None);
        if diffseq {
            info!(&ratio_log, "approximation ratio"; "ratios" => json_string(&ratios).unwrap(), "diff sequence" => true);
        } else {
            // Only the ratio at the full solution size.
            info!(&ratio_log, "approximation ratio"; "ratio" => *ratios.last().unwrap());
        }
    } else {
        let num_steps = bias.unwrap();
        let steps = Array::linspace(0., 1., num_steps);
        let mut seq = Vec::with_capacity(num_steps * k);
        for &step in &steps {
            let mut rat = ratios(Some(step));
            seq.append(&mut rat);
        }
        // One row per bias step, one column per solution step.
        let ratios = Array::from_vec(seq).into_shape((num_steps, k)).unwrap();
        if diffseq {
            info!(&ratio_log, "approximation ratio"; "diff sequence" => true, "bias steps" => num_steps, "ratios" => json_string(&ratios).unwrap());
        } else {
            // `s![.., -1..]` keeps only the last column (ratio at step k).
            // NOTE(review): message is "approximation_ratio" here but
            // "approximation ratio" in the other three arms -- confirm whether
            // log consumers depend on either spelling before unifying.
            info!(&ratio_log, "approximation_ratio"; "bias steps" => num_steps, "ratios" => json_string(&ratios.slice(s![.., -1..])).unwrap());
        }
    }
}
/// Handle the `solve-ratio` subcommand: unwrap the solve-ratio positional
/// arguments (all guaranteed present by the usage pattern), build the
/// selected objective and hand it to `compute_ratios`.
fn do_solve_ratio(args: Args, log: Logger) {
    let k = args.arg_k.unwrap();
    let alpha = args.arg_alpha.unwrap();
    let beta = args.arg_beta.unwrap();
    let delta = args.arg_delta.unwrap();
    let bias = args.flag_bias_steps;
    let diffseq = args.flag_diff_sequence;
    let vary = args.flag_vary_eps;
    match args.arg_problem {
        ProblemType::AReST => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            compute_ratios(&AReSTObjective::new_random(g, &log),
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
        ProblemType::Simple_IM => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // let g = im_simple::reweight_indegree(&g);
            // 10_000 RIS (IC) samples via the generic IM objective.
            compute_ratios(&generic_im::<_, _, ris::IC<_, _, _>>(&g, 10_000, &log),
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
        ProblemType::Proper_IM => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // let g = im_simple::reweight_indegree(&g);
            compute_ratios(&proper_im::InfMax::new::<ris::IC<_, _, _>>(&g,
                                                                       10_000,
                                                                       log.new(o!("section" => "proper_im"))),
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
        ProblemType::SSA_IC => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // let g = im_simple::reweight_indegree(&g);
            // SSA sampling with ε = 0.02 and δ = 1/n; `.1` drops the value.
            compute_ratios(&ssa::ssa_unweighted::<ris::IC<_,
                                                          _,
                                                          _>>(&g,
                                                              0.02,
                                                              1.0 / g.node_count() as f64,
                                                              k,
                                                              log.new(o!("section" => "ssa", "model" => "IC")))
                                .unwrap()
                                .1,
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
        ProblemType::SSA_LT => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // let g = im_simple::reweight_indegree(&g);
            compute_ratios(&ssa::ssa_unweighted::<ris::LT<_,
                                                          _,
                                                          _>>(&g,
                                                              0.02,
                                                              1.0 / g.node_count() as f64,
                                                              k,
                                                              log.new(o!("section" => "ssa", "model" => "LT")))
                                .unwrap()
                                .1,
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
        ProblemType::Robust_IC_LT => {
            use std::f32::consts::E;
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // Sample both cascade models; each weighted by its SSA value
            // scaled through the (1 - 1/e) greedy guarantee.
            // (`args.arg_k.unwrap()` below is the same `k` unwrapped above.)
            let (f_ic, ic) =
                ssa::ssa_unweighted::<ris::IC<_, _, _>>(&g,
                                                        0.02,
                                                        1.0 / g.node_count() as f64,
                                                        args.arg_k.unwrap(),
                                                        log.new(o!("section" => "im_ssa")))
                    .unwrap();
            let (f_lt, lt) =
                ssa::ssa_unweighted::<ris::LT<_, _, _>>(&g,
                                                        0.02,
                                                        1.0 / g.node_count() as f64,
                                                        args.arg_k.unwrap(),
                                                        log.new(o!("section" => "im_ssa")))
                    .unwrap();
            compute_ratios(&RobustIM::new(&g,
                                          vec![ic, lt],
                                          vec![f_ic as f32 / (1.0 - 1.0 / E),
                                               f_lt as f32 / (1.0 - 1.0 / E)],
                                          log.new(o!("section" => "robust_ssa"))),
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
        ProblemType::IMM_IC => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // let g = im_simple::reweight_indegree(&g);
            compute_ratios(&imm::imm_sampling::<ris::IC<_, _, _>>(&g,
                                                                  0.02,
                                                                  1.0,
                                                                  k,
                                                                  log.new(o!("section" => "imm", "model" => "IC")))
                                .unwrap(),
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
        ProblemType::IMM_LT => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // let g = im_simple::reweight_indegree(&g);
            compute_ratios(&imm::imm_sampling::<ris::LT<_, _, _>>(&g,
                                                                  0.02,
                                                                  1.0,
                                                                  k,
                                                                  log.new(o!("section" => "imm", "model" => "LT")))
                                .unwrap(),
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
        ProblemType::Kempe_IC => {
            let g = load_graph(args.arg_graph.as_str()).unwrap();
            // let g = im_simple::reweight_indegree(&g);
            compute_ratios(&PseudoKempeIC::new(g, 10_000, log.new(o!("section" => "kempe"))),
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
        ProblemType::MaxCover => {
            // MaxCover instances are JSON files, not capnp graphs.
            let data: cover::Instance =
                serde_json::from_reader(File::open(args.arg_graph.as_str()).unwrap()).unwrap();
            compute_ratios(&MaxCover::new(data.sets, true, &log),
                           k,
                           alpha,
                           beta,
                           delta,
                           bias,
                           diffseq,
                           vary,
                           &log)
        }
    };
}
/// Entry point: parse the CLI, optionally cap the rayon thread pool,
/// assemble the slog pipeline, then dispatch to the chosen subcommand.
fn main() {
    let args: Args = Docopt::new(USAGE)
        .and_then(|d| d.version(Some(format!("{} {}", NAME, VERSION))).decode())
        .unwrap_or_else(|e| e.exit());

    // Honour --threads by configuring rayon's global pool before any work.
    if let Some(limit) = args.flag_threads {
        rayon::initialize(rayon::Configuration::new().set_num_threads(limit)).unwrap();
    }

    let log = {
        // Terminal output, quieted to warnings/errors under --quiet.
        let level = if args.flag_quiet {
            slog::Level::Warning
        } else {
            slog::Level::Trace
        };
        let term_drain = slog::LevelFilter::new(slog_term::streamer().color().compact().build(),
                                                level);
        match args.flag_log {
            // With --log, duplicate every record into a JSON log file.
            Some(ref logpath) => {
                let file_drain = slog_stream::stream(File::create(logpath).unwrap(),
                                                     slog_json::default());
                Logger::root(slog::Duplicate::new(term_drain, file_drain).fuse(),
                             o!("version" => env!("CARGO_PKG_VERSION")))
            }
            None => Logger::root(term_drain.fuse(), o!("version" => env!("CARGO_PKG_VERSION"))),
        }
    };

    if args.cmd_sequences {
        do_sequences(args, log);
    } else if args.cmd_solve_ratio {
        do_solve_ratio(args, log);
    }
}