
Commit f70cb6f

Initial release of cron job to do parallel analysis and generate a
dashboard.  See pipeline/README.md for details.

andychu committed Dec 18, 2015
1 parent 148b5a2

Showing 35 changed files with 4,108 additions and 2 deletions.
4 changes: 2 additions & 2 deletions bin/test.sh
@@ -100,7 +100,7 @@ EOF
cat _tmp/reports.csv _tmp/bad_rows.txt > _tmp/reports_bad_rows.csv

# Define a string variable and a boolean variable.
-cat >_tmp/schema.csv <<EOF
+cat >_tmp/rappor-vars.csv <<EOF
metric, var, var_type, params
m,domain,string,m_params
m,flag..HTTPS,boolean,m_params
@@ -132,7 +132,7 @@ EOF
decode-assoc() {
time ./decode-assoc \
--metric-name m \
-    --schema _tmp/schema.csv \
+    --schema _tmp/rappor-vars.csv \
--reports _tmp/reports.csv \
--params-dir _tmp \
--var1 domain \
52 changes: 52 additions & 0 deletions pipeline/README.md
@@ -0,0 +1,52 @@
pipeline
========

This directory contains tools and scripts for running a cron job that does
RAPPOR analysis and generates an HTML dashboard.

It works like this:

1. `task_spec.py` generates a text file where each line corresponds to a process
to be run (a "task"). The process is `bin/decode-dist` or
`bin/decode-assoc`. The line contains the task parameters.

2. `xargs -P` is used to run processes in parallel (see the sketch after this
   list). Our analysis is generally single-threaded (because R is
   single-threaded), so this helps utilize the machine fully. Each task places
   its output in a different subdirectory.

3. `cook.sh` calls `combine_results.py` to combine analysis results into a time
series. It also calls `combine_status.py` to keep track of task data for
"meta-analysis". `metric_status.R` generates more summary CSV files.

4. `ui.sh` calls `csv_to_html.py` to generate HTML fragments from the CSV
   files.

5. The JavaScript in `ui/ui.js` is loaded from static HTML, and makes AJAX calls
to retrieve the HTML fragments. The page is made interactive with
`ui/table-lib.js`.
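
For illustration, here is a minimal sketch of the step 2 pattern. The spec
file name, its column layout, and the worker script are hypothetical, not
the exact format `task_spec.py` emits:

    # Each line of the spec file holds the parameters for one task; xargs
    # slices off 5 tokens per invocation and keeps up to 6 workers running.
    cat _tmp/task_spec.txt \
      | xargs -n 5 -P 6 --no-run-if-empty -- ./run_one_task.sh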

`dist.sh` and `assoc.sh` contain functions which coordinate this process.

`alarm-lib.sh` is used to kill processes that have been running for too long.

Testing
-------

`pipeline/regtest.sh` contains end-to-end demos of this process. Right now it
depends on testdata from elsewhere in the tree:


rappor$ ./demo.sh run # prepare dist testdata
rappor$ cd bin

bin$ ./test.sh write-assoc-testdata # prepare assoc testdata
bin$ cd ../pipeline

pipeline$ ./regtest.sh dist
pipeline$ ./regtest.sh assoc

pipeline$ python -m SimpleHTTPServer # start a static web server

http://localhost:8000/_tmp/


124 changes: 124 additions & 0 deletions pipeline/alarm-lib.sh
@@ -0,0 +1,124 @@
#!/bin/bash
#
# Alarm tool.
#
# Usage:
#   ./alarm-lib.sh <function name>

# You can source this file and use the alarm-status function.

set -o nounset
set -o pipefail
set -o errexit

# Run a command with a timeout, and write its status to a file.
#
# Usage:
# alarm-status job_dir/STATUS 10 \
# flaky_command ...

alarm-status() {
set +o errexit
local status_file=$1
shift # everything except the status file goes to perl

  # NOTE: It would be nice to setpgrp() before exec; then the signal could be
  # delivered to the entire process group, as in kill -SIGALRM -PID.

# NOTE: If we did this in Python, the error message would also be clearer.
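  # How this works: 'alarm shift' arms SIGALRM using the first argument as a
  # timeout in seconds, and 'exec @ARGV' replaces the perl process with the
  # real command.  If that command is still running when the alarm fires,
  # SIGALRM kills it and the shell sees exit status 128 + 14 = 142, which the
  # case statement below turns into a TIMEOUT status.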
perl -e 'alarm shift; exec @ARGV or die "ERROR: after exec @ARGV"' "$@"
local exit_code=$?

set -o errexit

local result=''
case $exit_code in
0)
# Would be nice to show elapsed time?
result='OK'
;;
9)
# decode_assoc.R will exit 9 if there are no reports AFTER
# --remove-bad-rows. A task can also be marked SKIPPED before running
# the child process (see backfill.sh).
result='SKIPPED by child process'
;;
    # exit code 142 means SIGALRM. 128 + 14 = 142. See 'kill -l'.
142)
local seconds=$1
result="TIMEOUT after $seconds seconds"
;;
*)
result="FAIL with status $exit_code"
;;
esac
echo "$result"
echo "$result" > $status_file
}

_work() {
local n=10 # 2 seconds
for i in $(seq $n); do
echo $i - "$@"
sleep 0.2
done
}

_succeed() {
_work "$@"
exit 0
}

_fail() {
_work "$@"
exit 1
}

_skip() {
exit 9
}

# http://perldoc.perl.org/functions/alarm.html
#
# Delivers alarm. But how to get the process to have a distinct exit code?

demo() {
mkdir -p _tmp

# timeout
alarm-status _tmp/A 1 $0 _succeed foo
echo

# ok
alarm-status _tmp/B 3 $0 _succeed bar
echo

# fail
alarm-status _tmp/C 3 $0 _fail baz
echo

# skip
alarm-status _tmp/D 3 $0 _skip baz
echo

head _tmp/{A,B,C,D}
}

test-simple() {
alarm-status _tmp/status.txt 1 sleep 2
}

test-bad-command() {
alarm-status _tmp/status.txt 1 nonexistent_sleep 2
}

# BUG
test-perl() {
set +o errexit
perl -e 'alarm shift; exec @ARGV or die "ERROR after exec @ARGV"' 1 _sleep 2
echo $?
}

if test $(basename $0) = 'alarm-lib.sh'; then
"$@"
fi
148 changes: 148 additions & 0 deletions pipeline/assoc.sh
@@ -0,0 +1,148 @@
#!/bin/bash
#
# Usage:
# ./assoc.sh <function name>

set -o nounset
set -o pipefail
set -o errexit

readonly THIS_DIR=$(dirname $0)
readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)

source $RAPPOR_SRC/util.sh # log, banner
source $RAPPOR_SRC/pipeline/tools-lib.sh
source $RAPPOR_SRC/pipeline/alarm-lib.sh

# Run a single decode-assoc process, to analyze one variable pair for one
# metric. The arguments to this function are one row of the task spec.
decode-one() {
# Job constants, from decode-many
local rappor_src=$1
local timeout_secs=$2
local min_reports=$3
local job_dir=$4
local sample_size=$5

# Task spec variables, from task_spec.py
local num_reports=$6
local metric_name=$7
local date=$8 # for output naming only
local reports=$9 # file with reports
local var1=${10}
local var2=${11}
local map1=${12}
local output_dir=${13}
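  # NOTE: bash requires braces for positional parameters past $9, hence
  # ${10}..${13} above.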

local log_file=$output_dir/assoc-log.txt
local status_file=$output_dir/assoc-status.txt
mkdir --verbose -p $output_dir

  # Flags derived from job constants
local schema=$job_dir/config/rappor-vars.csv
local params_dir=$job_dir/config
local em_executable=$rappor_src/analysis/cpp/_tmp/fast_em

# TODO:
# - Skip jobs with few reports, like ./backfill.sh analyze-one.

# Output the spec for combine_status.py.
echo "$@" > $output_dir/assoc-spec.txt

# NOTE: Not passing --num-cores since we're parallelizing already.

# NOTE: --tmp-dir is the output dir. Then we just delete all the .bin files
# afterward so we don't copy them to x20 (they are big).

{ time \
alarm-status $status_file $timeout_secs \
$rappor_src/bin/decode-assoc \
--create-bool-map \
--remove-bad-rows \
--em-executable $em_executable \
--schema $schema \
--params-dir $params_dir \
--metric-name $metric_name \
--reports $reports \
--var1 $var1 \
--var2 $var2 \
--map1 $map1 \
--reports-sample-size $sample_size \
--tmp-dir $output_dir \
--output-dir $output_dir
} >$log_file 2>&1
}

test-decode-one() {
decode-one $RAPPOR_SRC
}

readonly DEFAULT_MIN_REPORTS=5000

#readonly DEFAULT_TIMEOUT_SECONDS=300 # 5 minutes as a quick test.
readonly DEFAULT_TIMEOUT_SECONDS=3600 # 1 hour

readonly DEFAULT_MAX_PROCS=6 # TODO: Share with backfill.sh

# Limit to 1M for now. Raise it when we have a full run.
readonly DEFAULT_SAMPLE_SIZE=1000000

readonly NUM_ARGS=8 # number of tokens in the task spec, used for xargs
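# For reference, the 8 tokens per spec line are: num_reports, metric_name,
# date, reports file, var1, var2, map1, output_dir (the task-spec arguments
# of decode-one above).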

# Run many decode-assoc processes in parallel.
decode-many() {
local job_dir=$1
local spec_list=$2

# These 3 params affect speed
local timeout_secs=${3:-$DEFAULT_TIMEOUT_SECONDS}
local sample_size=${4:-$DEFAULT_SAMPLE_SIZE}
local max_procs=${5:-$DEFAULT_MAX_PROCS}

local rappor_src=${6:-$RAPPOR_SRC}
local min_reports=${7:-$DEFAULT_MIN_REPORTS}

time cat $spec_list \
| xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \
$0 decode-one $rappor_src $timeout_secs $min_reports $job_dir $sample_size
}
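
# Example invocation of decode-many, with hypothetical paths (the remaining
# arguments fall back to the DEFAULT_* constants above):
#
#   ./assoc.sh decode-many ~/jobs/2015-12-17 ~/jobs/2015-12-17/assoc_spec.txt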

# Combine assoc results and render HTML.

combine-and-render-html() {
local jobs_base_dir=$1
local job_dir=$2

banner "Combining assoc task status"
TOOLS-cook combine-assoc-task-status $jobs_base_dir $job_dir

banner "Combining assoc results"
TOOLS-cook combine-assoc-results $jobs_base_dir $job_dir

banner "Splitting out status per metric, and writing overview"
TOOLS-cook assoc-metric-status $job_dir

TOOLS-gen-ui symlink-static assoc $job_dir

banner "Building overview .part.html from CSV"
TOOLS-gen-ui assoc-overview-part-html $job_dir

banner "Building metric .part.html from CSV"
TOOLS-gen-ui assoc-metric-part-html $job_dir

banner "Building pair .part.html from CSV"
TOOLS-gen-ui assoc-pair-part-html $job_dir

banner "Building day .part.html from CSV"
TOOLS-gen-ui assoc-day-part-html $job_dir
}

# List and remove temp files left over by the fast_em R <-> C++ interface.
list-and-remove-bin() {
local job_dir=$1
# If everything failed, we might not have anything to list/delete.
find $job_dir -name \*.bin | xargs --no-run-if-empty -- ls -l --si
find $job_dir -name \*.bin | xargs --no-run-if-empty -- rm -f --verbose
}

"$@"
