Initial release of cron job to do parallel analysis and generate a
dashboard. See pipeline/README.md for details.
35 changed files with 4,108 additions and 2 deletions.
pipeline/README.md
pipeline
========

This directory contains tools and scripts for running a cron job that does
RAPPOR analysis and generates an HTML dashboard.

It works like this:
1. `task_spec.py` generates a text file where each line corresponds to a
   process to be run (a "task").  The process is `bin/decode-dist` or
   `bin/decode-assoc`, and the line contains the task parameters.

2. `xargs -P` is used to run processes in parallel (see the sketch after this
   list).  Our analysis is generally single-threaded, because R is
   single-threaded, so this helps utilize the machine fully.  Each task places
   its output in a different subdirectory.

3. `cook.sh` calls `combine_results.py` to combine analysis results into a
   time series.  It also calls `combine_status.py` to keep track of task data
   for "meta-analysis".  `metric_status.R` generates more summary CSV files.

4. `ui.sh` calls `csv_to_html.py` to generate HTML fragments from the CSV
   files.

5. The JavaScript in `ui/ui.js` is loaded from static HTML, and makes AJAX
   calls to retrieve the HTML fragments.  The page is made interactive with
   `ui/table-lib.js`.
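
For illustration, here is a minimal sketch of the pattern in steps 1 and 2.
The `task_spec.py` flags and the `-n 5` token count are hypothetical; see
`dist.sh` and `assoc.sh` for the real invocations:

    # Write one line of task parameters per analysis to run
    # (hypothetical flags).
    python task_spec.py dist > _tmp/spec-list.txt

    # Fan out with xargs: each line becomes the arguments of one
    # decode-dist process, with at most 8 running in parallel.
    # -n must match the number of tokens per spec line.
    cat _tmp/spec-list.txt \
      | xargs -n 5 -P 8 --no-run-if-empty -- ./bin/decode-dist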

`dist.sh` and `assoc.sh` contain functions which coordinate this process.

`alarm-lib.sh` is used to kill processes that have been running for too long.
Testing
-------

`pipeline/regtest.sh` contains end-to-end demos of this process.  Right now it
depends on testdata from elsewhere in the tree:

    rappor$ ./demo.sh run                 # prepare dist testdata
    rappor$ cd bin

    bin$ ./test.sh write-assoc-testdata   # prepare assoc testdata
    bin$ cd ../pipeline

    pipeline$ ./regtest.sh dist
    pipeline$ ./regtest.sh assoc

    pipeline$ python -m SimpleHTTPServer  # start a static web server

    http://localhost:8000/_tmp/
pipeline/alarm-lib.sh
#!/bin/bash
#
# Alarm tool.
#
# Usage:
#   ./alarm-lib.sh <function name>
#
# You can also source this file and use the alarm-status function.

set -o nounset
set -o pipefail
set -o errexit

# Run a command with a timeout, and print its status to a directory.
#
# Usage:
#   alarm-status job_dir/STATUS 10 \
#     flaky_command ...
alarm-status() {
  set +o errexit
  local status_file=$1
  shift  # everything except the status file goes to perl

  # NOTE: It would be nice to setpgrp() before exec; then the signal could be
  # delivered to the entire process group, as with kill -SIGALRM -PID.

  # NOTE: If we did this in Python, the error message would also be clearer.
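
  # How the timeout works: perl's alarm() arms a SIGALRM for the given number
  # of seconds, and exec() replaces the perl process with the real command.
  # The pending alarm survives exec(), so the command is killed if it is still
  # running when the timer fires (exit status 142 = 128 + SIGALRM under bash).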
  perl -e 'alarm shift; exec @ARGV or die "ERROR: after exec @ARGV"' "$@"
  local exit_code=$?

  set -o errexit

  local result=''
  case $exit_code in
    0)
      # Would be nice to show elapsed time?
      result='OK'
      ;;
    9)
      # decode_assoc.R will exit 9 if there are no reports AFTER
      # --remove-bad-rows.  A task can also be marked SKIPPED before running
      # the child process (see backfill.sh).
      result='SKIPPED by child process'
      ;;
    # Exit code 142 means SIGALRM: 128 + 14 = 142.  See 'kill -l'.
    142)
      local seconds=$1
      result="TIMEOUT after $seconds seconds"
      ;;
    *)
      result="FAIL with status $exit_code"
      ;;
  esac
  echo "$result"
  echo "$result" > $status_file
}

_work() {
  local n=10  # 2 seconds
  for i in $(seq $n); do
    echo $i - "$@"
    sleep 0.2
  done
}

_succeed() {
  _work "$@"
  exit 0
}

_fail() {
  _work "$@"
  exit 1
}

_skip() {
  exit 9
}

# http://perldoc.perl.org/functions/alarm.html
#
# Delivers alarm.  But how to get the process to have a distinct exit code?
demo() {
  mkdir -p _tmp

  # timeout
  alarm-status _tmp/A 1 $0 _succeed foo
  echo

  # ok
  alarm-status _tmp/B 3 $0 _succeed bar
  echo

  # fail
  alarm-status _tmp/C 3 $0 _fail baz
  echo

  # skip
  alarm-status _tmp/D 3 $0 _skip baz
  echo

  head _tmp/{A,B,C,D}
}
test-simple() {
  alarm-status _tmp/status.txt 1 sleep 2
}

test-bad-command() {
  alarm-status _tmp/status.txt 1 nonexistent_sleep 2
}

# BUG
test-perl() {
  set +o errexit
  perl -e 'alarm shift; exec @ARGV or die "ERROR after exec @ARGV"' 1 _sleep 2
  echo $?
}

if test $(basename $0) = 'alarm-lib.sh'; then
  "$@"
fi
pipeline/assoc.sh
#!/bin/bash
#
# Usage:
#   ./assoc.sh <function name>

set -o nounset
set -o pipefail
set -o errexit

readonly THIS_DIR=$(dirname $0)
readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)

source $RAPPOR_SRC/util.sh  # log, banner
source $RAPPOR_SRC/pipeline/tools-lib.sh
source $RAPPOR_SRC/pipeline/alarm-lib.sh
# Run a single decode-assoc process, to analyze one variable pair for one
# metric.  The arguments to this function are one row of the task spec.
decode-one() {
  # Job constants, from decode-many
  local rappor_src=$1
  local timeout_secs=$2
  local min_reports=$3
  local job_dir=$4
  local sample_size=$5

  # Task spec variables, from task_spec.py
  local num_reports=$6
  local metric_name=$7
  local date=$8     # for output naming only
  local reports=$9  # file with reports
  local var1=${10}
  local var2=${11}
  local map1=${12}
  local output_dir=${13}
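
  # Note: 5 job constants plus 8 task-spec tokens = 13 positional args.  The
  # 8 spec tokens must stay in sync with NUM_ARGS below, which tells xargs how
  # many tokens to consume per decode-one invocation.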

  local log_file=$output_dir/assoc-log.txt
  local status_file=$output_dir/assoc-status.txt
  mkdir --verbose -p $output_dir

  # Flags derived from job constants
  local schema=$job_dir/config/rappor-vars.csv
  local params_dir=$job_dir/config
  local em_executable=$rappor_src/analysis/cpp/_tmp/fast_em

  # TODO:
  # - Skip jobs with few reports, like ./backfill.sh analyze-one.

  # Output the spec for combine_status.py.
  echo "$@" > $output_dir/assoc-spec.txt

  # NOTE: Not passing --num-cores since we're parallelizing already.

  # NOTE: --tmp-dir is the output dir.  Then we just delete all the .bin
  # files afterward so we don't copy them to x20 (they are big).

  { time \
      alarm-status $status_file $timeout_secs \
        $rappor_src/bin/decode-assoc \
          --create-bool-map \
          --remove-bad-rows \
          --em-executable $em_executable \
          --schema $schema \
          --params-dir $params_dir \
          --metric-name $metric_name \
          --reports $reports \
          --var1 $var1 \
          --var2 $var2 \
          --map1 $map1 \
          --reports-sample-size $sample_size \
          --tmp-dir $output_dir \
          --output-dir $output_dir
  } >$log_file 2>&1
}

test-decode-one() {
  decode-one $RAPPOR_SRC
}

readonly DEFAULT_MIN_REPORTS=5000

#readonly DEFAULT_TIMEOUT_SECONDS=300  # 5 minutes as a quick test.
readonly DEFAULT_TIMEOUT_SECONDS=3600  # 1 hour

readonly DEFAULT_MAX_PROCS=6  # TODO: Share with backfill.sh

# Limit to 1M for now.  Raise it when we have a full run.
readonly DEFAULT_SAMPLE_SIZE=1000000

readonly NUM_ARGS=8  # number of tokens in the task spec, used for xargs

# Run many decode-assoc processes in parallel.
decode-many() {
  local job_dir=$1
  local spec_list=$2

  # These 3 params affect speed
  local timeout_secs=${3:-$DEFAULT_TIMEOUT_SECONDS}
  local sample_size=${4:-$DEFAULT_SAMPLE_SIZE}
  local max_procs=${5:-$DEFAULT_MAX_PROCS}

  local rappor_src=${6:-$RAPPOR_SRC}
  local min_reports=${7:-$DEFAULT_MIN_REPORTS}

  time cat $spec_list \
    | xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \
        $0 decode-one $rappor_src $timeout_secs $min_reports $job_dir $sample_size
}
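
# Example invocation (paths are hypothetical):
#
#   ./assoc.sh decode-many _tmp/job1 _tmp/job1/spec-list.txt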

# Combine assoc results and render HTML.
combine-and-render-html() {
  local jobs_base_dir=$1
  local job_dir=$2

  banner "Combining assoc task status"
  TOOLS-cook combine-assoc-task-status $jobs_base_dir $job_dir

  banner "Combining assoc results"
  TOOLS-cook combine-assoc-results $jobs_base_dir $job_dir

  banner "Splitting out status per metric, and writing overview"
  TOOLS-cook assoc-metric-status $job_dir

  TOOLS-gen-ui symlink-static assoc $job_dir

  banner "Building overview .part.html from CSV"
  TOOLS-gen-ui assoc-overview-part-html $job_dir

  banner "Building metric .part.html from CSV"
  TOOLS-gen-ui assoc-metric-part-html $job_dir

  banner "Building pair .part.html from CSV"
  TOOLS-gen-ui assoc-pair-part-html $job_dir

  banner "Building day .part.html from CSV"
  TOOLS-gen-ui assoc-day-part-html $job_dir
}
# Temp files left over by the fast_em R <-> C++ interface.
list-and-remove-bin() {
  local job_dir=$1
  # If everything failed, we might not have anything to list/delete.
  find $job_dir -name \*.bin | xargs --no-run-if-empty -- ls -l --si
  find $job_dir -name \*.bin | xargs --no-run-if-empty -- rm -f --verbose
}

"$@"