#!/bin/bash
# bashreduce: mapreduce in bash
# erik@fawx.com
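#
# example invocation (hypothetical hostnames and filenames, assuming this
# script is on your PATH as "br"):
#   br -m "host1 host1 host2" -r "uniq -c" -i words.txt -o counts.txt
# repeating a host runs one worker per repetition, e.g. one per core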
function usage() {
  printf "Usage: %s: [-m host1 [host2...]] [-c column] [-r reduce] [-i input] [-o output]\n" `basename $1`
  printf "  %s -h for help.\n" `basename $1`
  exit 2
}
function showhelp() {
  printf "Usage: %s: [-m host1 [host2...]] [-c column] [-r reduce] [-i input] [-o output]\n" `basename $1`
  echo "bashreduce. Map an input file to many hosts, sort/reduce, merge"
  echo "  -m: hosts to use, can repeat hosts for multiple cores"
  echo "      default hosts from /etc/br.hosts"
  echo "  -c: column to partition, default = 1 (1-based)"
  echo "  -r: reduce function, default = identity"
  echo "  -i: input file, default = stdin"
  echo "  -o: output file, default = stdout"
  echo "  -t: tmp dir to use, default = /tmp"
  echo "  -S: memory to use for sort, default = 256M"
  echo "  -h: this help message"
  exit 2
}
hosts=
mapcolumn=1
reduce=
input=
output=
tmp_dir=/tmp
sort_mem=256M
while getopts "m:c:r:i:o:t:S:h" name
do
  case $name in
    m)   hosts=$OPTARG;;
    c)   mapcolumn=$OPTARG;;
    r)   reduce=$OPTARG;;
    i)   input=$OPTARG;;
    o)   output=$OPTARG;;
    t)   tmp_dir=$OPTARG;;
    S)   sort_mem=$OPTARG;;
    h)   showhelp $0;;
    [?]) usage $0;;
  esac
done
if [[ -z $hosts ]]
then
  if [[ -e /etc/br.hosts ]]
  then
    hosts=`cat /etc/br.hosts`
  else
    printf "%s: must specify hosts with -m or provide /etc/br.hosts\n" `basename $0`
    usage $0
  fi
fi
if [[ ! -z $reduce ]]
then
  # prefix with a pipe so $reduce can be spliced directly into the remote sort pipeline below
  reduce="| "$reduce
fi
# okay let's get started! first we need a name for our job
jobid=`uuidgen`
jobpath="$tmp_dir/br_job_$jobid"
nodepath="$tmp_dir/br_node_$jobid"
mkdir $jobpath
mkdir $jobpath/in
mkdir $jobpath/out
# now, for each host, set up in and out fifos (and a netcat for each), and ssh to each host to set up workers listening on netcat
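#
# per-host plumbing, one in/out fifo pair and two local netcats per worker:
#
#   map output -> $jobpath/out/N fifo -> local nc -> remote nc -l $port_out
#     -> $nodepath/in_N -> tail -f | sort [| reduce] -> remote nc -l $port_in
#     -> local nc -> $jobpath/in/N fifo -> merge step at the end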
port_in=8192
port_out=`expr $port_in + 1`
host_idx=0
out_files=
for host in $hosts
do
  # our named pipes
  mkfifo $jobpath/in/$host_idx
  mkfifo $jobpath/out/$host_idx
  # let's get the pid of our listener: start nc in the background and scrape
  # its pid out of "jobs -l"
  ssh -n $host "mkdir -p $nodepath"
  pid=`ssh -n $host "nc -l -p $port_out > $nodepath/in_$host_idx 2> /dev/null < /dev/null & jobs -l" | awk '{print $2}'`
  # tail --pid makes the sort/reduce pipeline exit when that listener dies
  ssh $host -n "tail -s0.1 -f --pid=$pid $nodepath/in_$host_idx 2> /dev/null < /dev/null | LC_ALL='$LC_ALL' sort -S$sort_mem -T$tmp_dir -k$mapcolumn,$mapcolumn $reduce 2>/dev/null | nc -q0 -l -p $port_in >& /dev/null &"
  # our local forwarders
  nc $host $port_in > $jobpath/in/$host_idx &
  nc -q0 $host $port_out < $jobpath/out/$host_idx &
  # our vars
  out_files="$out_files $jobpath/out/$host_idx"
  port_in=`expr $port_in + 2`
  port_out=`expr $port_in + 1`
  host_idx=`expr $host_idx + 1`
done
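# every worker is now listening; ports were handed out in consecutive pairs
# per worker slot: (8192,8193), (8194,8195), ...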
# okay, time to map
if [[ -z `which brp` ]]
then
  # use awk if we don't have brp
  # we're taking advantage of a special property of awk: it leaves its file
  # handles open until it's done -- i think this is universal
  # we're also sending a zero-length string to all the handles at the end,
  # in case some pipe got no love
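  # note: srand seeds rand() from the partition key's numeric value, so
  # rand() is deterministic per key and every row with a given key lands
  # on the same worker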
mapfunction="{
srand(\$$mapcolumn);
print \$0 >> \"$jobpath/out/\"int(rand()*$host_idx);
}
END {
for (i = 0; i != $host_idx; ++i)
{
printf \"\" >> \"$jobpath/out/\"i
}
}"
  if [[ -z $input ]]
  then
    awk "$mapfunction"
  else
    pv $input | awk "$mapfunction"
  fi
else
  if [[ -z $input ]]
  then
    brp - `expr $mapcolumn - 1` $out_files
  else
    pv $input | brp - `expr $mapcolumn - 1` $out_files
  fi
fi
# now merge the sorted streams back together and save the result somewhere
if [[ -z `which brm` ]]
then
  # use sort -m if we don't have brm
  # sort -m creates tmp files if too many input files are specified
  # brm doesn't do this
  if [[ -z $output ]]
  then
    sort -k$mapcolumn,$mapcolumn -m $jobpath/in/*
  else
    sort -k$mapcolumn,$mapcolumn -m $jobpath/in/* | pv > $output
  fi
else
  if [[ -z $output ]]
  then
    brm - `expr $mapcolumn - 1` `find $jobpath/in/ -type p | xargs`
  else
    brm - `expr $mapcolumn - 1` `find $jobpath/in/ -type p | xargs` | pv > $output
  fi
fi
# finally, clean up after ourselves
rm -rf $jobpath
for host in $hosts
do
  ssh $host "rm -rf $nodepath"
done
# TODO: is there a safe way to kill subprocesses upon fail?
# this seems to work: /bin/kill -- -$$
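# a possible (untested) sketch: run that kill from a trap, e.g.
#   trap '/bin/kill -- -$$' ERR
# so any failing step also tears down the local forwarders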