Skip to content


Subversion checkout URL

You can clone with
Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

85 lines (84 sloc) 3.651 kB
# This is basically the config file we use in production at Yelp, with some
# strategic edits. ;)
# If you don't have the yaml module installed, you'll have to use JSON instead,
# which would look something like this:
# {"runners": {
# "emr": {
# "aws_access_key_id": "HADOOPHADOOPBOBADOOP",
# "aws_region": "us-west-1",
# "base_tmp_dir": "/scratch/$USER",
# "bootstrap_python_packages": [
# "$BT/aws/python-packages/*.tar.gz"
# ],
# ...
# NOTE(review): the section keys (runners, emr, hadoop, local) and the
# bootstrap_python_packages / cmdenv keys were absent from this copy. They are
# restored here from the JSON sketch in the header comment and from mrjob's
# option names. Confirm against the deployed config.
runners:
  emr:
    # NOTE(review): no AWS credentials are set in this section; confirm they
    # are supplied some other way before relying on this file.
    # We run in the west region because we're located on the west coast,
    # and there are no eventual consistency issues with newly created S3 keys.
    aws_region: us-west-1
    # alternate tmp dir
    base_tmp_dir: /scratch/$USER
    # $BT is the path to our source tree. This lets us add modules to
    # install on EMR by simply dumping them in this dir.
    bootstrap_python_packages:
      - $BT/aws/python-packages/*.tar.gz
    # specifying an ssh key pair allows us to ssh tunnel to the job tracker
    # and fetch logs via ssh
    ec2_key_pair: EMR
    ec2_key_pair_file: $BT/config/EMR.pem
    # use beefier instances in production
    ec2_instance_type: c1.xlarge
    # but only use one unless overridden
    num_ec2_instances: 1
    # use our local time zone (this is important for deciding when
    # days start and end, for instance). Environment variables for the job
    # go under cmdenv rather than beside the runner options.
    cmdenv:
      TZ: America/Los_Angeles
    # we create the src-tree.tar.gz tarball with a Makefile. It only contains
    # a subset of our code
    python_archives: &python_archives
      - $BT/aws/src-tree.tar.gz
    # our bucket also lives in the us-west region
    s3_log_uri: s3://walrus/tmp/logs/
    s3_scratch_uri: s3://walrus/tmp/
    setup_cmds: &setup_cmds
      # these files are different between dev and production, so they're
      # uploaded separately. copying them into place isn't safe because
      # src-tree.tar.gz is actually shared between several mappers/reducers.
      # Another safe approach would be to add a rule to Makefile.emr that
      # copies these files if they haven't already been copied (setup_cmds
      # from two mappers/reducers won't run simultaneously on the same
      # machine)
      # NOTE(review): the next two commands are truncated in this copy (the
      # file names and the closing parenthesis are missing) -- restore them
      # from the real config before use.
      - ln -sf $(readlink -f src-tree.tar.gz/config/
      - ln -sf $(readlink -f src-tree.tar.gz/config/
      # run Makefile.emr to compile C code (EMR has a different architecture,
      # so we can't just upload the .so files)
      - cd src-tree.tar.gz; make -f Makefile.emr
    # generally, we run jobs on a Linux server separate from our desktop
    # machine. So the SSH tunnel needs to be open so a browser on our
    # desktop machine can connect to it.
    ssh_tunnel_is_open: true
    ssh_tunnel_to_job_tracker: true
    # upload these particular files on the fly because they're different
    # between development and production
    # NOTE(review): both entries appear to have lost their file names in this
    # copy -- restore them from the real config before use.
    upload_files: &upload_files
      - $BT/config/
      - $BT/config/
  hadoop:
    # Note the use of YAML references (aliases) to re-use parts of the EMR
    # config. We don't currently run our own hadoop cluster, so this section
    # is pretty boring.
    base_tmp_dir: /scratch/$USER
    python_archives: *python_archives
    setup_cmds: *setup_cmds
    upload_files: *upload_files
  local:
    # We don't have gcc installed in production, so if we have to run an
    # MRJob in local mode in production, don't run the Makefile
    # and whatnot; just fall back on the original copy of the code.
    base_tmp_dir: /scratch/$USER
Jump to Line
Something went wrong with that request. Please try again.