Skip to content

Commit

Permalink
Extend EMR deployment scripts with jupyter-scala and S3-backed notebook storage
Browse files Browse the repository at this point in the history

Signed-off-by: jpolchlo <jpolchlopek@azavea.com>
  • Loading branch information
jpolchlo authored and echeipesh committed Jan 11, 2018
1 parent d109f28 commit a78517b
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 50 deletions.
97 changes: 54 additions & 43 deletions scripts/emr/terraform/bootstrap.sh
@@ -1,7 +1,9 @@
#!/bin/bash

OAUTH_CLIENT_ID=$1
OAUTH_CLIENT_SECRET=$2
S3_ACCESS_KEY=$1
S3_SECRET_KEY=$2
S3_NOTEBOOK_BUCKET=$3
S3_NOTEBOOK_PREFIX=$4

# Parses a configuration file put in place by EMR to determine the role of this node
is_master() {
Expand All @@ -13,8 +15,8 @@ is_master() {
}

if is_master; then
echo "Updating system software ..."
sudo yum -y -q update
echo "Installing system software ..."
#sudo yum -y -q update
curl -sL https://rpm.nodesource.com/setup_6.x | sudo -E bash -
sudo yum install -y -q nodejs

Expand All @@ -24,65 +26,74 @@ if is_master; then
sudo pip-3.4 -q install --upgrade notebook

sudo pip-3.4 -q install sudospawner
sudo pip-3.4 -q install "https://github.com/jupyterhub/oauthenticator/archive/f5e39b1ece62b8d075832054ed3213cc04f85030.zip"

curl -L -o /tmp/jupyter-scala https://raw.githubusercontent.com/jupyter-scala/jupyter-scala/98bac7034f07e3e51d101846953aecbdb7a4bb5d/jupyter-scala
chmod +x /tmp/jupyter-scala
/tmp/jupyter-scala
#sudo pip-3.4 -q install "https://github.com/jupyterhub/oauthenticator/archive/f5e39b1ece62b8d075832054ed3213cc04f85030.zip"
sudo pip-3.4 -q install s3contents

# Set up user account to manage JupyterHub
echo "Setting up user accounts ..."
sudo groupadd shadow
sudo chgrp shadow /etc/shadow
sudo chmod 640 /etc/shadow
sudo useradd -G shadow -r hublauncher
sudo groupadd jupyterhub

echo 'hublauncher ALL=(%jupyterhub) NOPASSWD: /usr/local/bin/sudospawner' | sudo tee -a /etc/sudoers
echo 'hublauncher ALL=(ALL) NOPASSWD: /usr/sbin/useradd' | sudo tee -a /etc/sudoers
echo 'hublauncher ALL=(hdfs) NOPASSWD: /usr/bin/hdfs' | sudo tee -a /etc/sudoers

# Environment setup
cat <<EOF > /tmp/oauth_profile.sh
export AWS_DNS_NAME=$(aws ec2 describe-network-interfaces --filters Name=private-ip-address,Values=$(hostname -i) | jq -r '.[] | .[] | .Association.PublicDnsName')
export OAUTH_CALLBACK_URL=http://\$AWS_DNS_NAME:8000/hub/oauth_callback
export OAUTH_CLIENT_ID=$OAUTH_CLIENT_ID
export OAUTH_CLIENT_SECRET=$OAUTH_CLIENT_SECRET
# Do setup for user accounts that can spawn jupyter notebook instances
sudo adduser -G hadoop,jupyterhub user
echo 'user:password' | sudo chpasswd

alias launch_hub='sudo -u hublauncher -E env "PATH=/usr/local/bin:$PATH" jupyterhub --JupyterHub.spawner_class=sudospawner.SudoSpawner --SudoSpawner.sudospawner_path=/usr/local/bin/sudospawner --Spawner.notebook_dir=/home/{username}'
EOF
sudo mv /tmp/oauth_profile.sh /etc/profile.d
. /etc/profile.d/oauth_profile.sh
echo "Installing jupyter-scala kernel ..."
curl -L -q -o /tmp/jupyter-scala https://raw.githubusercontent.com/jupyter-scala/jupyter-scala/98bac7034f07e3e51d101846953aecbdb7a4bb5d/jupyter-scala
chmod +x /tmp/jupyter-scala
sudo -u user /tmp/jupyter-scala > /dev/null

# Setup required scripts/configurations for launching JupyterHub
cat <<EOF > /tmp/new_user
#!/bin/bash
cat <<EOF > /tmp/jupyterhub_config.py
c = get_config()
user=\$1
# Let JupyterHub use sudospawner for spawning notebook instances
c.JupyterHub.spawner_class='sudospawner.SudoSpawner'
c.SudoSpawner.sudospawner_path='/usr/local/bin/sudospawner'
EOF

cat <<EOF > /tmp/per_user_jupyter_notebook_config.py
from s3contents import S3ContentsManager
sudo useradd -m -G jupyterhub,hadoop \$user
sudo -u hdfs hdfs dfs -mkdir /user/\$user
c = get_config()
sudo -u \$user /tmp/jupyter-scala
# Tell Jupyter to use S3ContentsManager for all storage.
c.NotebookApp.contents_manager_class = S3ContentsManager
c.S3ContentsManager.access_key_id = "$S3_ACCESS_KEY"
c.S3ContentsManager.secret_access_key = "$S3_SECRET_KEY"
c.S3ContentsManager.bucket_name = "$S3_NOTEBOOK_BUCKET"
c.S3ContentsManager.prefix = "$S3_NOTEBOOK_PREFIX"
EOF
chmod +x /tmp/new_user
sudo chown root:root /tmp/new_user
sudo mv /tmp/new_user /usr/local/bin

cat <<EOF > /tmp/jupyterhub_config.py
from oauthenticator.github import LocalGitHubOAuthenticator

c = get_config()
c.JupyterHub.authenticator_class = LocalGitHubOAuthenticator
c.LocalGitHubOAuthenticator.create_system_users = True
sudo -u user mkdir /home/user/.jupyter
sudo -u user cp /tmp/per_user_jupyter_notebook_config.py /home/user/.jupyter/jupyter_notebook_config.py

c.JupyterHub.spawner_class='sudospawner.SudoSpawner'
c.SudoSpawner.sudospawner_path='/usr/local/bin/sudospawner'
c.Spawner.notebook_dir='/home/{username}'
c.LocalAuthenticator.add_user_cmd = ['new_user']
# Fix a problem in the Jupyter notebook FileContentsManager
cat <<EOF > /tmp/manager.patch
33c33
<
---
> import notebook.transutils
EOF
patch /usr/local/lib/python3.4/site-packages/notebook/services/contents/manager.py -i /tmp/manager.patch -o /tmp/manager.py
sudo mv /tmp/manager.py /usr/local/lib/python3.4/site-packages/notebook/services/contents/manager.py
sudo chown root:root /usr/local/lib/python3.4/site-packages/notebook/services/contents/manager.py
sudo chmod 644 /usr/local/lib/python3.4/site-packages/notebook/services/contents/manager.py

# Environment setup
cat <<EOF > /tmp/jupyter_profile.sh
export AWS_DNS_NAME=$(aws ec2 describe-network-interfaces --filters Name=private-ip-address,Values=$(hostname -i) | jq -r '.[] | .[] | .Association.PublicDnsName')
alias launch_hub='sudo -u hublauncher -E env "PATH=/usr/local/bin:$PATH" jupyterhub -f /tmp/jupyterhub_config.py'
EOF
sudo mv /tmp/jupyter_profile.sh /etc/profile.d
. /etc/profile.d/jupyter_profile.sh

# Execute
cd /tmp
launch_hub &
sudo -u hublauncher -E env "PATH=/usr/local/bin:$PATH" jupyterhub -f /tmp/jupyterhub_config.py &
echo "Running at host $AWS_DNS_NAME"
fi
2 changes: 1 addition & 1 deletion scripts/emr/terraform/emr-spark.tf
Expand Up @@ -76,7 +76,7 @@ resource "aws_emr_cluster" "emr-spark-cluster" {
provisioner "remote-exec" {
inline=[
"chmod +x /tmp/bootstrap.sh",
"/tmp/bootstrap.sh ${var.oauth_client_id} ${var.oauth_client_secret}"
"/tmp/bootstrap.sh ${var.access_key} ${var.secret_key} ${var.s3_notebook_bucket} ${var.s3_notebook_prefix}"
]
connection {
type = "ssh"
Expand Down
12 changes: 6 additions & 6 deletions scripts/emr/terraform/variables.tf.json
Expand Up @@ -9,6 +9,12 @@
"pem_path": {
"description": "Path to your EC2 secret key"
},
"s3_notebook_bucket": {
"description": "The bucket name on S3 where notebooks are stored"
},
"s3_notebook_prefix": {
"description": "The prefix path inside the S3 notebook bucket"
},
"region": {
"default": "us-east-1",
"description": "Can be overridden if necessary"
Expand All @@ -33,11 +39,5 @@
"default": "anonymous",
"description": "User name applied to cluster"
},
"oauth_client_id": {
"description": "Client ID token for OAuth server"
},
"oauth_client_secret": {
"description": "Client secret token for OAuth server"
}
}
}

0 comments on commit a78517b

Please sign in to comment.