# Setup

This notebook creates a VM in the user's project that runs the Airflow scheduler and webserver. A default GCP zone for the VM has been chosen (below); feel free to change it as desired.

In [None]:
# GCP zone in which the Airflow VM is created; edit to pick a different zone.
zone = 'us-central1-b'

In [None]:
from google.datalab import Context
import google.datalab.storage as storage

# Project of the default Datalab context; the VM and the bucket are created in it.
project = Context.default().project_id
vm_name = 'datalab-airflow'

# The name of this GCS bucket follows a convention between this notebook and
# the 'BigQuery Pipeline' tutorial notebook, so don't change this. The VM
# startup script in this notebook also builds the same
# '<project>-datalab-airflow' name to locate the DAGs to sync.
gcs_dag_bucket_name = project + '-' + vm_name
gcs_dag_bucket = storage.Bucket(gcs_dag_bucket_name)

# Only create the bucket when it is missing, so that re-running this cell
# (e.g. after a kernel restart) does not fail on an already-existing bucket.
if not gcs_dag_bucket.exists():
    gcs_dag_bucket.create()

In [None]:
# Bash script handed to the VM as its startup script (see
# --metadata-from-file below). It installs pinned versions of datalab,
# Airflow and pandas-gbq, initialises Airflow, starts the scheduler and the
# webserver (port 8080) in the background, and installs a cron entry that
# rsyncs the DAG folder from the GCS bucket every minute.
vm_startup_script_contents = """#!/bin/bash
apt-get update
apt-get --assume-yes install python-pip

pip install datalab==1.1.2
pip install apache-airflow==1.9.0
pip install pandas-gbq==0.3.0

export AIRFLOW_HOME=/airflow
export AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=False
export AIRFLOW__CORE__LOAD_EXAMPLES=False
airflow initdb
airflow scheduler &
airflow webserver -p 8080 &

# We append a gsutil rsync command to the cron file and have this run every minute to sync dags.
PROJECT_ID=$(gcloud info --format="get(config.project)")
GCS_DAG_BUCKET=$PROJECT_ID-datalab-airflow
AIRFLOW_CRON=temp_crontab.txt
crontab -l > $AIRFLOW_CRON
DAG_FOLDER="dags"
LOCAL_DAG_PATH=$AIRFLOW_HOME/$DAG_FOLDER
mkdir -p $LOCAL_DAG_PATH
echo "* * * * * gsutil rsync gs://$GCS_DAG_BUCKET/$DAG_FOLDER $LOCAL_DAG_PATH" >> $AIRFLOW_CRON
crontab $AIRFLOW_CRON
rm $AIRFLOW_CRON
"""

# Write the script to a local file; the context manager guarantees the file is
# closed (and therefore fully flushed) even if the write raises.
vm_startup_script_file_name = 'vm_startup_script.sh'
with open(vm_startup_script_file_name, 'w') as script_file:
    script_file.write(vm_startup_script_contents)

import subprocess

# Create the VM. check_output raises CalledProcessError if gcloud exits with a
# non-zero status; otherwise gcloud's stdout is printed. The single-argument
# print(...) form is valid under both the Python 2 print statement and the
# Python 3 print function.
print(subprocess.check_output([
    'gcloud', 'compute', '--project', project, 'instances', 'create', vm_name,
    '--zone', zone,
    '--machine-type', 'n1-standard-1',
    '--network', 'default',
    '--maintenance-policy', 'MIGRATE',
    # cloud-platform scope lets gcloud/gsutil on the VM use the VM's service account.
    '--scopes', 'https://www.googleapis.com/auth/cloud-platform',
    '--image', 'debian-9-stretch-v20171025',
    '--min-cpu-platform', 'Automatic',
    '--image-project', 'debian-cloud',
    '--boot-disk-size', '10',
    '--boot-disk-type', 'pd-standard',
    '--boot-disk-device-name', vm_name,
    '--metadata-from-file', 'startup-script=' + vm_startup_script_file_name]))

# Cleanup


In [None]:
# The following cleans up the VM and the associated GCS bucket. Uncomment and run.
#!gsutil rm -r gs://$gcs_dag_bucket_name
#!gcloud compute instances delete $vm_name --zone $zone --quiet

In [None]:
# This just verifies that cleanup actually worked. Should show an error like 
# "BucketNotFoundException: 404 ...". 
!gsutil ls gs://$gcs_dag_bucket_name