From 96d2cae27caeda2a5f2375246a457fdb2e34c5ef Mon Sep 17 00:00:00 2001 From: Abdou Seck Date: Wed, 28 Sep 2022 23:33:17 -0400 Subject: [PATCH] feat: Add a job to detect new raw schema fields to add to safe schema manual models --- .../DetectNewDBTManualModelsFields.groovy | 80 +++++++++++++++ dataeng/jobs/createJobsNew.groovy | 2 + .../detect-new-dbt-manual-models-fields.sh | 99 +++++++++++++++++++ 3 files changed, 181 insertions(+) create mode 100644 dataeng/jobs/analytics/DetectNewDBTManualModelsFields.groovy create mode 100644 dataeng/resources/detect-new-dbt-manual-models-fields.sh diff --git a/dataeng/jobs/analytics/DetectNewDBTManualModelsFields.groovy b/dataeng/jobs/analytics/DetectNewDBTManualModelsFields.groovy new file mode 100644 index 000000000..776381a30 --- /dev/null +++ b/dataeng/jobs/analytics/DetectNewDBTManualModelsFields.groovy @@ -0,0 +1,80 @@ +package analytics + +import static org.edx.jenkins.dsl.AnalyticsConstants.common_log_rotator +import static org.edx.jenkins.dsl.AnalyticsConstants.common_publishers +import static org.edx.jenkins.dsl.AnalyticsConstants.common_triggers +import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm +import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm_parameters + + +class DetectNewDBTManualModelsFields { + public static def job = { dslFactory, allVars -> + dslFactory.job("detect-new-dbt-manual-models-fields") { + // If the DISABLED is set to true by the job's extra vars, then disable the job. + disabled(allVars.get('DISABLED', false)) + description("This job detects new columns in tables in raw schemas that have yet to be manually added to safe schema models.") + // Set a definite log rotation, if defined. 
+ logRotator common_log_rotator(allVars) + // Set the analytics-secure parameters for repo and branch from the common helpers + parameters secure_scm_parameters(allVars) + // Add parameters to use analytics-tools and warehouse-transforms + parameters { + stringParam('ANALYTICS_TOOLS_URL', allVars.get('ANALYTICS_TOOLS_URL'), 'URL for the analytics tools repo.') + stringParam('ANALYTICS_TOOLS_BRANCH', allVars.get('ANALYTICS_TOOLS_BRANCH'), 'Branch of analytics tools repo to use.') + stringParam('WAREHOUSE_TRANSFORMS_URL', allVars.get('WAREHOUSE_TRANSFORMS_URL'), 'URL for the warehouse-transforms repository.') + stringParam('WAREHOUSE_TRANSFORMS_BRANCH', allVars.get('WAREHOUSE_TRANSFORMS_BRANCH'), 'Branch of warehouse-transforms repository to use.') + stringParam('DBT_TARGET', allVars.get('DBT_TARGET'), 'DBT target from profiles.yml in analytics-secure.') + stringParam('DBT_PROFILE', allVars.get('DBT_PROFILE'), 'DBT profile from profiles.yml in analytics-secure.') + stringParam('DBT_PROJECT_PATH', allVars.get('DBT_PROJECT_PATH'), 'Path in warehouse-transforms to use as the dbt project, relative to "projects" (usually automated/applications or reporting).') + stringParam('NOTIFY', allVars.get('NOTIFY','$PAGER_NOTIFY'), 'Space separated list of emails to send notifications to.') + } + // Set the necessary VAULT kv paths of credentials as environment variables + environmentVariables { + env('JIRA_WEBHOOK_VAULT_KV_PATH', allVars.get('JIRA_WEBHOOK_VAULT_KV_PATH')) + env('JIRA_WEBHOOK_VAULT_KV_VERSION', allVars.get('JIRA_WEBHOOK_VAULT_KV_VERSION')) + env('AUTOMATION_TASK_USER_VAULT_KV_PATH', allVars.get('AUTOMATION_TASK_USER_VAULT_KV_PATH')) + env('AUTOMATION_TASK_USER_VAULT_KV_VERSION', allVars.get('AUTOMATION_TASK_USER_VAULT_KV_VERSION')) + } + // SCM settings for analytics-secure and analytics-tools + multiscm secure_scm(allVars) << { + git { + remote { + url('$WAREHOUSE_TRANSFORMS_URL') + branch('$WAREHOUSE_TRANSFORMS_BRANCH') + } + extensions { + 
relativeTargetDirectory('warehouse-transforms') + pruneBranches() + cleanAfterCheckout() + } + } + git { + remote { + url('$ANALYTICS_TOOLS_URL') + branch('$ANALYTICS_TOOLS_BRANCH') + credentials('1') + } + extensions { + relativeTargetDirectory('analytics-tools') + pruneBranches() + cleanAfterCheckout() + } + } + } + wrappers { + colorizeOutput('xterm') + timestamps() + credentialsBinding { + usernamePassword('ANALYTICS_VAULT_ROLE_ID', 'ANALYTICS_VAULT_SECRET_ID', 'analytics-vault') + } + } + // Set the trigger using cron + triggers common_triggers(allVars) + // Notifications on build failures + publishers common_publishers(allVars) + steps { + shell(dslFactory.readFileFromWorkspace('dataeng/resources/detect-new-dbt-manual-models-fields.sh')) + } + } + } +} diff --git a/dataeng/jobs/createJobsNew.groovy b/dataeng/jobs/createJobsNew.groovy index a52168894..f4282f652 100644 --- a/dataeng/jobs/createJobsNew.groovy +++ b/dataeng/jobs/createJobsNew.groovy @@ -2,6 +2,7 @@ import static analytics.DBTDocs.job as DBTDocsJob import static analytics.DBTRun.job as DBTRunJob import static analytics.DBTSourceFreshness.job as DBTSourceFreshnessJob import static analytics.DeployCluster.job as DeployClusterJob +import static analytics.DetectNewDBTManualModelsFields.job as DetectNewDBTManualModelsFields import static analytics.EmrCostReporter.job as EmrCostReporterJob import static analytics.ModelTransfers.job as ModelTransfersJob import static analytics.RetirementJobEdxTriggers.job as RetirementJobEdxTriggersJob @@ -45,6 +46,7 @@ def taskMap = [ DBT_RUN_JOB: DBTRunJob, DBT_SOURCE_FRESHNESS_JOB: DBTSourceFreshnessJob, DEPLOY_CLUSTER_JOB: DeployClusterJob, + DETECT_NEW_DBT_MANUAL_MODELS_FIELDS_JOB: DetectNewDBTManualModelsFields, EMR_COST_REPORTER_JOB: EmrCostReporterJob, MODEL_TRANSFERS_JOB: ModelTransfersJob, RETIREMENT_JOB_EDX_TRIGGERS_JOB: RetirementJobEdxTriggersJob, diff --git a/dataeng/resources/detect-new-dbt-manual-models-fields.sh 
b/dataeng/resources/detect-new-dbt-manual-models-fields.sh new file mode 100644 index 000000000..63c409544 --- /dev/null +++ b/dataeng/resources/detect-new-dbt-manual-models-fields.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +set -ex + +# Global variables +DBT_PROFILE_ARGS="--profiles-dir ${WORKSPACE}/analytics-secure/warehouse-transforms/ --profile ${DBT_PROFILE} --target ${DBT_TARGET}" +DBT_TARGET_PATH=${WORKSPACE}/warehouse-transforms/projects/${DBT_PROJECT_PATH}/target + +# Set up a virtual environment for warehouse-transforms +PYTHON38_VENV="py38_venv_warehouse_transforms" +virtualenv --python=python3.8 --clear "${PYTHON38_VENV}" +source "${PYTHON38_VENV}/bin/activate" +pip install -U pip +pip install -r ${WORKSPACE}/warehouse-transforms/requirements.txt + +# Go into the automated/applications project and compile the manual models +cd ${WORKSPACE}/warehouse-transforms/projects/${DBT_PROJECT_PATH} +dbt clean ${DBT_PROFILE_ARGS} +dbt deps ${DBT_PROFILE_ARGS} +dbt compile ${DBT_PROFILE_ARGS} --select tag:manual + +# Deactivate the virtual environment, so we can create and activate another one +deactivate + +# Setup a virtual environment for analytics-tools +PYTHON38_VENV="py38_venv_analytics_tools" +virtualenv --python=python3.8 --clear "${PYTHON38_VENV}" +source "${PYTHON38_VENV}/bin/activate" + +# Go into analytics-tools and install the dependencies +cd ${WORKSPACE}/analytics-tools/snowflake +make requirements + +# Create a function to clean up the credential files, and trap EXIT with it +function clean_up_files() { + rm -rf .snowflake_private_key .snowflake_private_key_passphrase +} +trap clean_up_files EXIT + +# Fetch credentials from vault +# Do not print commands in this function since they may contain secrets. +set +x + +# Retrieve a vault token corresponding to the jenkins AppRole. The token is then stored in the VAULT_TOKEN variable +# which is implicitly used by subsequent vault commands within this script. 
+# Instructions followed: https://learn.hashicorp.com/tutorials/vault/approle#step-4-login-with-roleid-secretid
+export VAULT_TOKEN=$(vault write -field=token auth/approle/login \
+    role_id=${ANALYTICS_VAULT_ROLE_ID} \
+    secret_id=${ANALYTICS_VAULT_SECRET_ID} \
+)
+
+set -x
+
+# JIRA webhook URL and secret string from vault
+JIRA_WEBHOOK_URL=$(
+    vault kv get \
+        -version=${JIRA_WEBHOOK_VAULT_KV_VERSION} \
+        -field=JIRA_WEBHOOK_URL \
+        ${JIRA_WEBHOOK_VAULT_KV_PATH} \
+)
+JIRA_WEBHOOK_SECRET=$(
+    vault kv get \
+        -version=${JIRA_WEBHOOK_VAULT_KV_VERSION} \
+        -field=JIRA_WEBHOOK_SECRET \
+        ${JIRA_WEBHOOK_VAULT_KV_PATH} \
+)
+
+# Snowflake credentials from vault
+SNOWFLAKE_ACCOUNT=$(
+    vault kv get \
+        -version=${AUTOMATION_TASK_USER_VAULT_KV_VERSION} \
+        -field=account \
+        ${AUTOMATION_TASK_USER_VAULT_KV_PATH} \
+)
+
+SNOWFLAKE_USER=$(
+    vault kv get \
+        -version=${AUTOMATION_TASK_USER_VAULT_KV_VERSION} \
+        -field=user \
+        ${AUTOMATION_TASK_USER_VAULT_KV_PATH} \
+)
+# The detect_new_raw_columns.py script, much like all other scripts that connect to Snowflake,
+# expects the private key and the private key passphrase to be in files.
+# As a result, SNOWFLAKE_PRIVATE_KEY and SNOWFLAKE_PRIVATE_KEY_PASSPHRASE are stored in files.
+vault kv get \
+    -version=${AUTOMATION_TASK_USER_VAULT_KV_VERSION} \
+    -field=private_key \
+    ${AUTOMATION_TASK_USER_VAULT_KV_PATH} > .snowflake_private_key
+
+vault kv get \
+    -version=${AUTOMATION_TASK_USER_VAULT_KV_VERSION} \
+    -field=private_key_passphrase \
+    ${AUTOMATION_TASK_USER_VAULT_KV_PATH} > .snowflake_private_key_passphrase
+
+# Invoke the script to detect new fields that need to be added manually
+python detect_new_raw_columns.py ${DBT_TARGET_PATH} \
+    --user ${SNOWFLAKE_USER} --account ${SNOWFLAKE_ACCOUNT} \
+    --key-path .snowflake_private_key --passphrase-path .snowflake_private_key_passphrase \
+    --jira-webhook-url ${JIRA_WEBHOOK_URL} \
+    --jira-webhook-secret ${JIRA_WEBHOOK_SECRET}