diff --git a/envs/s3-spark-delta-sharing-minio/.gitignore b/envs/s3-spark-delta-sharing-minio/.gitignore
new file mode 100644
index 0000000..34d4d07
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/.gitignore
@@ -0,0 +1 @@
+.s3-mount
\ No newline at end of file
diff --git a/envs/s3-spark-delta-sharing-minio/.gitkeep b/envs/s3-spark-delta-sharing-minio/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/envs/s3-spark-delta-sharing-minio/.whirl.env b/envs/s3-spark-delta-sharing-minio/.whirl.env
new file mode 100644
index 0000000..4e396a8
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/.whirl.env
@@ -0,0 +1,16 @@
+AWS_ACCESS_KEY_ID=qwerty
+AWS_SECRET_ACCESS_KEY=qwerty123
+AWS_SERVER=s3server
+AWS_PORT=9000
+DEMO_BUCKET=demo-s3-output
+
+# Spark variables
+SPARK_VERSION=3.1.1
+DELTA_VERSION=1.0.0
+DELTA_SHARING_VERSION=0.2.0
+
+# Airflow env vars
+AIRFLOW__CORE__EXPOSE_CONFIG=True
+AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
+AIRFLOW__CORE__LOAD_EXAMPLES=False
+AIRFLOW__WEBSERVER__EXPOSE_CONFIG=True
diff --git a/envs/s3-spark-delta-sharing-minio/compose.setup.d/01_check_available_memory.sh b/envs/s3-spark-delta-sharing-minio/compose.setup.d/01_check_available_memory.sh
new file mode 100644
index 0000000..f70f0a9
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/compose.setup.d/01_check_available_memory.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+function check_docker_mem() {
+  echo "==============================================="
+  echo "== Check if there is enough available memory =="
+  echo "==============================================="
+  MEM_12_POINT_5_GB=$(((1024 * 1024 * 1024 * 25)/2))
+  AVAILABLE_MEM=$(docker info -f "{{json .MemTotal}}")
+
+  if [ "${AVAILABLE_MEM}" -lt "${MEM_12_POINT_5_GB}" ]; then
+    echo "NOT ENOUGH MEMORY AVAILABLE ($(bc <<< "scale=1; $AVAILABLE_MEM / 1024 / 1024 / 1024")). Need at least 12.5GB"
+    exit 12;
+  fi
+}
+
+check_docker_mem
\ No newline at end of file
diff --git a/envs/s3-spark-delta-sharing-minio/compose.setup.d/02_clean_s3_mount_dir.sh b/envs/s3-spark-delta-sharing-minio/compose.setup.d/02_clean_s3_mount_dir.sh
new file mode 100644
index 0000000..4fd82a4
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/compose.setup.d/02_clean_s3_mount_dir.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+function empty_s3_dir() {
+  echo "================================"
+  echo "== Cleanup local S3 mount dir =="
+  echo "================================"
+  local SCRIPT_DIR=$( dirname ${BASH_SOURCE[0]} )
+  S3_MOUNT_DIR="${SCRIPT_DIR}/../.s3-mount"
+
+  if [ "$(ls -A ${S3_MOUNT_DIR})" ]; then
+    echo "${S3_MOUNT_DIR} is not empty. Clearing NOW!!"
+    find ${S3_MOUNT_DIR} -mindepth 1 -delete
+  else
+    echo "${S3_MOUNT_DIR} is empty. Continue"
+  fi
+}
+
+empty_s3_dir
\ No newline at end of file
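Side note, not part of the patch: both compose.setup.d scripts run on the host before docker-compose brings the containers up. The memory check can be reproduced by hand; docker info reports the bytes available to the Docker daemon, which the script compares against 12.5GB:

    docker info -f '{{json .MemTotal}}'
    # e.g. 16784601088 bytes is roughly 15.6GB, which clears the 12.5GB threshold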
diff --git a/envs/s3-spark-delta-sharing-minio/config/core-site.xml b/envs/s3-spark-delta-sharing-minio/config/core-site.xml
new file mode 100644
index 0000000..03c76f1
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/config/core-site.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0"?>
+<configuration>
+
+  <property>
+    <name>fs.s3.awsAccessKeyId</name>
+    <value>qwerty</value>
+  </property>
+
+  <property>
+    <name>fs.s3.awsSecretAccessKey</name>
+    <value>qwerty123</value>
+  </property>
+
+  <property>
+    <name>fs.s3n.awsAccessKeyId</name>
+    <value>qwerty</value>
+  </property>
+
+  <property>
+    <name>fs.s3n.awsSecretAccessKey</name>
+    <value>qwerty123</value>
+  </property>
+
+  <property>
+    <name>fs.s3a.access.key</name>
+    <value>qwerty</value>
+  </property>
+
+  <property>
+    <name>fs.s3a.secret.key</name>
+    <value>qwerty123</value>
+  </property>
+
+  <property>
+    <name>fs.s3a.connection.ssl.enabled</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>fs.s3a.endpoint</name>
+    <value>s3server:9000</value>
+  </property>
+
+  <property>
+    <name>fs.s3a.path.style.access</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.s3a.impl</name>
+    <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
+  </property>
+
+  <property>
+    <name>fs.s3.impl</name>
+    <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
+  </property>
+
+</configuration>
diff --git a/envs/s3-spark-delta-sharing-minio/config/delta-sharing.yml b/envs/s3-spark-delta-sharing-minio/config/delta-sharing.yml
new file mode 100644
index 0000000..8ef0588
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/config/delta-sharing.yml
@@ -0,0 +1,33 @@
+# The format version of this config file
+version: 1
+# Config shares/schemas/tables to share
+shares:
+- name: "airflow"
+  schemas:
+  - name: "spark"
+    tables:
+    - name: "table1"
+      location: "s3a://demo-s3-output/output/data/demo/spark/20210614/"
+    - name: "cars"
+      location: "s3a://demo-s3-output/output/data/demo/spark/cars/"
+    - name: "cars-all"
+      location: "s3a://demo-s3-output/output/data/demo/spark/cars-all/"
+    - name: "cars-python"
+      location: "s3a://demo-s3-output/output/data/demo/spark/cars-python/"
+# Set the host name that the server will use
+host: "0.0.0.0"
+# Set the port that the server will listen on
+port: 8080
+# Set the url prefix for the REST APIs
+endpoint: "/delta-sharing"
+# Set the timeout of S3 presigned url in seconds
+preSignedUrlTimeoutSeconds: 900
+# How many tables to cache in the server
+deltaTableCacheSize: 10
+# Whether we can accept working with a stale version of the table. This is useful when sharing
+# static tables that will never be changed.
+stalenessAcceptable: false
+# Whether to evaluate user provided `predicateHints`
+evaluatePredicateHints: false
+authorization:
+  bearerToken: authTokenDeltaSharing432
\ No newline at end of file
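Side note, not part of the patch: once the delta container is running, the share and bearer token defined above can be smoke-tested from the host through the published port (38080 in docker-compose.yml below); listing shares is part of the standard Delta Sharing REST protocol:

    curl -s -H "Authorization: Bearer authTokenDeltaSharing432" \
      http://localhost:38080/delta-sharing/shares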
diff --git a/envs/s3-spark-delta-sharing-minio/config/log4j.properties b/envs/s3-spark-delta-sharing-minio/config/log4j.properties
new file mode 100644
index 0000000..8910b5a
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/config/log4j.properties
@@ -0,0 +1,337 @@
+# This sets the global logging level and specifies the appenders
+log4j.rootLogger=INFO, console
+
+# settings for the console appender
+# log4j.appender.myConsoleAppender=org.apache.log4j.ConsoleAppender
+# log4j.appender.myConsoleAppender.layout=org.apache.log4j.PatternLayout
+# log4j.appender.myConsoleAppender.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
+
+log4j.logger.org.apache.hadoop=INFO
+# log4j.logger.org.apache.hadoop.fs.s3a=DEBUG
+# log4j.logger.org.apache.hadoop.fs.s3=DEBUG
+# log4j.logger.org.apache.hadoop.fs=DEBUG
+# log4j.logger.org.apache.hadoop.conf=DEBUG
+log4j.logger.io.delta=INFO
+
+log4j.logger.com.amazonaws=INFO
+log4j.logger.com.amazonaws.http.conn.ssl=INFO
+# log4j.logger.com.amazonaws.internal=INFO
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Define some default values that can be overridden by system properties
+hadoop.root.logger=INFO,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+
+# Logging Threshold
+log4j.threshold=ALL
+
+# Null Appender
+log4j.appender.NullAppender=org.apache.log4j.varia.NullAppender
+
+#
+# Rolling File Appender - cap space usage at 5gb.
+#
+hadoop.log.maxfilesize=256MB
+hadoop.log.maxbackupindex=20
+log4j.appender.RFA=org.apache.log4j.RollingFileAppender
+log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+log4j.appender.RFA.MaxFileSize=${hadoop.log.maxfilesize}
+log4j.appender.RFA.MaxBackupIndex=${hadoop.log.maxbackupindex}
+
+log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+
+# Rollover at midnight
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+
+#
+# TaskLog Appender
+#
+log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
+
+log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+
+#
+# HDFS block state change log from block manager
+#
+# Uncomment the following to log normal block state change
+# messages from BlockManager in NameNode.
+#log4j.logger.BlockStateChange=DEBUG
+
+#
+#Security appender
+#
+hadoop.security.logger=INFO,NullAppender
+hadoop.security.log.maxfilesize=256MB
+hadoop.security.log.maxbackupindex=20
+log4j.category.SecurityLogger=${hadoop.security.logger}
+hadoop.security.log.file=SecurityAuth-${user.name}.audit
+log4j.appender.RFAS=org.apache.log4j.RollingFileAppender
+log4j.appender.RFAS.File=${hadoop.log.dir}/${hadoop.security.log.file}
+log4j.appender.RFAS.layout=org.apache.log4j.PatternLayout
+log4j.appender.RFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+log4j.appender.RFAS.MaxFileSize=${hadoop.security.log.maxfilesize}
+log4j.appender.RFAS.MaxBackupIndex=${hadoop.security.log.maxbackupindex}
+
+#
+# Daily Rolling Security appender
+#
+log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFAS.File=${hadoop.log.dir}/${hadoop.security.log.file}
+log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout
+log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd
+
+#
+# hadoop configuration logging
+#
+
+# Uncomment the following line to turn off configuration deprecation warnings.
+# log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=WARN
+
+#
+# hdfs audit logging
+#
+hdfs.audit.logger=INFO,NullAppender
+hdfs.audit.log.maxfilesize=256MB
+hdfs.audit.log.maxbackupindex=20
+log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger}
+log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false
+log4j.appender.RFAAUDIT=org.apache.log4j.RollingFileAppender
+log4j.appender.RFAAUDIT.File=${hadoop.log.dir}/hdfs-audit.log
+log4j.appender.RFAAUDIT.layout=org.apache.log4j.PatternLayout
+log4j.appender.RFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.RFAAUDIT.MaxFileSize=${hdfs.audit.log.maxfilesize}
+log4j.appender.RFAAUDIT.MaxBackupIndex=${hdfs.audit.log.maxbackupindex}
+
+#
+# NameNode metrics logging.
+# The default is to retain two namenode-metrics.log files up to 64MB each.
+#
+namenode.metrics.logger=INFO,NullAppender
+log4j.logger.NameNodeMetricsLog=${namenode.metrics.logger}
+log4j.additivity.NameNodeMetricsLog=false
+log4j.appender.NNMETRICSRFA=org.apache.log4j.RollingFileAppender
+log4j.appender.NNMETRICSRFA.File=${hadoop.log.dir}/namenode-metrics.log
+log4j.appender.NNMETRICSRFA.layout=org.apache.log4j.PatternLayout
+log4j.appender.NNMETRICSRFA.layout.ConversionPattern=%d{ISO8601} %m%n
+log4j.appender.NNMETRICSRFA.MaxBackupIndex=1
+log4j.appender.NNMETRICSRFA.MaxFileSize=64MB
+
+#
+# DataNode metrics logging.
+# The default is to retain two datanode-metrics.log files up to 64MB each.
+#
+datanode.metrics.logger=INFO,NullAppender
+log4j.logger.DataNodeMetricsLog=${datanode.metrics.logger}
+log4j.additivity.DataNodeMetricsLog=false
+log4j.appender.DNMETRICSRFA=org.apache.log4j.RollingFileAppender
+log4j.appender.DNMETRICSRFA.File=${hadoop.log.dir}/datanode-metrics.log
+log4j.appender.DNMETRICSRFA.layout=org.apache.log4j.PatternLayout
+log4j.appender.DNMETRICSRFA.layout.ConversionPattern=%d{ISO8601} %m%n
+log4j.appender.DNMETRICSRFA.MaxBackupIndex=1
+log4j.appender.DNMETRICSRFA.MaxFileSize=64MB
+
+# Custom Logging levels
+
+#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG
+#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG
+#log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=DEBUG
+
+
+# AWS SDK & S3A FileSystem
+#log4j.logger.com.amazonaws=ERROR
+log4j.logger.com.amazonaws.http.AmazonHttpClient=ERROR
+#log4j.logger.org.apache.hadoop.fs.s3a.S3AFileSystem=WARN
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.log.metrics.EventCounter
+
+
+#
+# shuffle connection log from shuffleHandler
+# Uncomment the following line to enable logging of shuffle connections
+# log4j.logger.org.apache.hadoop.mapred.ShuffleHandler.audit=DEBUG
+
+#
+# Yarn ResourceManager Application Summary Log
+#
+# Set the ResourceManager summary log filename
+yarn.server.resourcemanager.appsummary.log.file=rm-appsummary.log
+# Set the ResourceManager summary log level and appender
+yarn.server.resourcemanager.appsummary.logger=${hadoop.root.logger}
+#yarn.server.resourcemanager.appsummary.logger=INFO,RMSUMMARY
+
+# To enable AppSummaryLogging for the RM,
+# set yarn.server.resourcemanager.appsummary.logger to
+# <LEVEL>,RMSUMMARY in hadoop-env.sh
+
+# Appender for ResourceManager Application Summary Log
+# Requires the following properties to be set
+#    - hadoop.log.dir (Hadoop Log directory)
+#    - yarn.server.resourcemanager.appsummary.log.file (resource manager app summary log filename)
+#    - yarn.server.resourcemanager.appsummary.logger (resource manager app summary log level and appender)
+
+log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=${yarn.server.resourcemanager.appsummary.logger}
+log4j.additivity.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=false
+log4j.appender.RMSUMMARY=org.apache.log4j.RollingFileAppender
+log4j.appender.RMSUMMARY.File=${hadoop.log.dir}/${yarn.server.resourcemanager.appsummary.log.file}
+log4j.appender.RMSUMMARY.MaxFileSize=256MB
+log4j.appender.RMSUMMARY.MaxBackupIndex=20
+log4j.appender.RMSUMMARY.layout=org.apache.log4j.PatternLayout
+log4j.appender.RMSUMMARY.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+
+#
+# YARN ResourceManager audit logging
+#
+rm.audit.logger=INFO,NullAppender
+rm.audit.log.maxfilesize=256MB
+rm.audit.log.maxbackupindex=20
+log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger=${rm.audit.logger}
+log4j.additivity.org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger=false
+log4j.appender.RMAUDIT=org.apache.log4j.RollingFileAppender
+log4j.appender.RMAUDIT.File=${hadoop.log.dir}/rm-audit.log
+log4j.appender.RMAUDIT.layout=org.apache.log4j.PatternLayout
+log4j.appender.RMAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.RMAUDIT.MaxFileSize=${rm.audit.log.maxfilesize}
+log4j.appender.RMAUDIT.MaxBackupIndex=${rm.audit.log.maxbackupindex}
+
+#
+# YARN NodeManager audit logging
+#
+nm.audit.logger=INFO,NullAppender
+nm.audit.log.maxfilesize=256MB
+nm.audit.log.maxbackupindex=20
+log4j.logger.org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger=${nm.audit.logger}
+log4j.additivity.org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger=false
+log4j.appender.NMAUDIT=org.apache.log4j.RollingFileAppender
+log4j.appender.NMAUDIT.File=${hadoop.log.dir}/nm-audit.log
+log4j.appender.NMAUDIT.layout=org.apache.log4j.PatternLayout
+log4j.appender.NMAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.NMAUDIT.MaxFileSize=${nm.audit.log.maxfilesize}
+log4j.appender.NMAUDIT.MaxBackupIndex=${nm.audit.log.maxbackupindex}
+
+# HS audit log configs
+#mapreduce.hs.audit.logger=INFO,HSAUDIT
+#log4j.logger.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=${mapreduce.hs.audit.logger}
+#log4j.additivity.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=false
+#log4j.appender.HSAUDIT=org.apache.log4j.DailyRollingFileAppender
+#log4j.appender.HSAUDIT.File=${hadoop.log.dir}/hs-audit.log
+#log4j.appender.HSAUDIT.layout=org.apache.log4j.PatternLayout
+#log4j.appender.HSAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+#log4j.appender.HSAUDIT.DatePattern=.yyyy-MM-dd
+
+# Http Server Request Logs
+#log4j.logger.http.requests.namenode=INFO,namenoderequestlog
+#log4j.appender.namenoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender
+#log4j.appender.namenoderequestlog.Filename=${hadoop.log.dir}/jetty-namenode-yyyy_mm_dd.log
+#log4j.appender.namenoderequestlog.RetainDays=3
+
+#log4j.logger.http.requests.datanode=INFO,datanoderequestlog
+#log4j.appender.datanoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender
+#log4j.appender.datanoderequestlog.Filename=${hadoop.log.dir}/jetty-datanode-yyyy_mm_dd.log
+#log4j.appender.datanoderequestlog.RetainDays=3
+
+#log4j.logger.http.requests.resourcemanager=INFO,resourcemanagerrequestlog
+#log4j.appender.resourcemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender
+#log4j.appender.resourcemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-resourcemanager-yyyy_mm_dd.log
+#log4j.appender.resourcemanagerrequestlog.RetainDays=3
+
+#log4j.logger.http.requests.jobhistory=INFO,jobhistoryrequestlog
+#log4j.appender.jobhistoryrequestlog=org.apache.hadoop.http.HttpRequestLogAppender
+#log4j.appender.jobhistoryrequestlog.Filename=${hadoop.log.dir}/jetty-jobhistory-yyyy_mm_dd.log
+#log4j.appender.jobhistoryrequestlog.RetainDays=3
+
+#log4j.logger.http.requests.nodemanager=INFO,nodemanagerrequestlog
+#log4j.appender.nodemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender
+#log4j.appender.nodemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-nodemanager-yyyy_mm_dd.log
+#log4j.appender.nodemanagerrequestlog.RetainDays=3
+
+# WebHdfs request log on datanodes
+# Specify -Ddatanode.webhdfs.logger=INFO,HTTPDRFA on datanode startup to
+# direct the log to a separate file.
+#datanode.webhdfs.logger=INFO,console
+#log4j.logger.datanode.webhdfs=${datanode.webhdfs.logger}
+#log4j.appender.HTTPDRFA=org.apache.log4j.DailyRollingFileAppender
+#log4j.appender.HTTPDRFA.File=${hadoop.log.dir}/hadoop-datanode-webhdfs.log
+#log4j.appender.HTTPDRFA.layout=org.apache.log4j.PatternLayout
+#log4j.appender.HTTPDRFA.layout.ConversionPattern=%d{ISO8601} %m%n
+#log4j.appender.HTTPDRFA.DatePattern=.yyyy-MM-dd
+
+
+# Appender for viewing information for errors and warnings
+yarn.ewma.cleanupInterval=300
+yarn.ewma.messageAgeLimitSeconds=86400
+yarn.ewma.maxUniqueMessages=250
+log4j.appender.EWMA=org.apache.hadoop.yarn.util.Log4jWarningErrorMetricsAppender
+log4j.appender.EWMA.cleanupInterval=${yarn.ewma.cleanupInterval}
+log4j.appender.EWMA.messageAgeLimitSeconds=${yarn.ewma.messageAgeLimitSeconds}
+log4j.appender.EWMA.maxUniqueMessages=${yarn.ewma.maxUniqueMessages}
+
+#
+# Fair scheduler state dump
+#
+# Use following logger to dump the state to a separate file
+
+#log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler.statedump=DEBUG,FSSTATEDUMP
+#log4j.additivity.org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler.statedump=false
+#log4j.appender.FSSTATEDUMP=org.apache.log4j.RollingFileAppender
+#log4j.appender.FSSTATEDUMP.File=${hadoop.log.dir}/fairscheduler-statedump.log
+#log4j.appender.FSSTATEDUMP.layout=org.apache.log4j.PatternLayout
+#log4j.appender.FSSTATEDUMP.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+#log4j.appender.FSSTATEDUMP.MaxFileSize=${hadoop.log.maxfilesize}
+#log4j.appender.FSSTATEDUMP.MaxBackupIndex=${hadoop.log.maxbackupindex}
+
+# Log levels of third-party libraries
+log4j.logger.org.apache.commons.beanutils=WARN
\ No newline at end of file
diff --git a/envs/s3-spark-delta-sharing-minio/docker-compose.yml b/envs/s3-spark-delta-sharing-minio/docker-compose.yml
new file mode 100644
index 0000000..2d75eb1
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/docker-compose.yml
@@ -0,0 +1,108 @@
+version: '3'
+
+services:
+  airflow:
+    image: docker-whirl-airflow:py-${PYTHON_VERSION}-local
+    command: ["singlemachine"]
+    ports:
+      - '5000:5000'  # HTTP (Airflow Web UI)
+    env_file:
+      - .whirl.env
+    environment:
+      - AIRFLOW__API__AUTH_BACKEND
+    volumes:
+      - ${DAG_FOLDER}:/opt/airflow/dags/$PROJECTNAME
+      - ${ENVIRONMENT_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/env.d/
+      - ${DAG_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/dag.d/
+      - ${MOCK_DATA_FOLDER}:/mock-data
+    depends_on:
+      - s3server
+      - sparkmaster
+    links:
+      - s3server:${DEMO_BUCKET}.s3server
+
+
+  s3server:
+    image: minio/minio:RELEASE.2021-08-31T05-46-54Z
+    command:
+      - server
+      - "--address=0.0.0.0:9000"
+      - "/tmp/minio"
+    ports:
+      - 9000:9000
+    environment:
+      - MINIO_ACCESS_KEY=${AWS_ACCESS_KEY_ID}
+      - MINIO_SECRET_KEY=${AWS_SECRET_ACCESS_KEY}
+    volumes:
+      - ./.s3-mount:/tmp/minio
+
+  delta:
+    image: deltaio/delta-sharing-server:${DELTA_SHARING_VERSION}
+    command: ["--config", "/opt/docker/conf/delta-sharing.yml"]
+    ports:
+      - 38080:8080
+    environment:
+      - HADOOP_ROOT_LOGGER=ERROR,console
+      - JAVA_OPTS=-Dlog4j.debug=false -Dlog4j.logLevel=INFO -Dlog4j.configuration=file:///opt/docker/conf/log4j.properties -Dlog4j.configurationFile=file:///opt/docker/conf/log4j.properties
+    volumes:
+      - ./config/:/opt/docker/conf/
+
+  sparkmaster:
+    build:
+      context: ${DOCKER_CONTEXT_FOLDER}/aws-spark
+      dockerfile: Dockerfile
+      args:
+        - SPARK_VERSION=${SPARK_VERSION}
+    environment:
+      - SPARK_NO_DAEMONIZE=true
+      - AWS_ACCESS_KEY_ID
+      - AWS_SECRET_ACCESS_KEY
+      - AWS_SERVER
+      - AWS_PORT
+    ports:
+      - 7077:7077
+      - 18080:8080
+    entrypoint:
+      - /usr/spark/sbin/start-master.sh
+    links:
+      - s3server:${DEMO_BUCKET}.s3server
+
+  sparkworker:
+    build:
+      context: ${DOCKER_CONTEXT_FOLDER}/aws-spark
+      dockerfile: Dockerfile
+      args:
+        - SPARK_VERSION=${SPARK_VERSION}
+    environment:
+      - SPARK_NO_DAEMONIZE=true
+      - AWS_ACCESS_KEY_ID
+      - AWS_SECRET_ACCESS_KEY
+      - AWS_SERVER
+      - AWS_PORT
+    ports:
+      - 18081:8081
+    entrypoint:
+      - /usr/spark/sbin/start-slave.sh
+      - spark://sparkmaster:7077
+      - "-m"
+      - "8G"
+    depends_on:
+      - sparkmaster
+    links:
+      - s3server:${DEMO_BUCKET}.s3server
+
+  sparkshell:
+    build:
+      context: ${DOCKER_CONTEXT_FOLDER}/aws-spark
+      dockerfile: Dockerfile
+      args:
+        - SPARK_VERSION=${SPARK_VERSION}
+    environment:
+      - SPARK_NO_DAEMONIZE=true
+    entrypoint:
+      - /usr/bin/tail
+      - "-f"
+      - /dev/null
+    depends_on:
+      - sparkmaster
+
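Side note, not part of the patch: work lands on this cluster by pointing a driver at the master published above. A sketch of a manual smoke test from inside the sparkshell container — the /usr/spark/bin path is inferred from the start-master.sh entrypoint and is an assumption, while the spark.hadoop.fs.s3a.* keys are standard Hadoop S3A configuration mirroring core-site.xml:

    docker-compose exec sparkshell /usr/spark/bin/spark-shell \
      --master spark://sparkmaster:7077 \
      --conf spark.hadoop.fs.s3a.endpoint=http://s3server:9000 \
      --conf spark.hadoop.fs.s3a.access.key=qwerty \
      --conf spark.hadoop.fs.s3a.secret.key=qwerty123 \
      --conf spark.hadoop.fs.s3a.path.style.access=true
    # then read/write s3a://demo-s3-output/... paths interactively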
diff --git a/envs/s3-spark-delta-sharing-minio/whirl.setup.d/01_add_connection_s3.sh b/envs/s3-spark-delta-sharing-minio/whirl.setup.d/01_add_connection_s3.sh
new file mode 100644
index 0000000..d8d8e3b
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/whirl.setup.d/01_add_connection_s3.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+echo "=================="
+echo "== Configure S3 =="
+echo "=================="
+
+pip install awscli awscli-plugin-endpoint
+
+echo -e "$AWS_ACCESS_KEY_ID\n$AWS_SECRET_ACCESS_KEY\n\n" | aws configure
+aws configure set plugins.endpoint awscli_plugin_endpoint
+aws configure set default.s3.endpoint_url http://${AWS_SERVER}:${AWS_PORT}
+aws configure set default.s3api.endpoint_url http://${AWS_SERVER}:${AWS_PORT}
+
+echo "======================"
+echo "== Create S3 Bucket =="
+echo "======================"
+while [[ "$(curl -s -o /dev/null -w ''%{http_code}'' http://${AWS_SERVER}:${AWS_PORT}/minio/health/live)" != "200" ]]; do
+  echo "Waiting for ${AWS_SERVER} to come up on port ${AWS_PORT}..."
+  sleep 2;
+done
+
+echo "creating bucket"
+aws s3api create-bucket --bucket ${DEMO_BUCKET}
diff --git a/envs/s3-spark-delta-sharing-minio/whirl.setup.d/02_add_spark_config.sh b/envs/s3-spark-delta-sharing-minio/whirl.setup.d/02_add_spark_config.sh
new file mode 100644
index 0000000..1b19ad1
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/whirl.setup.d/02_add_spark_config.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+echo "============================"
+echo "======== Add java =========="
+echo "============================"
+
+sudo apt-get update && sudo apt-get install -y openjdk-11-jre
+export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
+
+echo "============================"
+echo "== Configure Spark config =="
+echo "============================"
+airflow connections add spark_default \
+    --conn-type spark \
+    --conn-host "spark://sparkmaster:7077" \
+    --conn-extra "{\"queue\": \"root.default\", \"deploy-mode\": \"client\"}"
+
+SDK_AWS_VERSION=1.11.563
+HADOOP_AWS_VERSION=3.2.0
+
+POSTGRES_JDBC_CHECKSUM=7ffa46f8c619377cdebcd17721b6b21ecf6659850179f96fec3d1035cf5a0cdc
+SDK_AWS_CHECKSUM=b323857424e133b44c1156a184dc3a83fa152b656f2e320a71b5637a854822d5
+HADOOP_AWS_CHECKSUM=ceac8724f8bb47d2f039eaecf4ee147623b46e4bbf26ddf73a9bb8808743655e
+
+pip install pyspark==${SPARK_VERSION}
+pip install apache-airflow-providers-apache-spark
+
+export SPARK_HOME=$(python ~/.local/bin/find_spark_home.py)
+echo "-------------------------------"
+echo "SPARK_HOME set to ${SPARK_HOME}"
+echo "-------------------------------"
+
+curl -o ${SPARK_HOME}/jars/postgresql-42.2.5.jar https://jdbc.postgresql.org/download/postgresql-42.2.5.jar && \
+    echo "$POSTGRES_JDBC_CHECKSUM ${SPARK_HOME}/jars/postgresql-42.2.5.jar" | sha256sum -c -
+
+curl -o ${SPARK_HOME}/jars/aws-java-sdk-bundle-${SDK_AWS_VERSION}.jar https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${SDK_AWS_VERSION}/aws-java-sdk-bundle-${SDK_AWS_VERSION}.jar && \
+    echo "$SDK_AWS_CHECKSUM ${SPARK_HOME}/jars/aws-java-sdk-bundle-${SDK_AWS_VERSION}.jar" | sha256sum -c -
+
+curl -o ${SPARK_HOME}/jars/hadoop-aws-${HADOOP_AWS_VERSION}.jar https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_AWS_VERSION}/hadoop-aws-${HADOOP_AWS_VERSION}.jar && \
+    echo "$HADOOP_AWS_CHECKSUM ${SPARK_HOME}/jars/hadoop-aws-${HADOOP_AWS_VERSION}.jar" | sha256sum -c -
diff --git a/envs/s3-spark-delta-sharing-minio/whirl.setup.d/03_add_delta_config.sh b/envs/s3-spark-delta-sharing-minio/whirl.setup.d/03_add_delta_config.sh
new file mode 100644
index 0000000..20f98ff
--- /dev/null
+++ b/envs/s3-spark-delta-sharing-minio/whirl.setup.d/03_add_delta_config.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+echo "=============================================="
+echo "== Configure Delta and Delta Sharing config =="
+echo "=============================================="
+sudo apt-get update && sudo apt-get install -y git
+
+pip install delta-spark==${DELTA_VERSION}
+pip install delta-sharing==${DELTA_SHARING_VERSION}
+# Install fsspec from PR until this is properly released
+# pip install git+https://github.com/intake/filesystem_spec.git@refs/pull/718/head
+
+echo '{
+  "shareCredentialsVersion": 1,
+  "endpoint": "http://delta:8080/delta-sharing/",
+  "bearerToken": "authTokenDeltaSharing432"
+}' > /opt/airflow/delta.profile
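For reference (not part of the patch): the profile written above is what a DAG task can hand to the delta-sharing Python client installed in the same script. A minimal sketch of reading one of the shared tables from inside the Airflow container — the share/schema/table path mirrors config/delta-sharing.yml, and load_as_pandas is the call provided by the delta-sharing package:

    python -c 'import delta_sharing; print(delta_sharing.load_as_pandas("/opt/airflow/delta.profile#airflow.spark.cars").head())'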
diff --git a/envs/s3-spark-delta-sharing/docker-compose.yml b/envs/s3-spark-delta-sharing/docker-compose.yml
index 0350bf5..cd62fcf 100644
--- a/envs/s3-spark-delta-sharing/docker-compose.yml
+++ b/envs/s3-spark-delta-sharing/docker-compose.yml
@@ -8,6 +8,8 @@ services:
       - '5000:5000'  # HTTP (Airflow Web UI)
     env_file:
       - .whirl.env
+    environment:
+      - AIRFLOW__API__AUTH_BACKEND
     volumes:
       - ${DAG_FOLDER}:/opt/airflow/dags/$PROJECTNAME
       - ${ENVIRONMENT_FOLDER}/whirl.setup.d:${WHIRL_SETUP_FOLDER}/env.d/
diff --git a/examples/spark-delta-sharing/.whirl.env b/examples/spark-delta-sharing/.whirl.env
index 1aa43ae..95adcb1 100644
--- a/examples/spark-delta-sharing/.whirl.env
+++ b/examples/spark-delta-sharing/.whirl.env
@@ -1,3 +1,4 @@
+AIRFLOW_VERSION=2.1.3
 WHIRL_ENVIRONMENT=s3-spark-delta-sharing
 MOCK_DATA_FOLDER=$(pwd)/mock-data
 
diff --git a/whirl b/whirl
index 617d7d6..a575113 100755
--- a/whirl
+++ b/whirl
@@ -143,6 +143,9 @@ start() {
     done
     if [[ "$(curl -s -o /dev/null -w %\{http_code\} -X PATCH -H 'Content-Type: application/json' --user admin:admin "http://localhost:${AIRFLOW_UI_PORT}/api/v1/dags/${DAG_ID}?update_mask=is_paused" -d '{ "is_paused": false }')" != "200" ]]; then
       echo "Unable to unpause dag with id ${DAG_ID}."
+      echo "Command: curl -X PATCH -H 'Content-Type: application/json' --user admin:admin \"http://localhost:${AIRFLOW_UI_PORT}/api/v1/dags/${DAG_ID}?update_mask=is_paused\" -d '{ \"is_paused\": false }'"
+      echo "Output:"
+      curl -X PATCH -H 'Content-Type: application/json' --user admin:admin "http://localhost:${AIRFLOW_UI_PORT}/api/v1/dags/${DAG_ID}?update_mask=is_paused" -d '{ "is_paused": false }'
       stop
       exit 3
     else
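A closing note, not part of the patch: passing AIRFLOW__API__AUTH_BACKEND through to the containers only has effect if the variable is actually set somewhere. For the --user admin:admin REST call above to authenticate against Airflow 2.x, the usual choice is the basic-auth backend, for example in the environment's .whirl.env (the value below is an assumption, not included in this diff):

    AIRFLOW__API__AUTH_BACKEND=airflow.api.auth.backend.basic_auth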