Skip to content

Commit

Permalink
Add env with minio as S3 container
Browse files Browse the repository at this point in the history
  • Loading branch information
krisgeus committed Sep 3, 2021
1 parent 055a45d commit 4df7a92
Show file tree
Hide file tree
Showing 15 changed files with 672 additions and 0 deletions.
1 change: 1 addition & 0 deletions envs/s3-spark-delta-sharing-minio/.gitignore
@@ -0,0 +1 @@
.s3-mount
Empty file.
16 changes: 16 additions & 0 deletions envs/s3-spark-delta-sharing-minio/.whirl.env
@@ -0,0 +1,16 @@
AWS_ACCESS_KEY_ID=qwerty
AWS_SECRET_ACCESS_KEY=qwerty123
AWS_SERVER=s3server
AWS_PORT=9000
DEMO_BUCKET=demo-s3-output

# Spark variables
SPARK_VERSION=3.1.1
DELTA_VERSION=1.0.0
DELTA_SHARING_VERSION=0.2.0

# Airflow env vars
AIRFLOW__CORE__EXPOSE_CONFIG=True
AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
AIRFLOW__CORE__LOAD_EXAMPLES=False
AIRFLOW__WEBSERVER__EXPOSE_CONFIG=True
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

# Verify that Docker reports at least 12.5GB of total memory.
#
# Reads the total memory (in bytes) from `docker info`.
#   - Below the threshold: prints an error and exits the shell with status 12.
#   - Value cannot be determined (docker command fails or returns something
#     that is not a plain integer): prints a warning on stderr and skips the
#     check instead of tripping over a `[: integer expression expected` error.
#
# Sets the globals MEM_12_POINT_5_GB and AVAILABLE_MEM (kept global so that
# anything sourcing this script sees the same variables as before).
function check_docker_mem() {
    echo "==============================================="
    echo "== Check if there is enough available memory =="
    echo "==============================================="
    # 12.5 GiB expressed in bytes; computed as 25 GiB / 2 to stay in integers.
    MEM_12_POINT_5_GB=$(((1024 * 1024 * 1024 * 25)/2))
    AVAILABLE_MEM=$(docker info -f "{{json .MemTotal}}")

    # Guard against empty / non-numeric output before doing integer comparisons.
    if ! [[ "${AVAILABLE_MEM}" =~ ^[0-9]+$ ]]; then
        echo "WARNING: could not determine Docker memory (got '${AVAILABLE_MEM}'). Skipping memory check." >&2
        return 0
    fi

    if [ "${AVAILABLE_MEM}" -lt "${MEM_12_POINT_5_GB}" ]; then
        # Format with one decimal using shell arithmetic only (no `bc` needed).
        # 10# forces base 10 so a value with leading zeros is not read as octal.
        local tenths_of_gb=$(( 10#${AVAILABLE_MEM} * 10 / 1024 / 1024 / 1024 ))
        echo "NOT ENOUGH MEMORY AVAILABLE ($(( tenths_of_gb / 10 )).$(( tenths_of_gb % 10 ))GB). Need at least 12.5GB"
        exit 12;
    fi
}

check_docker_mem
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

# Remove everything inside the local S3 mount directory (`../.s3-mount`
# relative to the directory containing this script), keeping the directory
# itself.
#
# Sets the global S3_MOUNT_DIR (left global, as before, for anything that
# sources this script). All path expansions are quoted so a checkout path
# containing spaces cannot be word-split in front of `find ... -delete`.
function empty_s3_dir() {
    echo "================================"
    echo "== Cleanup local S3 mount dir =="
    echo "================================"
    # Declare and assign separately so `local` does not mask dirname's status.
    local SCRIPT_DIR
    SCRIPT_DIR=$( dirname "${BASH_SOURCE[0]}" )
    S3_MOUNT_DIR="${SCRIPT_DIR}/../.s3-mount"

    # Nothing to clean when the mount dir is absent; avoids an `ls` error
    # followed by a misleading "is empty" message.
    if [ ! -d "${S3_MOUNT_DIR}" ]; then
        echo "${S3_MOUNT_DIR} is not a directory. Nothing to clean. Continue"
        return 0
    fi

    if [ -n "$(ls -A "${S3_MOUNT_DIR}")" ]; then
        echo "${S3_MOUNT_DIR} is not empty. Clearing NOW!!"
        # -mindepth 1 keeps the mount directory itself and deletes only its contents.
        find "${S3_MOUNT_DIR}" -mindepth 1 -delete
    else
        echo "${S3_MOUNT_DIR} is empty. Continue"
    fi
}

empty_s3_dir
60 changes: 60 additions & 0 deletions envs/s3-spark-delta-sharing-minio/config/core-site.xml
@@ -0,0 +1,60 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!--
Hadoop filesystem settings for this environment's S3 server.
The access key, secret key and endpoint host:port below carry the same values
as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SERVER and AWS_PORT in this
environment's .whirl.env; keep the two files in sync when changing either.
-->
<configuration>
<!-- Credentials under the fs.s3.* property names. -->
<property>
<name>fs.s3.awsAccessKeyId</name>
<value>qwerty</value>
</property>

<property>
<name>fs.s3.awsSecretAccessKey</name>
<value>qwerty123</value>
</property>

<!-- Same credentials under the fs.s3n.* property names. -->
<property>
<name>fs.s3n.awsAccessKeyId</name>
<value>qwerty</value>
</property>

<property>
<name>fs.s3n.awsSecretAccessKey</name>
<value>qwerty123</value>
</property>

<!-- Same credentials under the fs.s3a.* property names. -->
<property>
<name>fs.s3a.access.key</name>
<value>qwerty</value>
</property>

<property>
<name>fs.s3a.secret.key</name>
<value>qwerty123</value>
</property>

<!-- SSL is switched off for S3A connections. -->
<property>
<name>fs.s3a.connection.ssl.enabled</name>
<value>false</value>
</property>

<!-- S3A endpoint: host and port of the S3 server container. -->
<property>
<name>fs.s3a.endpoint</name>
<value>s3server:9000</value>
</property>

<!-- NOTE(review): presumably required because the bucket is addressed via the
     path on a single host rather than as a per-bucket hostname; confirm. -->
<property>
<name>fs.s3a.path.style.access</name>
<value>true</value>
</property>

<!-- Implementation class for fs.s3a. -->
<property>
<name>fs.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
</property>

<!-- fs.s3 is pointed at the same S3AFileSystem class as fs.s3a. -->
<property>
<name>fs.s3.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
</property>

</configuration>
33 changes: 33 additions & 0 deletions envs/s3-spark-delta-sharing-minio/config/delta-sharing.yml
@@ -0,0 +1,33 @@
# Delta Sharing server configuration.
# Nesting is significant: each share contains schemas, each schema contains
# tables, and each table has a name plus the location of its Delta table.

# The format version of this config file
version: 1
# Config shares/schemas/tables to share
shares:
- name: "airflow"
  schemas:
  - name: "spark"
    tables:
    - name: "table1"
      location: "s3a://demo-s3-output/output/data/demo/spark/20210614/"
    - name: "cars"
      location: "s3a://demo-s3-output/output/data/demo/spark/cars/"
    - name: "cars-all"
      location: "s3a://demo-s3-output/output/data/demo/spark/cars-all/"
    - name: "cars-python"
      location: "s3a://demo-s3-output/output/data/demo/spark/cars-python/"
# Set the host name that the server will use
host: "0.0.0.0"
# Set the port that the server will listen on
port: 8080
# Set the url prefix for the REST APIs
endpoint: "/delta-sharing"
# Set the timeout of S3 presigned url in seconds
preSignedUrlTimeoutSeconds: 900
# How many tables to cache in the server
deltaTableCacheSize: 10
# Whether we can accept working with a stale version of the table. This is useful when sharing
# static tables that will never be changed.
stalenessAcceptable: false
# Whether to evaluate user provided `predicateHints`
evaluatePredicateHints: false
# Bearer token that clients must present to the server
authorization:
  bearerToken: authTokenDeltaSharing432

0 comments on commit 4df7a92

Please sign in to comment.