[HUDI-4982] Add Utilities and Utilities Slim + Spark Bundle testing to GH Actions (apache#7005)


Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
2 people authored and fengjian committed Apr 5, 2023
1 parent f9a48fc commit f50a59b
Showing 12 changed files with 274 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bot.yml
@@ -72,7 +72,7 @@ jobs:
if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
run: |
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh $HUDI_VERSION
./packaging/bundle-validation/ci_run.sh $HUDI_VERSION
- name: Spark SQL Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
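To iterate on this step locally, one can approximate the CI job by exporting the variables ci_run.sh reads from the Actions environment. A minimal sketch, assuming a repo-root working directory, already-built bundle jars under packaging/*/target, and the spark3.3 profile:

export GITHUB_WORKSPACE=$(pwd)   # normally set by the Actions runner
export SPARK_PROFILE=spark3.3    # assumed; any profile mapped in ci_run.sh works
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
./packaging/bundle-validation/ci_run.sh $HUDI_VERSION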
29 changes: 29 additions & 0 deletions packaging/bundle-validation/Dockerfile
@@ -0,0 +1,29 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

ARG IMAGE_TAG=spark313hive313
FROM apachehudi/hudi-ci-bundle-validation-base:$IMAGE_TAG

# configure the stack
ADD . .
ENV HUDI_CONF_DIR=$WORKDIR/conf
RUN cp conf/hive-site.xml $HIVE_HOME/conf/
RUN cp conf/hive-site.xml $SPARK_HOME/conf/
RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
RUN cp conf/spark-defaults.conf $SPARK_HOME/conf/
RUN if [[ $SPARK_HOME == *"spark-3.2"* ]] || [[ $SPARK_HOME == *"spark-3.3"* ]]; \
then printf "\nspark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog\n" >> $SPARK_HOME/conf/spark-defaults.conf; fi
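Since the HoodieCatalog entry is appended only when $SPARK_HOME matches spark-3.2 or spark-3.3, a quick sanity check that the conditional fired in a built image might look like this (image tag assumed):

docker run --rm hudi-ci-bundle-validation:spark330hive313 \
bash -c 'grep spark.sql.catalog.spark_catalog $SPARK_HOME/conf/spark-defaults.conf'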
@@ -18,8 +18,8 @@ FROM adoptopenjdk/openjdk8:alpine

RUN apk add --no-cache --upgrade bash

RUN mkdir /opt/hudi-bundles
ENV WORKDIR=/opt/hudi-bundles
RUN mkdir /opt/bundle-validation
ENV WORKDIR=/opt/bundle-validation
WORKDIR $WORKDIR

ARG HADOOP_VERSION=2.7.7
@@ -47,10 +47,3 @@ RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK
&& tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
&& rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION

RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
COPY hive-site.xml $HIVE_HOME/conf/
RUN ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
COPY spark-defaults.conf $SPARK_HOME/conf/
COPY validate.scala .
COPY validate.sh .
@@ -18,15 +18,16 @@
# under the License.

# Note:
# this script is to run by GitHub Actions CI tasks from the project root directory
# and contains environment-specific variables
#
# This script is to
# - set the corresponding variables based on CI job's build profiles
# - prepare Hudi bundle jars for mounting into Docker container for validation
# - prepare test datasets for mounting into Docker container for validation
#
# This script is run by GitHub Actions CI tasks from the project root directory
# and contains the CI environment-specific variables.

HUDI_VERSION=$1
# to store bundle jars for validation
mkdir ${GITHUB_WORKSPACE}/jars
cp packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar ${GITHUB_WORKSPACE}/jars
echo 'Validating jars below:'
ls -l ${GITHUB_WORKSPACE}/jars

# choose versions based on build profiles
if [[ ${SPARK_PROFILE} == 'spark2.4' ]]; then
@@ -59,13 +60,33 @@ elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
IMAGE_TAG=spark330hive313
fi

cd packaging/bundle-validation/spark-write-hive-sync || exit 1
# Copy bundle jars to temp dir for mounting
TMP_JARS_DIR=/tmp/jars/$(date +%s)
mkdir -p $TMP_JARS_DIR
cp ${GITHUB_WORKSPACE}/packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-slim-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
echo 'Validating jars below:'
ls -l $TMP_JARS_DIR

# Copy test dataset
TMP_DATA_DIR=/tmp/data/$(date +%s)
mkdir -p $TMP_DATA_DIR/stocks/data
cp ${GITHUB_WORKSPACE}/docker/demo/data/*.json $TMP_DATA_DIR/stocks/data/
cp ${GITHUB_WORKSPACE}/docker/demo/config/schema.avsc $TMP_DATA_DIR/stocks/

# build docker image
cd ${GITHUB_WORKSPACE}/packaging/bundle-validation || exit 1
docker build \
--build-arg HADOOP_VERSION=$HADOOP_VERSION \
--build-arg HIVE_VERSION=$HIVE_VERSION \
--build-arg DERBY_VERSION=$DERBY_VERSION \
--build-arg SPARK_VERSION=$SPARK_VERSION \
--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
--build-arg IMAGE_TAG=$IMAGE_TAG \
-t hudi-ci-bundle-validation:$IMAGE_TAG \
.
docker run -v ${GITHUB_WORKSPACE}/jars:/opt/hudi-bundles/jars -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh

# run validation script in docker
docker run -v $TMP_JARS_DIR:/opt/bundle-validation/jars -v $TMP_DATA_DIR:/opt/bundle-validation/data \
-i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
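Taken together, the two volume mounts give the container the layout that validate.sh (added below) expects, roughly:

/opt/bundle-validation/
├── jars/              # hudi-spark, hudi-utilities, hudi-utilities-slim bundle jars
└── data/
    └── stocks/
        ├── schema.avsc
        └── data/      # *.json batches copied from docker/demo/data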
@@ -15,6 +15,8 @@
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
hoodie.upsert.shuffle.parallelism 8
hoodie.insert.shuffle.parallelism 8
hoodie.delete.shuffle.parallelism 8
hoodie.bulkinsert.shuffle.parallelism 8
hoodie.finalize.write.parallelism 8
22 changes: 22 additions & 0 deletions packaging/bundle-validation/conf/spark-defaults.conf
@@ -0,0 +1,22 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
spark.default.parallelism 8
spark.sql.shuffle.partitions 8
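These container-wide defaults keep parallelism small for CI. Every spark-shell and spark-submit launched from this $SPARK_HOME picks them up automatically, so they are equivalent to passing explicit flags such as:

$SPARK_HOME/bin/spark-shell \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--conf spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension \
--conf spark.sql.shuffle.partitions=8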
30 changes: 0 additions & 30 deletions packaging/bundle-validation/spark-write-hive-sync/validate.sh

This file was deleted.

23 changes: 23 additions & 0 deletions packaging/bundle-validation/utilities/hoodieapp.properties
@@ -0,0 +1,23 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

hoodie.datasource.write.recordkey.field=key
hoodie.datasource.write.partitionpath.field=date
hoodie.datasource.write.precombine.field=ts
hoodie.metadata.enable=true
hoodie.deltastreamer.source.dfs.root=file:///opt/bundle-validation/data/stocks/data
hoodie.deltastreamer.schemaprovider.target.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc
hoodie.deltastreamer.schemaprovider.source.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc
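The record key, partition path, and precombine configs above assume every input record carries key, date, and ts fields; an illustrative (hypothetical, not taken from the dataset) stocks record would be shaped like:

{"key": "AAPL_2018-08-31 09", "date": "2018/08/31", "ts": "2018-08-31 09:30:00", "symbol": "AAPL", "close": 227.3}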
25 changes: 25 additions & 0 deletions packaging/bundle-validation/utilities/validate.scala
@@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

val hudiDf = spark.read.format("hudi").load("/tmp/hudi-utilities-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/data/stocks/data")
val hudiCount = hudiDf.select("date", "key").distinct.count
val srcCount = inputDf.select("date", "key").distinct.count
if (hudiCount == srcCount) System.exit(0)
println(s"Counts don't match hudiCount: $hudiCount, srcCount: $srcCount")
System.exit(1)
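The script exits 0 only when the distinct (date, key) counts in the Hudi table and the source JSON agree. validate.sh (below) feeds it to spark-shell via -i; for the slim-bundle run, the assembled command comes out roughly as:

$SPARK_HOME/bin/spark-shell --jars /opt/bundle-validation/jars/spark.jar \
/opt/bundle-validation/jars/utilities-slim.jar \
-i /opt/bundle-validation/utilities/validate.scala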
137 changes: 137 additions & 0 deletions packaging/bundle-validation/validate.sh
@@ -0,0 +1,137 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#################################################################################################
# NOTE: this script runs inside the hudi-ci-bundle-validation container
# $WORKDIR/jars/ is mounted from a host directory containing the bundle jars to validate
# $WORKDIR/data/ is mounted from a host directory containing test data, laid out as
# - <dataset name>/schema.avsc
# - <dataset name>/data/<data files>
#################################################################################################

WORKDIR=/opt/bundle-validation
JARS_DIR=${WORKDIR}/jars
# symlink the bundle jars to easier-to-use names
ln -sf $JARS_DIR/hudi-spark*.jar $JARS_DIR/spark.jar
ln -sf $JARS_DIR/hudi-utilities-bundle*.jar $JARS_DIR/utilities.jar
ln -sf $JARS_DIR/hudi-utilities-slim*.jar $JARS_DIR/utilities-slim.jar
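# (each glob above is expected to match exactly one jar, the one copied in by ci_run.sh;
# multiple matches would make ln fail, since each link target is a single file name)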


##
# Function to test the spark bundle with hive sync.
#
# env vars (defined in container):
# HIVE_HOME: path to the hive directory
# DERBY_HOME: path to the derby directory
# SPARK_HOME: path to the spark directory
##
test_spark_bundle () {
echo "::warning::validate.sh setting up hive metastore for spark bundle validation"

$DERBY_HOME/bin/startNetworkServer -h 0.0.0.0 &
$HIVE_HOME/bin/hiveserver2 &
echo "::warning::validate.sh hive metastore setup complete. Testing"
$SPARK_HOME/bin/spark-shell --jars $JARS_DIR/spark.jar < $WORKDIR/spark/validate.scala
if [ "$?" -ne 0 ]; then
echo "::error::validate.sh failed hive testing"
exit 1
fi
echo "::warning::validate.sh spark bundle validation successful"
}


##
# Function to test the utilities bundle and utilities slim bundle + spark bundle.
# It runs deltastreamer and then verifies that deltastreamer worked correctly.
#
# 1st arg: main jar to run with spark-submit, usually it's the utilities(-slim) bundle
# 2nd arg and beyond: any additional jars to pass to --jars option
#
# env vars (defined in container):
# SPARK_HOME: path to the spark directory
##
test_utilities_bundle () {
MAIN_JAR=$1
printf -v EXTRA_JARS '%s,' "${@:2}"
EXTRA_JARS="${EXTRA_JARS%,}"
OPT_JARS=""
if [[ -n $EXTRA_JARS ]]; then
OPT_JARS="--jars $EXTRA_JARS"
fi
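# e.g. for the calls at the bottom of this script: the fat utilities bundle runs with
# MAIN_JAR=$JARS_DIR/utilities.jar and no extra jars, while the slim bundle runs with
# MAIN_JAR=$JARS_DIR/utilities-slim.jar and OPT_JARS="--jars $JARS_DIR/spark.jar"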
OUTPUT_DIR=/tmp/hudi-utilities-test/
rm -rf $OUTPUT_DIR
echo "::warning::validate.sh running deltastreamer"
$SPARK_HOME/bin/spark-submit \
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
$OPT_JARS $MAIN_JAR \
--props $WORKDIR/utilities/hoodieapp.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
--source-class org.apache.hudi.utilities.sources.JsonDFSSource \
--source-ordering-field ts --table-type MERGE_ON_READ \
--target-base-path ${OUTPUT_DIR} \
--target-table utilities_tbl --op UPSERT
if [ "$?" -ne 0 ]; then
echo "::error::validate.sh deltastreamer failed with exit code $?"
exit 1
fi
echo "::warning::validate.sh done with deltastreamer"

OUTPUT_SIZE=$(du -s ${OUTPUT_DIR} | awk '{print $1}')
if [[ -z $OUTPUT_SIZE || "$OUTPUT_SIZE" -lt "580" ]]; then
echo "::error::validate.sh deltastreamer output folder ($OUTPUT_SIZE) is smaller than minimum expected (580)"
exit 1
fi

echo "::warning::validate.sh validating deltastreamer in spark shell"
SHELL_COMMAND="$SPARK_HOME/bin/spark-shell $OPT_JARS $MAIN_JAR -i $WORKDIR/utilities/validate.scala"
echo "::debug::this is the shell command: $SHELL_COMMAND"
LOGFILE="$WORKDIR/${FUNCNAME[0]}.log"
$SHELL_COMMAND >> $LOGFILE
if [ "$?" -ne 0 ]; then
SHELL_RESULT=$(grep "Counts don't match" $LOGFILE)
echo "::error::validate.sh $SHELL_RESULT"
exit 1
fi
echo "::warning::validate.sh done validating deltastreamer in spark shell"
}


test_spark_bundle
if [ "$?" -ne 0 ]; then
exit 1
fi

if [[ $SPARK_HOME == *"spark-2.4"* ]] || [[ $SPARK_HOME == *"spark-3.1"* ]]
then
echo "::warning::validate.sh testing utilities bundle"
test_utilities_bundle $JARS_DIR/utilities.jar
if [ "$?" -ne 0 ]; then
exit 1
fi
echo "::warning::validate.sh done testing utilities bundle"
else
echo "::warning::validate.sh skip testing utilities bundle for non-spark2.4 & non-spark3.1 build"
fi

echo "::warning::validate.sh testing utilities slim bundle"
test_utilities_bundle $JARS_DIR/utilities-slim.jar $JARS_DIR/spark.jar
if [ "$?" -ne 0 ]; then
exit 1
fi
echo "::warning::validate.sh done testing utilities slim bundle"
