## Demonstration: Apache Hudi with Kafka and S3

__Purpose:__ Read JSON-formatted messages from a Kafka topic and write them to Amazon S3 as Parquet using Apache Hudi, demonstrating upserts and deletes  
__Author:__  Gary A. Stafford  
__Date:__ 2021-10-03  
__References:__  
- https://hudi.apache.org/docs/quick-start-guide/
- https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hudi-work-with-dataset.html

#### Remove or re-own `/mnt/tmp` in HDFS (run one of these commands from the master node)

```shell
# Option 1: recursively remove /mnt/tmp from HDFS
hdfs dfs -rm -r /mnt/tmp/
# or
# Option 2: recursively change the owner of /mnt/tmp to the livy user
# (presumably so the Livy-managed Spark session can write there -- confirm)
hdfs dfs -chown -R livy /mnt/tmp
```

#### Copy the Hudi and Avro JARs to HDFS (run from the master node)

```shell
# create the HDFS directory that will hold the JARs (no error if it exists)
hdfs dfs -mkdir -p /apps/hudi/lib
# copy the Hudi Spark bundle and the Spark Avro JAR from the local filesystem
# to HDFS; these are the two paths listed under "spark.jars" in %%configure
hdfs dfs -copyFromLocal /usr/lib/hudi/hudi-spark-bundle.jar /apps/hudi/lib/hudi-spark-bundle.jar
hdfs dfs -copyFromLocal /usr/lib/spark/external/lib/spark-avro.jar /apps/hudi/lib/spark-avro.jar
```

In [None]:
%%spark

In [None]:
%%configure -f
{
    "conf": {
        "spark.jars":
            "hdfs:///apps/hudi/lib/hudi-spark-bundle.jar,hdfs:///apps/hudi/lib/spark-avro.jar",
        "spark.serializer":
            "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.hive.convertMetastoreParquet":
            "false"
    }
}

In [None]:
import os
import time

import boto3
import pyspark.sql.functions as F
from ec2_metadata import ec2_metadata
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType, TimestampType

In [None]:
# name of the Kafka topic that is the source of the messages;
# used as the "subscribe" option when the topic is read

source_topic = "pagila.sales.spark.streaming"

In [None]:
# look up connection settings in AWS Systems Manager (SSM) Parameter Store

# use the region of the EC2 instance this code runs on as the default region
os.environ["AWS_DEFAULT_REGION"] = ec2_metadata.region
ssm_client = boto3.client("ssm")

# one get_parameter call per setting, keyed by the parameter's short name
params = {
    name: ssm_client.get_parameter(
        Name=f"/kafka_spark_demo/{name}"
    )["Parameter"]["Value"]
    for name in ("kafka_servers", "kafka_demo_bucket")
}

In [None]:
# batch (non-streaming) read of the Kafka topic into a DataFrame

options_read = {
    # brokers and topic
    "kafka.bootstrap.servers": params["kafka_servers"],
    "subscribe": source_topic,
    # bounded read: from the earliest to the latest available offsets
    "startingOffsets": "earliest",
    "endingOffsets": "latest",
    # TLS truststore and IAM-based SASL authentication settings
    "kafka.ssl.truststore.location": "/tmp/kafka.client.truststore.jks",
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "AWS_MSK_IAM",
    "kafka.sasl.jaas.config": "software.amazon.msk.auth.iam.IAMLoginModule required;",
    "kafka.sasl.client.callback.handler.class": "software.amazon.msk.auth.iam.IAMClientCallbackHandler",
}

df_sales = (
    spark.read
    .format("kafka")
    .options(**options_read)
    .load()
)

In [None]:
# preview the first 5 rows of df_sales
df_sales.show(5)

In [None]:
# decode the binary Kafka message value to a string and parse it as JSON

# expected payload fields, given as (name, Spark type, nullable)
schema = StructType([
    StructField(name, data_type, nullable)
    for name, data_type, nullable in (
        ("payment_id", IntegerType(), False),
        ("customer_id", IntegerType(), False),
        ("amount", FloatType(), False),
        ("payment_date", TimestampType(), False),
        ("city", StringType(), True),
        ("district", StringType(), True),
        ("country", StringType(), False),
    )
])

# the Kafka record timestamp is carried through every step so it ends up
# as a column next to the parsed payload fields
df_sales = (
    df_sales
    .selectExpr("CAST(value AS STRING)", "timestamp")
    .select(F.from_json("value", schema=schema).alias("data"), "timestamp")
    .select("data.*", "timestamp")
)

In [None]:
# preview 3 rows of df_sales without truncating long column values
df_sales.show(3, truncate=False)

In [None]:
# initial load: write all records to S3 as Parquet, partitioned by country,
# using Apache Hudi

table_name = "hudi.hudi_pagila_sales"

# no trailing slash, so paths derived as f"{base_path}/*/*" do not contain
# an empty "//" path segment
base_path = f"s3://{params['kafka_demo_bucket']}/hudi"

# write options shared by every Hudi write in this notebook
hudi_options = {
    "hoodie.table.name": table_name,
    "hoodie.datasource.write.table.name": table_name,
    # payment_id identifies a record; country is the partition column
    "hoodie.datasource.write.recordkey.field": "payment_id",
    "hoodie.datasource.write.partitionpath.field": "country",
    "hoodie.datasource.write.operation": "upsert",
    # the Kafka record timestamp is the precombine (ordering) field
    "hoodie.datasource.write.precombine.field": "timestamp",
    "hoodie.upsert.shuffle.parallelism": 2,
    "hoodie.insert.shuffle.parallelism": 2,
}

# "overwrite" mode: this first write replaces whatever is already at base_path
(
    df_sales.write
    .format("org.apache.hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save(base_path)
)

In [None]:
# load the Hudi table back from S3 and expose it to SQL cells

# the glob covers the partition folders under base_path and the files in them
df_sales_snapshot = (
    spark.read
    .format("org.apache.hudi")
    .load(f"{base_path}/*/*")
)

# temporary view name used by the %%sql cell that follows
df_sales_snapshot.createOrReplaceTempView("hudi_sales_snapshot")

In [None]:
%%sql
-- ten earliest payments from Japan in the Hudi snapshot
SELECT payment_id, payment_date, amount, city, district, country
FROM hudi_sales_snapshot
WHERE country = 'Japan'
ORDER BY payment_date
LIMIT 10

In [None]:
# build a modified copy of one payment (payment_id 16347):
# new amount of 9.99 and payment_date set to the current timestamp

df_update = (
    df_sales
    .where(F.col("payment_id") == 16347)
    .withColumn("payment_date", F.current_timestamp())
    .withColumn("amount", F.lit(9.99).cast(FloatType()))
)

In [None]:
# display the modified record for payment_id 16347

df_update.where(F.col("payment_id") == 16347).show()

In [None]:
# upsert the modified record into the existing Hudi table on S3

(
    df_update.write
    .format("org.apache.hudi")
    .options(**hudi_options)
    # set after the shared options: a later option() call replaces an
    # earlier value for the same key, so this cell's operation always wins
    .option("hoodie.datasource.write.operation", "upsert")
    # "append" mode writes into the existing table instead of replacing it
    .mode("append")
    .save(base_path)
)

In [None]:
# reload the Hudi table from S3 after the upsert and expose it to SQL cells

df_updated_sales_snapshot = (
    spark.read
    .format("org.apache.hudi")
    .option("mergeSchema", "true")
    .load(f"{base_path}/*/*")
)

# temporary view name used by the %%sql cell that follows
df_updated_sales_snapshot.createOrReplaceTempView("df_updated_sales_snapshot")

In [None]:
%%sql
-- look up payment 16347 in the reloaded snapshot
SELECT
    payment_id,
    payment_date,
    amount,
    city,
    district,
    country
FROM df_updated_sales_snapshot
WHERE payment_id = 16347

In [None]:
# delete the same record from the Hudi table on S3

(
    df_update.write
    .format("org.apache.hudi")
    .options(**hudi_options)
    # the delete-specific settings are applied after the shared options, so
    # a same-named key in hudi_options cannot silently override them
    .option("hoodie.datasource.write.operation", "upsert")
    # an upsert with the empty payload class removes the matching record
    .option("hoodie.datasource.write.payload.class",
            "org.apache.hudi.common.model.EmptyHoodieRecordPayload")
    .mode("append")
    .save(base_path)
)

In [None]:
# reload the Hudi table from S3 after the delete and expose it to SQL cells

df_updated_sales_snapshot = (
    spark.read
    .format("org.apache.hudi")
    .load(f"{base_path}/*/*")
)

# re-register the view so the %%sql cell that follows sees the fresh data
df_updated_sales_snapshot.createOrReplaceTempView("df_updated_sales_snapshot")

In [None]:
%%sql
-- look up payment 16347 again after the delete
SELECT
    payment_id,
    payment_date,
    amount,
    city,
    district,
    country
FROM df_updated_sales_snapshot
WHERE payment_id = 16347

The last statement should return no results, because the record was deleted.

