In [1]:
from pyspark.sql import SparkSession
import os

In [2]:
HUDI_SPARK_BUNDLE = os.environ.get("HUDI_SPARK_BUNDLE")
HADOOP_S3_JAR = "/opt/spark/jars/hadoop-aws-3.3.4.jar,/opt/spark/jars/aws-java-sdk-bundle-1.12.734.jar"

ALL_JARS = f"{HUDI_SPARK_BUNDLE},{HADOOP_S3_JAR}"

In [3]:
spark = SparkSession.builder \
    .appName('HudiCRUD') \
    .config("spark.jars", ALL_JARS) \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog") \
    .config("spark.sql.hive.convertMetastoreParquet", "false") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9090") \
    .config("spark.hadoop.fs.s3a.access.key", "minio") \
    .config("spark.hadoop.fs.s3a.secret.key", "minio123") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .getOrCreate()

25/08/12 09:13:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
columns = ["id","creation_date", "status", "last_update_time"]
data = [
    ("100", "2015-01-01", "unprocessed", "2015-01-01T13:51:39.340396Z"),
    ("101", "2015-01-01", "unprocessed", "2015-01-01T12:14:58.597216Z"),
    ("102", "2015-01-01", "unprocessed", "2015-01-01T13:51:40.417052Z"),
    ("103", "2015-01-01", "unprocessed", "2015-01-01T13:51:40.519832Z"),
    ("104", "2015-01-02", "unprocessed", "2015-01-01T12:15:00.512679Z"),
    ("105", "2015-01-02", "unprocessed", "2015-01-01T13:51:42.248818Z")
]

In [5]:
inputDF = spark.createDataFrame(data).toDF(*columns)
inputDF.show(truncate = False)

25/08/12 09:13:31 WARN DFSPropertiesConfiguration: Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file
25/08/12 09:13:31 WARN DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf
                                                                                

+---+-------------+-----------+---------------------------+
|id |creation_date|status     |last_update_time           |
+---+-------------+-----------+---------------------------+
|100|2015-01-01   |unprocessed|2015-01-01T13:51:39.340396Z|
|101|2015-01-01   |unprocessed|2015-01-01T12:14:58.597216Z|
|102|2015-01-01   |unprocessed|2015-01-01T13:51:40.417052Z|
|103|2015-01-01   |unprocessed|2015-01-01T13:51:40.519832Z|
|104|2015-01-02   |unprocessed|2015-01-01T12:15:00.512679Z|
|105|2015-01-02   |unprocessed|2015-01-01T13:51:42.248818Z|
+---+-------------+-----------+---------------------------+



In [6]:
table_name = "minio_hudi_table_mor"
base_path = f"s3a://warehouse/hudi-db"

In [7]:
hudi_conf = {
    "hoodie.table.name": table_name,
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.storage.type": "MERGE_ON_READ",
    "hoodie.datasource.write.partitionpath.field": "creation_date",
    "hoodie.datasource.write.precombine.field": "last_update_time",
    "hoodie.datasource.write.operation": "insert",
    "hoodie.write.markers.type": "DIRECT",
    "hoodie.datasource.write.hive_style_partitioning": "true"
}

In [8]:
# Write the DataFrame to a Hudi COW table
inputDF.write \
    .format("hudi") \
    .options(**hudi_conf) \
    .mode("overwrite") \
    .save(f"{base_path}/{table_name}")

25/08/12 09:13:34 WARN DataSourceOptionsHelper$: hoodie.datasource.write.storage.type is deprecated and will be removed in a later release; Please use hoodie.datasource.write.table.type
25/08/12 09:13:34 WARN DataSourceOptionsHelper$: hoodie.datasource.write.storage.type is deprecated and will be removed in a later release; Please use hoodie.datasource.write.table.type
25/08/12 09:13:35 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to hudi-db/minio_hudi_table_mor/.hoodie/metadata/files/.files-0000-0_00000000000000000.log.1_0-0-0. This is unsupported




In [9]:
print(spark.sparkContext._conf.get("spark.jars"))

/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar,/opt/spark/jars/hadoop-aws-3.3.4.jar,/opt/spark/jars/aws-java-sdk-bundle-1.12.734.jar
