In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, DoubleType, IntegerType
from pyspark.sql.functions import sum as _sum, avg, expr, window, from_unixtime, col

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import time, datetime
import os
import random
import uuid

spark = (
    SparkSession.builder.appName("spark application")
    .config(
        "spark.jars.packages",
        "org.apache.hudi:hudi-spark3.1-bundle_2.12:0.13.0,"
        "org.apache.spark:spark-avro_2.12:3.1.1,"
        "org.apache.hadoop:hadoop-aws:3.1.1,"
        "com.amazonaws:aws-java-sdk:1.11.271,"
    )
    .config(
        "spark.sql.extensions",
        "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
    )
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config(
        "spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"
    )
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .config("spark.hadoop.fs.s3a.connection.maximum", "1000")
    .getOrCreate()
)


23/06/23 08:54:53 WARN Utils: Your hostname, ducdn resolves to a loopback address: 127.0.1.1; using 10.1.124.58 instead (on interface enp1s0)
23/06/23 08:54:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ducdn/Documents/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ducdn/.ivy2/cache
The jars for the packages stored in: /home/ducdn/.ivy2/jars
org.apache.hudi#hudi-spark3.1-bundle_2.12 added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7ac29893-2c28-4c95-b755-f953199f7d03;1.0
	confs: [default]
	found org.apache.hudi#hudi-spark3.1-bundle_2.12;0.13.0 in central
	found org.apache.spark#spark-avro_2.12;3.1.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-aws;3.1.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.271 in central
	found com.amazonaws#aws-java-sdk;1.11.271 in central
	found com.amazonaws#aws-java-sdk-transcribe;1.11.271 in central
	found com.amazonaws#aws-java-sdk-core;1.11.271 in central
	found commons-logging#commons-logging;1.1.3 in central
	found org.apache.h

In [3]:
hudi_table_name = "hudi_bulk_insert"
hudi_operation = "BULK_INSERT"
hudi_path = "/home/ducdn/Desktop/workspace/hudi_mor/hudi_mor_bulk_insert"
# hudi_path = "s3a://datalake/hudi_mor"

In [4]:

hoodie_options = {
    "hoodie.table.name": f"{hudi_table_name}",
    "hoodie.metadata.enable": "true",
    "hoodie.table.type": "MERGE_ON_READ",
    "hoodie.datasource.write.table.type": "MERGE_ON_READ",
    "hoodie.datasource.write.operation": hudi_operation,
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.partitionpath.field": "partition",
    "hoodie.datasource.write.table.name": f"{hudi_table_name}",
    "hoodie.datasource.write.precombine.field": "timestamp",
    "hoodie.clean.automatic": "true",
    "hoodie.cleaner.policy": "KEEP_LATEST_FILE_VERSIONS",
    "hoodie.cleaner.fileversions.retained": 8,
    "hoodie.compact.inline": "true",
    "hoodie.compact.inline.max.delta.commits": 3,
    # "hoodie.datasource.write.hive_style_partitioning": "true",
    # "hoodie.datasource.hive_sync.enable": "true",
    # "hoodie.datasource.hive_sync.mode": "hms",
    # "hoodie.datasource.hive_sync.database": "default",
    # "hoodie.datasource.hive_sync.table": f"hudi_mor",
    # "hoodie.datasource.hive_sync.partition_fields": "partition",
    # "hoodie.datasource.hive_sync.partition_extractor_class": 
    #     "org.apache.hudi.hive.MultiPartKeysValueExtractor",
    # "hoodie.datasource.hive_sync.metastore.uris": 
    #     "thrift://hive-metastore:9083"
}

In [5]:
schema = StructType([
    StructField("id", StringType()),
    StructField("value", StringType()),
    StructField("timestamp", IntegerType()),
    StructField("partition", StringType())
])

#### Write Hudi Mor

In [9]:
def current_timestamp():
    current_time = int(time.time())
    return current_time

def gen_data():
    data = []
    for i in range(0, 10):
        i = uuid.uuid4()
        value = random.randint(0, 1000000)
        timestamp = current_timestamp()
        partition = random.randint(1,2)
        # if partition != 2: continue
        x = (str(i), value, timestamp, partition)
        data.append(x)
    return data

while True:
    print("="* 20 + "START GEN DATA" + "="*20)
    data = gen_data()
    stream_df = spark.createDataFrame(data, schema)
    # stream_df = stream_df.withColumn("timestamp", current_timestamp())

    stream_df.show(n=2, truncate=False)

    stream_df.write.format("hudi") \
        .options(**hoodie_options) \
        .mode("append") \
        .save(hudi_path)

    print(20*"-" + "STARTING WRITE HUDI TABLE" + "-"*20)

    print(20*"-" + "DONE" + "-"*20)

    print(f"================== TIME SLEEP ==============")
    time.sleep(10)



+------------------------------------+------+----------+---------+
|id                                  |value |timestamp |partition|
+------------------------------------+------+----------+---------+
|2473ae90-16a8-478d-95de-3234e9a0fc90|361321|1687485638|1        |
|96e4481b-45fb-4e23-907e-fd2517cbeb01|754129|1687485638|1        |
+------------------------------------+------+----------+---------+
only showing top 2 rows



23/06/23 09:00:38 WARN HoodieBackedTableMetadata: Metadata table was not found at path /home/ducdn/Desktop/workspace/hudi_mor/hudi_mor_bulk_insert/.hoodie/metadata
23/06/23 09:00:38 WARN HoodieBackedTableMetadata: Metadata table was not found at path /home/ducdn/Desktop/workspace/hudi_mor/hudi_mor_bulk_insert/.hoodie/metadata
23/06/23 09:00:38 WARN HoodieBackedTableMetadata: Metadata table was not found at path /home/ducdn/Desktop/workspace/hudi_mor/hudi_mor_bulk_insert/.hoodie/metadata
23/06/23 09:00:38 WARN HoodieBackedTableMetadata: Metadata table was not found at path /home/ducdn/Desktop/workspace/hudi_mor/hudi_mor_bulk_insert/.hoodie/metadata
23/06/23 09:00:38 WARN HoodieWriteConfig: Embedded timeline server is disabled, fallback to use direct marker type for spark


--------------------STARTING WRITE HUDI TABLE--------------------
--------------------DONE--------------------
+------------------------------------+------+----------+---------+
|id                                  |value |timestamp |partition|
+------------------------------------+------+----------+---------+
|43734f71-2923-469e-8367-53dd13de3b34|289153|1687485649|1        |
|3a2a7f9b-8e4d-47d5-8ba6-5eb1184b9c53|479477|1687485649|2        |
+------------------------------------+------+----------+---------+
only showing top 2 rows



23/06/23 09:00:50 WARN HoodieWriteConfig: Embedded timeline server is disabled, fallback to use direct marker type for spark


--------------------STARTING WRITE HUDI TABLE--------------------
--------------------DONE--------------------
+------------------------------------+------+----------+---------+
|id                                  |value |timestamp |partition|
+------------------------------------+------+----------+---------+
|1d863926-e79b-43eb-b6fe-273be10c8a14|146338|1687485661|2        |
|4b80ff7f-ba56-428a-a5c4-2bbc82dd9e00|247235|1687485661|2        |
+------------------------------------+------+----------+---------+
only showing top 2 rows



23/06/23 09:01:01 WARN HoodieWriteConfig: Embedded timeline server is disabled, fallback to use direct marker type for spark
23/06/23 09:01:03 WARN BaseHoodieCompactionPlanGenerator: No operations are retrieved for /home/ducdn/Desktop/workspace/hudi_mor/hudi_mor_bulk_insert


--------------------STARTING WRITE HUDI TABLE--------------------
--------------------DONE--------------------
+------------------------------------+------+----------+---------+
|id                                  |value |timestamp |partition|
+------------------------------------+------+----------+---------+
|741844d9-5052-4587-bcbd-3d2e3cd283be|914230|1687485673|1        |
|e8c1f878-55c6-4a87-9273-cd2a72fedd64|968879|1687485673|1        |
+------------------------------------+------+----------+---------+
only showing top 2 rows



23/06/23 09:01:13 WARN HoodieWriteConfig: Embedded timeline server is disabled, fallback to use direct marker type for spark
23/06/23 09:01:14 WARN BaseHoodieCompactionPlanGenerator: No operations are retrieved for /home/ducdn/Desktop/workspace/hudi_mor/hudi_mor_bulk_insert


--------------------STARTING WRITE HUDI TABLE--------------------
--------------------DONE--------------------
+------------------------------------+------+----------+---------+
|id                                  |value |timestamp |partition|
+------------------------------------+------+----------+---------+
|a484d77f-a9f2-4c1e-846e-fcc6069a3cae|769279|1687485684|1        |
|194840ce-da1e-4e45-bc1a-ef82973207e9|670046|1687485684|2        |
+------------------------------------+------+----------+---------+
only showing top 2 rows



23/06/23 09:01:25 WARN HoodieWriteConfig: Embedded timeline server is disabled, fallback to use direct marker type for spark
23/06/23 09:01:26 WARN BaseHoodieCompactionPlanGenerator: No operations are retrieved for /home/ducdn/Desktop/workspace/hudi_mor/hudi_mor_bulk_insert


--------------------STARTING WRITE HUDI TABLE--------------------
--------------------DONE--------------------
+------------------------------------+------+----------+---------+
|id                                  |value |timestamp |partition|
+------------------------------------+------+----------+---------+
|3763ff9b-c5cd-451b-8de4-fc38fbb3da91|230089|1687485696|2        |
|95b211c7-9382-4300-8cbf-43feee9209ea|141415|1687485696|2        |
+------------------------------------+------+----------+---------+
only showing top 2 rows



23/06/23 09:01:37 WARN HoodieWriteConfig: Embedded timeline server is disabled, fallback to use direct marker type for spark


KeyboardInterrupt: 

23/06/23 09:01:38 WARN BaseHoodieCompactionPlanGenerator: No operations are retrieved for /home/ducdn/Desktop/workspace/hudi_mor/hudi_mor_bulk_insert


#### Update

In [24]:

data = [("fd6be044-0e71-4268-b9ae-8ac0a5sqwer", 3306, current_timestamp(), 2)]
stream_df = spark.createDataFrame(data, schema)


stream_df.write.format("hudi") \
    .options(**hoodie_options) \
    .mode("append") \
    .save(hudi_path)

23/06/20 10:35:31 WARN HoodieWriteConfig: Embedded timeline server is disabled, fallback to use direct marker type for spark
23/06/20 10:35:32 WARN HoodieWriteConfig: Embedded timeline server is disabled, fallback to use direct marker type for spark
23/06/20 10:35:34 WARN BaseHoodieCompactionPlanGenerator: No operations are retrieved for /home/ducdn/Desktop/workspace/hudi_mor/hudi_mor
