In [1]:
# Shell escape: print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.11.4


In [2]:
import os

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Create a SparkSession configured for Hudi and an S3A object store.
#
# The S3A credentials are read from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY
# environment variables so that real secrets do not have to be committed with
# the notebook. When the variables are unset, the values this notebook used
# previously are applied, so an existing local setup keeps working unchanged.
s3_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
s3_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("HudiJob")
    # S3A connection: credentials, endpoint, path-style addressing, no SSL.
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Kryo serializer plus the Hudi SQL extension and catalog classes.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/02 17:06:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql import Row
from pyspark.sql.functions import lit
# Two sample records, turned into a DataFrame with inferred column types.
data = [
    Row(id=person_id, name=person_name, age=person_age)
    for person_id, person_name, person_age in ((1, "Alice", 24), (2, "Bob", 30))
]
df = spark.createDataFrame(data)

# Add a column holding the timestamp at which the query is evaluated,
# then print the result.
timestamp_column = f.current_timestamp()
dfWithTimestamp = df.withColumn("curr_timestamp", timestamp_column)
dfWithTimestamp.show()

                                                                                

+---+-----+---+--------------------+
| id| name|age|      curr_timestamp|
+---+-----+---+--------------------+
|  1|Alice| 24|2024-10-02 17:06:...|
|  2|  Bob| 30|2024-10-02 17:06:...|
+---+-----+---+--------------------+



In [4]:
# Hudi write configuration for the sample table.
hudi_options = {
    'hoodie.table.name': 'my_hudi_table',
    # The trailing comma here was missing, which made the whole dict literal
    # a SyntaxError (two adjacent string literals followed by a second ':').
    'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
    'hoodie.datasource.write.recordkey.field': 'id',
    'hoodie.datasource.write.precombine.field': 'curr_timestamp',
    # NOTE(review): partitioning on the raw timestamp yields a partition path
    # equal to the numeric timestamp value, and the read-back output in this
    # notebook shows curr_timestamp as 1970-01-01 00:00:00. Presumably a
    # coarser, string-typed column (e.g. a derived date) is the better
    # partition key - TODO confirm and change.
    'hoodie.datasource.write.partitionpath.field': 'curr_timestamp',
    'hoodie.datasource.write.table.name': 'my_hudi_table',
    'hoodie.datasource.hive_sync.enable': 'false',
    'hoodie.datasource.write.operation': 'upsert',
}

# Write the DataFrame as a Hudi table. With mode("overwrite"), data already
# present at the target path is deleted and replaced (see the writer's
# "already exists" warning in the cell output).
(
    dfWithTimestamp.write
    .format("hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save("s3a://hudi-minio-bucket/my_hudi_table")
)

24/10/02 17:06:33 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/10/02 17:06:34 WARN DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf
24/10/02 17:06:34 WARN DFSPropertiesConfiguration: Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file
24/10/02 17:06:34 WARN DataSourceOptionsHelper$: hoodie.datasource.write.storage.type is deprecated and will be removed in a later release; Please use hoodie.datasource.write.table.type
24/10/02 17:06:34 WARN DataSourceOptionsHelper$: hoodie.datasource.write.storage.type is deprecated and will be removed in a later release; Please use hoodie.datasource.write.table.type
24/10/02 17:06:34 WARN HoodieSparkSqlWriterInternal: hoodie table at s3a://hudi-minio-bucket/my_hudi_table already exists. Deleting existing data & overwriting with new data.
24/10/02 17:06:37 WARN S3ABlockOutputStr



24/10/02 17:06:46 WARN HoodieSparkSqlWriterInternal: Closing write client       


In [6]:
# Read options: request the 'snapshot' query type.
hudi_read_options = dict([('hoodie.datasource.query.type', 'snapshot')])

# Load the Hudi table from its base path using the options above.
hudi_reader = spark.read.format("hudi").options(**hudi_read_options)
hudi_df = hudi_reader.load("s3a://hudi-minio-bucket/my_hudi_table")

# Print the loaded rows, including the _hoodie_* metadata columns.
hudi_df.show()



+-------------------+--------------------+------------------+----------------------+--------------------+---+-----+---+-------------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name| id| name|age|     curr_timestamp|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-----+---+-------------------+
|  20241002170635296|20241002170635296...|                 2|      1727888796133503|f71b70f6-80a4-41c...|  2|  Bob| 30|1970-01-01 00:00:00|
|  20241002170635296|20241002170635296...|                 1|      1727888796133503|f71b70f6-80a4-41c...|  1|Alice| 24|1970-01-01 00:00:00|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-----+---+-------------------+

