In [1]:
!python --version

Python 3.11.4


In [2]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Create SparkSession with Hudi configuration

spark = SparkSession.builder \
    .appName("HudiJob") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/02 17:06:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [60]:
from pyspark.sql.functions import monotonically_increasing_id

csv_file_path = "file:///home/sparkuser/app/online_retail.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)
df.show()

df_with_record_id = df.withColumn("RecordId",(F.floor(F.rand() * 900000) + 100000).cast("int"))
# Generates a number between 100000 and 999999

df_with_record_id.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|01/12/10 08:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|01/12/10 08:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|01/12/10 08:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|01/12/10 08:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|01/12/10 08:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|01/12/10 08:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|01/12/10 08:26|     4.

In [77]:
# Print schema
df_with_record_id.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- RecordId: integer (nullable = true)



In [51]:
df_with_id.describe().show()

24/10/02 20:09:13 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-----------------+------------------+--------------------+------------------+--------------+-----------------+------------------+-----------+--------------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|   InvoiceDate|        UnitPrice|        CustomerID|    Country|                  Id|
+-------+-----------------+------------------+--------------------+------------------+--------------+-----------------+------------------+-----------+--------------------+
|  count|           541909|            541909|              540455|            541909|        541909|           541909|            406829|     541909|              541909|
|   mean| 559965.752026781|27623.240210938104|             20713.0|  9.55224954743324|          NULL|4.611113626089718|15287.690570239585|       NULL|2.716283167967637...|
| stddev|13428.41728079611|16799.737628427665|                NULL|218.08115785023426|          NULL|96.75985306117964|1713.6003033215954|  

                                                                                

In [66]:
df_filtered = df_with_record_id.filter(df_with_record_id["RecordId"].isNull())
df_filtered.show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+--------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|RecordId|
+---------+---------+-----------+--------+-----------+---------+----------+-------+--------+
+---------+---------+-----------+--------+-----------+---------+----------+-------+--------+



In [None]:
df_filtered.count()
df_with_record_id.show(truncate=False)  # Show full contents
df_with_record_id.filter(df_with_record_id.RecordId.isNull() | (df_with_record_id.RecordId == "")).show()

In [None]:
hudi_options = {
    'hoodie.table.name':'my_hudi_table',
    'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
    'hoodie.datasource.write.recordkey.field': 'RecordId',
    'hoodie.datasource.write.precombine.field': 'InvoiceDate',
    'hoodie.datasource.write.partitionpath.field': 'InvoiceDate',  # Consider changing this
    'hoodie.datasource.write.table.name': 'my_hudi_table',
    'hoodie.datasource.hive_sync.enable': 'false',
    'hoodie.datasource.write.operation': 'upsert',
}

# Sample Hudi write code
df_with_record_id.write.format("hudi").options(**hudi_options).mode("overwrite").save("s3a://hudi-minio-bucket/my_hudi_table")

24/10/02 20:27:38 WARN HoodieSparkSqlWriterInternal: hoodie table at s3a://hudi-minio-bucket/my_hudi_table already exists. Deleting existing data & overwriting with new data.

In [6]:
hudi_read_options = {
    'hoodie.datasource.query.type': 'snapshot'
}

# Reading the Hudi table from the base path
hudi_df = spark.read.format("hudi").options(**hudi_read_options).load("s3a://hudi-minio-bucket/my_hudi_table")

# Display the data
hudi_df.show()



+-------------------+--------------------+------------------+----------------------+--------------------+---+-----+---+-------------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name| id| name|age|     curr_timestamp|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-----+---+-------------------+
|  20241002170635296|20241002170635296...|                 2|      1727888796133503|f71b70f6-80a4-41c...|  2|  Bob| 30|1970-01-01 00:00:00|
|  20241002170635296|20241002170635296...|                 1|      1727888796133503|f71b70f6-80a4-41c...|  1|Alice| 24|1970-01-01 00:00:00|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-----+---+-------------------+



                                                                                

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|01/12/10 08:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|01/12/10 08:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|01/12/10 08:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|01/12/10 08:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|01/12/10 08:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|01/12/10 08:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|01/12/10 08:26|     4.

In [36]:
!ls

event_logs	   online_retail.xlsx	   spark-warehouse
online_retail.csv  spark-hudi-minio.ipynb


In [37]:
!pwd

/home/sparkuser/app
