# Configurations: Spark-Hudi-S3

In [None]:
from typing import *

from pyspark import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Hudi Table") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.jars.packages", "org.apache.hudi:hudi-spark3.4-bundle_2.12:0.14.0,org.apache.hadoop:hadoop-aws:3.2.4,com.amazonaws:aws-java-sdk:1.12.262") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog") \
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.profile.ProfileCredentialsProvider") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()

print("Spark Running")

s3_path = "s3a://my-bucket/dip"

# Access SparkContext
sc = spark.sparkContext

# CREATE HUDI TABLE: Spark SQL

In [12]:
spark.sql(
    """CREATE TABLE customer_data 
        (InvoiceNo STRING, StockCode STRING, Description STRING, Quantity STRING, InvoiceDate STRING, UnitPrice STRING,
        CustomerID STRING, Country STRING)
       USING HUDI 
       LOCATION 's3a://my-bucket/dip/demo'"""
);

# INGEST DATA

In [13]:
spark.sql(
    """CREATE OR REPLACE TEMPORARY VIEW cust_temp USING csv
            OPTIONS (path "online_retail.csv", header true)"""
)

DataFrame[]

In [14]:
spark.sql("INSERT INTO customer_data SELECT * FROM cust_temp")

24/02/20 11:13:20 WARN AutoRecordKeyGenerationUtils$: Precombine field  will be ignored with auto record key generation enabled
                                                                                

DataFrame[]

In [15]:
spark.sql("SELECT COUNT(*) FROM customer_data").toPandas()

                                                                                

Unnamed: 0,count(1)
0,1083818


In [24]:
spark.sql("SELECT * FROM customer_data LIMIT 10").toPandas()

                                                                                

Unnamed: 0,_hoodie_commit_time,_hoodie_commit_seqno,_hoodie_record_key,_hoodie_partition_path,_hoodie_file_name,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,20240212153945516,20240212153945516_0_0,20240212153945516_0_0,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850,United Kingdom
1,20240212153945516,20240212153945516_0_1,20240212153945516_0_7,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536366,22633,HAND WARMER UNION JACK,6,12/1/10 8:28,1.85,17850,United Kingdom
2,20240212153945516,20240212153945516_0_2,20240212153945516_0_10,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536367,22745,POPPY'S PLAYHOUSE BEDROOM,6,12/1/10 8:34,2.1,13047,United Kingdom
3,20240212153945516,20240212153945516_0_3,20240212153945516_0_11,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536367,22748,POPPY'S PLAYHOUSE KITCHEN,6,12/1/10 8:34,2.1,13047,United Kingdom
4,20240212153945516,20240212153945516_0_4,20240212153945516_0_15,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536367,22623,BOX OF VINTAGE JIGSAW BLOCKS,3,12/1/10 8:34,4.95,13047,United Kingdom
5,20240212153945516,20240212153945516_0_5,20240212153945516_0_19,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536367,21777,RECIPE BOX WITH METAL HEART,4,12/1/10 8:34,7.95,13047,United Kingdom
6,20240212153945516,20240212153945516_0_6,20240212153945516_0_27,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536370,22727,ALARM CLOCK BAKELIKE RED,24,12/1/10 8:45,3.75,12583,France
7,20240212153945516,20240212153945516_0_7,20240212153945516_0_35,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536370,22629,SPACEBOY LUNCH BOX,24,12/1/10 8:45,1.95,12583,France
8,20240212153945516,20240212153945516_0_8,20240212153945516_0_36,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536370,22659,LUNCH BOX I LOVE LONDON,24,12/1/10 8:45,1.95,12583,France
9,20240212153945516,20240212153945516_0_9,20240212153945516_0_39,,6cd84a53-80a3-44a1-847a-35e72424b918-0_0-14-43...,536370,21731,RED TOADSTOOL LED NIGHT LIGHT,24,12/1/10 8:45,1.65,12583,France


# Run Clustering

In [17]:
spark.sql("""
    CALL run_clustering(
        table => 'customer_data',
        op => 'scheduleandexecute',
        options => 'hoodie.clustering.async.max.commits=4,
                    hoodie.clustering.plan.strategy.small.file.limit=629145600,
                    hoodie.clustering.plan.strategy.sort.columns=Country,
                    hoodie.write.lock.filesystem.path = file://tmp/hudilock/dip_hudi,
                    hoodie.fs.atomic_creation.support = s3a,
                    hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824'
    )
""").show()

                                                                                

+-----------------+----------------+---------+-------------------+
|        timestamp|input_group_size|    state|involved_partitions|
+-----------------+----------------+---------+-------------------+
|20240220114608983|               1|COMPLETED|                  *|
+-----------------+----------------+---------+-------------------+

