In [5]:
%session_id_prefix orders_cdc_upsert_iceberg_01
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg",
  "--job-bookmark-option": "job-bookmark-enable",
  "--JOB_NAME": "orders_cdc_upsert_iceberg"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.4 
Setting session ID prefix to orders_cdc_upsert_iceberg_01
Setting Glue version to: 3.0
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions', '--datalake-formats': 'iceberg', '--job-bookmark-option': 'job-bookmark-enable', '--JOB_NAME': 'orders_cdc_upsert_iceberg'}


In [1]:
from awsglue.job import Job

Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Session ID: 474053ef-fb15-46bb-b543-2020dec32c12
Applying the following default arguments:
--glue_kernel_version 1.0.4
--enable-glue-datacatalog true
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
--datalake-formats iceberg
--job-bookmark-option job-bookmark-enable
--JOB_NAME orders_cdc_upsert_iceberg
Waiting for session 474053ef-fb15-46bb-b543-2020dec32c12 to get into ready status...
Session 474053ef-fb15-46bb-b543-2020dec32c12 has been created.



In [15]:
# --- Catalog, bucket and database that every later cell refers to ---
catalog_name = "glue_catalog"
bucket_name = "chiholee-datalake001"
database_name = "ecommerce"
bucket_uri = f"s3://{bucket_name}"

# --- Table being processed, its key column and its "latest version" column ---
table_name = "orders"
source_table_name = table_name
pk = 'order_id'
last_update_time = 'order_dt'

# --- Where the raw CDC files are read from ---
source_bucket_prefix = "transaction/cdc/raw"
source_path = f"{bucket_uri}/{source_bucket_prefix}"

# --- Where the Iceberg warehouse lives and what the target table is called ---
iceberg_bucket_prefix = "transaction/iceberg/glue"
warehouse_path = f"{bucket_uri}/{iceberg_bucket_prefix}"
iceberg_table_name = f"{table_name}_cdc_glue_iceberg"




In [16]:
from pyspark.sql import SparkSession
# Session settings: an Iceberg SparkCatalog registered under `catalog_name`
# (Glue catalog implementation, S3 file IO, warehouse location) plus the
# Iceberg SQL extensions.
iceberg_catalog_key = f"spark.sql.catalog.{catalog_name}"
spark_settings = [
    (iceberg_catalog_key, "org.apache.iceberg.spark.SparkCatalog"),
    (f"{iceberg_catalog_key}.warehouse", str(warehouse_path)),
    (f"{iceberg_catalog_key}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"),
    (f"{iceberg_catalog_key}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"),
    ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"),
]

# Apply the settings in the order listed, then create (or reuse) the session.
session_builder = SparkSession.builder
for setting_key, setting_value in spark_settings:
    session_builder = session_builder.config(setting_key, setting_value)
spark = session_builder.getOrCreate()




In [17]:
import sys
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions


# GlueContext is documented to wrap a SparkContext, so pass the context that
# backs the Iceberg-configured session instead of the SparkSession object.
glueContext = GlueContext(spark.sparkContext)

# JOB_NAME is read from the resolved command-line options in sys.argv.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# NOTE(review): no job.commit() is visible in this part of the notebook;
# confirm a later cell calls it if job-bookmark state is expected to advance.





In [18]:
# S3 location of the raw CDC files for this database/table, read recursively
# and with file grouping turned off.
cdc_source_options = {
    'paths': [f'{source_path}/{database_name}/{source_table_name}/'],
    'groupFiles': 'none',
    'recurse': True,
}

# Load the parquet files as a Glue DynamicFrame.
cdcDyf = glueContext.create_dynamic_frame_from_options(
    connection_type='s3',
    connection_options=cdc_source_options,
    format='parquet',
    transformation_ctx='cdcDyf',
)




In [19]:
# Count the loaded CDC records, then report the number.
cdc_record_count = cdcDyf.count()
print(f"## Count of CDC data after last job bookmark:{cdc_record_count}")

## Count of CDC data after last job bookmark:132037


In [20]:
# Convert the Glue DynamicFrame into a Spark DataFrame for the steps below.
cdcDf = cdcDyf.toDF()




In [21]:
# Print a preview of the CDC rows.
cdcDf.show()

+---+--------------------+--------+--------+---------+-----------+-------------------+-----------+----------+
| Op|     dms_update_time|order_id|promo_id|order_cnt|order_price|           order_dt|customer_id|product_id|
+---+--------------------+--------+--------+---------+-----------+-------------------+-----------+----------+
|  I|2024-04-24 19:12:...|  297263| PROMO01|        5|      16000|2024-04-24 19:12:29|         51|         9|
|  I|2024-04-24 19:12:...|  297264| PROMO03|        2|      30000|2024-04-24 19:12:30|         77|        14|
|  I|2024-04-24 19:12:...|  297265| PROMO20|        6|      30000|2024-04-24 19:12:30|         38|         9|
|  I|2024-04-24 19:12:...|  297266| PROMO02|        8|      14000|2024-04-24 19:12:30|         92|         3|
|  I|2024-04-24 19:12:...|  297267| PROMO06|        9|      17000|2024-04-24 19:12:31|          9|        15|
|  I|2024-04-24 19:12:...|  297268| PROMO17|       10|      33000|2024-04-24 19:12:32|         86|         7|
|  U|2024-

In [22]:
import sys
from pyspark.sql import Window
from pyspark.sql import functions as F 




In [23]:
# Register the CDC DataFrame as the temp view "cdcDf" so it can be queried by name.
cdcDf.createOrReplaceTempView("cdcDf")




In [24]:
# Keep exactly one CDC row per primary key: the most recent one.
#
# Filtering on (key, max(timestamp)) keeps every row that ties on the max
# timestamp, so a key could survive more than once and the later MERGE would
# see several source rows for one target row. row_number() always yields a
# single winner per key. Ties on `last_update_time` are broken by
# `dms_update_time`, the other timestamp column present in the CDC rows.
cdc_rows = spark.table("cdcDf")  # read from the registered view so re-running this cell is safe

latest_first = Window.partitionBy(pk).orderBy(
    F.col(last_update_time).desc(),
    F.col("dms_update_time").desc(),
)

cdcDf = (
    cdc_rows
    .withColumn("_cdc_rank", F.row_number().over(latest_first))
    .filter(F.col("_cdc_rank") == 1)
    .drop("_cdc_rank")
)




In [25]:
# Count rows per Op value in a single Spark job instead of running one full
# count per operation plus another for the total.
op_counts = {row["Op"]: row["count"] for row in cdcDf.groupBy("Op").count().collect()}

# An Op value that does not occur has no group, so default it to 0.
cdcInsertCount = op_counts.get("I", 0)
cdcUpdateCount = op_counts.get("U", 0)
cdcDeleteCount = op_counts.get("D", 0)
print(f"Inserted count: {cdcInsertCount}")
print(f"Updated count: {cdcUpdateCount}")
print(f"Deleted count: {cdcDeleteCount}")
# Summing every group (including any unexpected or null Op) equals the row count.
print(f"Total CDC count: {sum(op_counts.values())}")

Inserted count: 116847
Updated count: 12887
Deleted count: 0
Total CDC count: 129734


In [26]:
# Columns removed from the CDC rows before they are exposed as the upsert/delete views.
dropColumnList = ['Op','dms_update_time']




In [27]:
from datetime import datetime
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import concat, col, lit, to_timestamp

# Driver-side wall-clock time, captured once and formatted to second precision.
current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Cast order_dt to a timestamp, derive year/month/day from it, and stamp each
# row with the captured time as last_applied_date.
order_dt_as_timestamp = to_timestamp(col('order_dt'))
applied_at = to_timestamp(lit(current_datetime))
cdcDf = (
    cdcDf
    .withColumn('order_dt', order_dt_as_timestamp)
    .withColumn('year', year(col('order_dt')))
    .withColumn('month', month(col('order_dt')))
    .withColumn('day', dayofmonth(col('order_dt')))
    .withColumn('last_applied_date', applied_at)
)






In [28]:
# Make sure the target database exists in the catalog.
target_namespace = f"{catalog_name}.{database_name}"
spark.sql(f"CREATE DATABASE IF NOT EXISTS {target_namespace}")

# Collect the names of the tables currently in that database as a plain list.
existing_tables = spark.sql(f"SHOW TABLES IN {target_namespace};")
df_existing_tables = [
    table_row['tableName']
    for table_row in existing_tables.select('tableName').collect()
]




In [29]:
# Rows whose Op is not 'D', minus the columns in dropColumnList, exposed as
# the "<source_table_name>_upsert" temp view.
non_delete_rows = cdcDf.filter("Op != 'D'")
upsertDf = non_delete_rows.drop(*dropColumnList)
upsert_view_name = f"{source_table_name}_upsert"
upsertDf.createOrReplaceTempView(upsert_view_name)




In [30]:
# Print a preview of the rows that will be upserted.
upsertDf.show()

+--------+--------+---------+-----------+-------------------+-----------+----------+----+-----+---+-------------------+
|order_id|promo_id|order_cnt|order_price|           order_dt|customer_id|product_id|year|month|day|  last_applied_date|
+--------+--------+---------+-----------+-------------------+-----------+----------+----+-----+---+-------------------+
|  297267| PROMO06|        9|      17000|2024-04-24 19:12:31|          9|        15|2024|    4| 24|2024-04-27 05:07:25|
|  297280| PROMO08|        2|      46000|2024-04-24 19:12:54|        100|         4|2024|    4| 24|2024-04-27 05:07:25|
|  297284| PROMO03|        5|      26000|2024-04-24 19:12:58|         40|        19|2024|    4| 24|2024-04-27 05:07:25|
|  301288| PROMO12|        8|      47000|2024-04-24 21:18:11|         19|         5|2024|    4| 24|2024-04-27 05:07:25|
|  301294| PROMO16|        5|      33000|2024-04-24 21:18:19|          4|         7|2024|    4| 24|2024-04-27 05:07:25|
|  301317| PROMO13|        3|      33000

In [6]:
# spark.sql(f"""
# select order_id, count(*)
# from {catalog_name}.{database_name}.{iceberg_table_name}
# group by order_id
# having count(*) > 1
# """).show()

In [7]:
# spark.sql(f"""
# select order_id, count(*)
# from {source_table_name}_upsert
# group by order_id
# having count(*) > 1
# """).show()

In [31]:
# Rows whose Op is 'D', minus the columns in dropColumnList, exposed as the
# "<source_table_name>_delete" temp view.
delete_rows = cdcDf.filter("Op = 'D'")
deleteDf = delete_rows.drop(*dropColumnList)
delete_view_name = f"{source_table_name}_delete"
deleteDf.createOrReplaceTempView(delete_view_name)




In [8]:
# deleteDf.show()

In [32]:
# Upsert the "<source_table_name>_upsert" view into the Iceberg table, matching
# rows on the primary-key column: matched rows are overwritten, unmatched rows
# are inserted.
target_table = f"{catalog_name}.{database_name}.{iceberg_table_name}"

# Log the table that is actually being merged into (previously the message
# printed "<source_table_name>_iceberg", which is not the target table's name).
print(f"Table {target_table} is upserting...")
spark.sql(f"""MERGE INTO {target_table} t
    USING {source_table_name}_upsert s ON s.{pk} = t.{pk}
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
    """)

Table orders_iceberg is upserting...
DataFrame[]


In [33]:
# Show the earliest and latest order_dt now present in the Iceberg table.
order_dt_range_query = f"""
select min(order_dt), max(order_dt)
from {catalog_name}.{database_name}.{iceberg_table_name}
"""
spark.sql(order_dt_range_query).show()


+-------------------+-------------------+
|      min(order_dt)|      max(order_dt)|
+-------------------+-------------------+
|2024-04-18 01:42:43|2024-04-27 05:05:28|
+-------------------+-------------------+
