In [5]:
%session_id_prefix customer_cdc_upsert_iceberg_01
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg",
  "--job-bookmark-option": "job-bookmark-enable",
  "--JOB_NAME": "customer_cdc_upsert_iceberg"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.4 
Setting session ID prefix to customer_cdc_upsert_iceberg_01
Setting Glue version to: 3.0
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions', '--datalake-formats': 'iceberg', '--job-bookmark-option': 'job-bookmark-enable', '--JOB_NAME': 'customer_cdc_upsert_iceberg'}


In [1]:
from awsglue.job import Job

Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Session ID: d1ca5e1f-3377-4eb6-9006-edf98589ddeb
Applying the following default arguments:
--glue_kernel_version 1.0.4
--enable-glue-datacatalog true
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
--datalake-formats iceberg
--job-bookmark-option job-bookmark-enable
--JOB_NAME customer_cdc_upsert_iceberg
Waiting for session d1ca5e1f-3377-4eb6-9006-edf98589ddeb to get into ready status...
Session d1ca5e1f-3377-4eb6-9006-edf98589ddeb has been created.



In [4]:
catalog_name = "glue_catalog"
bucket_name = "chiholee-datalake001"
database_name = "ecommerce"

table_name = "customer"
pk = 'customer_id'
last_update_time = 'last_update_time'

source_bucket_prefix = "transaction/cdc/raw"
source_path = f"s3://{bucket_name}/{source_bucket_prefix}"
source_table_name = table_name

iceberg_bucket_prefix = "transaction/iceberg"
warehouse_path = f"s3://{bucket_name}/{iceberg_bucket_prefix}"
iceberg_table_name = f"{table_name}_cdc_glue_iceberg"




In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .config(f"spark.sql.catalog.{catalog_name}", "org.apache.iceberg.spark.SparkCatalog") \
    .config(f"spark.sql.catalog.{catalog_name}.warehouse", f"{warehouse_path}") \
    .config(f"spark.sql.catalog.{catalog_name}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog") \
    .config(f"spark.sql.catalog.{catalog_name}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") \
    .config("spark.sql.extensions","org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .getOrCreate()




In [6]:
import sys
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions


glueContext = GlueContext(spark)

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
job = Job(glueContext)
job.init(args['JOB_NAME'], args)





In [7]:
cdcDyf = glueContext.create_dynamic_frame_from_options(
    connection_type='s3',
    connection_options={
        'paths': [f'{source_path}/{database_name}/{source_table_name}/'],
        'groupFiles': 'none',
        'recurse': True
    },
    format='parquet',
    transformation_ctx='cdcDyf')




In [8]:
print(f"## Count of CDC data after last job bookmark:{cdcDyf.count()}")

## Count of CDC data after last job bookmark:0


In [9]:
cdcDf = cdcDyf.toDF()




In [10]:
cdcDf.show()

++
||
++
++


In [11]:
import sys
from pyspark.sql import Window
from pyspark.sql import functions as F 




In [12]:
cdcDf.createOrReplaceTempView("cdcDf")




In [13]:
cdcDf = spark.sql("""
select *
from cdcDf
where (customer_id, last_update_time) in
(
    select customer_id, max(last_update_time) max_op_time
    from cdcDf
    group by customer_id
)
"""
)

AnalysisException: cannot resolve '`customer_id`' given input columns: []; line 4 pos 7;
'Project [*]
+- 'Filter named_struct(customer_id, 'customer_id, last_update_time, 'last_update_time) IN (list#3 [])
   :  +- 'Aggregate ['customer_id], ['customer_id, 'max('last_update_time) AS max_op_time#2]
   :     +- 'UnresolvedRelation [cdcDf], [], false
   +- SubqueryAlias cdcdf
      +- LogicalRDD false



In [14]:
cdcInsertCount = cdcDf.filter("Op = 'I'").count()
cdcUpdateCount = cdcDf.filter("Op = 'U'").count()
cdcDeleteCount = cdcDf.filter("Op = 'D'").count()
print(f"Inserted count: {cdcInsertCount}")
print(f"Updated count: {cdcUpdateCount}")
print(f"Deleted count: {cdcDeleteCount}")
print(f"Total CDC count: {cdcDf.count()}")

AnalysisException: cannot resolve '`Op`' given input columns: []; line 1 pos 0;
'Filter ('Op = I)
+- LogicalRDD false



In [15]:
dropColumnList = ['Op','dms_update_time']




In [16]:
from datetime import datetime
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import concat, col, lit, to_timestamp

current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
cdcDf = cdcDf.withColumn('order_dt',to_timestamp(col('order_dt')))
cdcDf = (cdcDf
      .withColumn('year', year(col('order_dt')))
      .withColumn('month', month(col('order_dt')))
      .withColumn('day', dayofmonth(col('order_dt')))
     )
cdcDf = cdcDf.withColumn('last_applied_date',to_timestamp(lit(current_datetime)))



AnalysisException: cannot resolve '`order_dt`' given input columns: [];
'Project [to_timestamp('order_dt, None) AS order_dt#4]
+- LogicalRDD false



In [None]:
spark.sql(f"CREATE DATABASE IF NOT EXISTS {catalog_name}.{database_name}")
existing_tables = spark.sql(f"SHOW TABLES IN {catalog_name}.{database_name};")
df_existing_tables = existing_tables.select('tableName').rdd.flatMap(lambda x:x).collect()

In [None]:
upsertDf = cdcDf.filter("Op != 'D'").drop(*dropColumnList)
upsertDf.createOrReplaceTempView(f"{source_table_name}_upsert")

In [None]:
upsertDf.show()

In [6]:
# spark.sql(f"""
# select order_id, count(*)
# from {catalog_name}.{database_name}.{iceberg_table_name}
# group by order_id
# having count(*) > 1
# """).show()

In [7]:
# spark.sql(f"""
# select order_id, count(*)
# from {source_table_name}_upsert
# group by order_id
# having count(*) > 1
# """).show()

In [17]:
deleteDf = cdcDf.filter("Op = 'D'").drop(*dropColumnList)
deleteDf.createOrReplaceTempView(f"{source_table_name}_delete")

AnalysisException: cannot resolve '`Op`' given input columns: []; line 1 pos 0;
'Filter ('Op = D)
+- LogicalRDD false



In [8]:
# deleteDf.show()

In [18]:
print(f"Table {source_table_name}_iceberg is upserting...")
spark.sql(f"""MERGE INTO {catalog_name}.{database_name}.{iceberg_table_name} t
    USING {source_table_name}_upsert s ON s.{pk} = t.{pk}
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
    """)

AnalysisException: Table or view not found: customer_upsert; line 1 pos 0;
'MergeIntoTable ('s.customer_id = 't.customer_id), [updateaction(None)], [insertaction(None)]
:- SubqueryAlias t
:  +- SubqueryAlias glue_catalog.ecommerce.customer_cdc_iceberg
:     +- RelationV2[customer_id#5L, password#6, last_login#7, is_superuser#8, username#9, first_name#10, last_name#11, email#12, is_staff#13, is_active#14, date_joined#15, phone_number#16, age#17, gender#18, address#19, last_update_time#20, name#21, year#22, month#23, day#24, last_applied_date#25] glue_catalog.ecommerce.customer_cdc_iceberg
+- 'SubqueryAlias s
   +- 'UnresolvedRelation [customer_upsert], [], false



In [19]:
spark.sql(f"""
select min(last_update_time), max(last_update_time)
from {catalog_name}.{database_name}.{iceberg_table_name}
""").show()


+---------------------+---------------------+
|min(last_update_time)|max(last_update_time)|
+---------------------+---------------------+
|  2023-04-08 11:22:14|  2023-04-12 00:34:33|
+---------------------+---------------------+
