In [15]:
%session_id_prefix orders_cdc_full_iceberg_01
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg",
  "--JOB_NAME": "orders_cdc_full_iceberg"
}

Setting session ID prefix to orders_cdc_full_iceberg_01


You are already connected to a glueetl session 476466d6-1a93-45d7-9f13-4b9d8e1879e3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 3.0


You are already connected to a glueetl session 476466d6-1a93-45d7-9f13-4b9d8e1879e3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is 60 minutes.
idle_timeout has been set to 60 minutes.


You are already connected to a glueetl session 476466d6-1a93-45d7-9f13-4b9d8e1879e3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions', '--datalake-formats': 'iceberg', '--JOB_NAME': 'orders_cdc_full_iceberg'}


In [53]:
# --- Glue Data Catalog / Iceberg identifiers ---
catalog_name = "glue_catalog"
bucket_name = "chiholee-datalake001"
database_name = "ecommerce"

# --- Source table and the timestamp column the partition columns are derived from ---
table_name = "orders"
last_update_time = "order_dt"

# --- S3 location of the source data ---
source_bucket_prefix = "transaction/initial/raw"
source_path = "s3://" + bucket_name + "/" + source_bucket_prefix
source_table_name = table_name

# --- S3 warehouse location and name of the target Iceberg table ---
iceberg_bucket_prefix = "transaction/iceberg/glue"
warehouse_path = "s3://{}/{}".format(bucket_name, iceberg_bucket_prefix)
iceberg_table_name = table_name + "_cdc_glue_iceberg"








In [54]:
from pyspark.sql import SparkSession

# Spark settings that register `catalog_name` as an Iceberg catalog backed by
# the AWS Glue Data Catalog (S3FileIO for storage) and enable the Iceberg SQL
# extensions. Order matches the order in which they are applied to the builder.
iceberg_conf_prefix = "spark.sql.catalog." + catalog_name
iceberg_session_conf = (
    (iceberg_conf_prefix, "org.apache.iceberg.spark.SparkCatalog"),
    (iceberg_conf_prefix + ".warehouse", warehouse_path),
    (iceberg_conf_prefix + ".catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"),
    (iceberg_conf_prefix + ".io-impl", "org.apache.iceberg.aws.s3.S3FileIO"),
    ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"),
)

session_builder = SparkSession.builder
for conf_key, conf_value in iceberg_session_conf:
    session_builder = session_builder.config(conf_key, conf_value)

# Reuses the already-running session if there is one, otherwise creates it.
spark = session_builder.getOrCreate()




In [55]:
# Run this cell with the Glue notebook in a saved state.
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

# GlueContext's constructor takes a SparkContext, so pass the context behind
# the session rather than the SparkSession object itself (passing the session
# only works by accident through attributes the two classes happen to share).
glueContext = GlueContext(spark.sparkContext)

# JOB_NAME is read from the process arguments; in this notebook it is supplied
# through the "--JOB_NAME" entry of the %%configure cell.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Initialise the Glue job wrapper with the resolved job name and arguments.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)





In [56]:
# Display the S3 prefix built from the source path, database and table names.
print("/".join((source_path, database_name, source_table_name)) + "/")

s3://chiholee-datalake001/transaction/initial/raw/ecommerce/orders/


In [57]:
# S3 read options for the source table prefix: file grouping is turned off and
# sub-folders under the prefix are read recursively.
source_read_options = {
    "paths": [f"{source_path}/{database_name}/{source_table_name}/"],
    "groupFiles": "none",
    "recurse": True,
}

# Load the Parquet files under the prefix into a Glue DynamicFrame.
fullDyf = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options=source_read_options,
    format="parquet",
    transformation_ctx="fullDyf",
)




In [58]:
# Count the loaded records first, then report the number.
loaded_record_count = fullDyf.count()
print("Count of data after last job bookmark:" + str(loaded_record_count))

Count of data after last job bookmark:288650


In [59]:
# Convert the Glue DynamicFrame into a Spark DataFrame so the Spark column
# functions and Spark SQL used later in the notebook can operate on it.
fullDf = fullDyf.toDF()




In [60]:
# Preview the loaded rows; show() with no arguments prints at most 20 rows.
fullDf.show()

+--------+--------+---------+-----------+-------------------+-----------+----------+
|order_id|promo_id|order_cnt|order_price|           order_dt|customer_id|product_id|
+--------+--------+---------+-----------+-------------------+-----------+----------+
|  192041| PROMO02|        8|      31000|2024-04-22 06:32:25|         70|         6|
|  249635| PROMO20|        7|      45000|2024-04-23 12:47:36|         22|        10|
|  199096| PROMO20|        9|      19000|2024-04-22 10:14:55|         85|         8|
|  281230| PROMO08|        8|      25000|2024-04-24 05:15:36|         96|         1|
|  252318| PROMO20|        3|      31000|2024-04-23 14:11:15|         30|        14|
|  212103| PROMO09|       10|      39000|2024-04-22 17:05:54|         35|         1|
|  256471| PROMO10|        1|      14000|2024-04-23 16:21:21|         36|        14|
|  218664| PROMO16|        1|      47000|2024-04-22 20:34:47|         10|         1|
|  244040| PROMO12|        1|       9000|2024-04-23 09:51:51|    

In [61]:
from datetime import datetime
from pyspark.sql.functions import col, concat, dayofmonth, lit, month, to_timestamp, year

# Wall-clock time on the driver, formatted once so that every row receives the
# same `last_applied_date` value.
current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

fullDf = (
    fullDf
    # Parse the update-time column into a timestamp.
    .withColumn(last_update_time, to_timestamp(col(last_update_time)))
    # Derive the year / month / day columns from that timestamp.
    .withColumn('year', year(col(last_update_time)))
    .withColumn('month', month(col(last_update_time)))
    .withColumn('day', dayofmonth(col(last_update_time)))
    # Stamp each row with the time this load was prepared.
    .withColumn('last_applied_date', to_timestamp(lit(current_datetime)))
)








In [62]:
# Without this step the insert-select below failed with:
#   "Incoming records violate the writer assumption that records are clustered
#    by spec and by partition within each spec. Either cluster the incoming
#    records or switch to fanout writers."
# To avoid it, the data is repartitioned by the partition columns and sorted
# on those same columns within each Spark partition.
partition_cols = ("year", "month", "day")
fullDf = fullDf.repartition(*partition_cols).sortWithinPartitions(*partition_cols)




In [63]:
# Register the DataFrame as the temp view "<source_table_name>_initial" so it
# can be referenced from Spark SQL (replaces any existing view of that name).
fullDf.createOrReplaceTempView(f"{source_table_name}_initial")




In [64]:
# Preview the DataFrame again, now including the year / month / day and
# last_applied_date columns; show() prints at most 20 rows by default.
fullDf.show()

+--------+--------+---------+-----------+-------------------+-----------+----------+----+-----+---+-------------------+
|order_id|promo_id|order_cnt|order_price|           order_dt|customer_id|product_id|year|month|day|  last_applied_date|
+--------+--------+---------+-----------+-------------------+-----------+----------+----+-----+---+-------------------+
|  172376| PROMO10|        8|      35000|2024-04-24 02:40:10|         90|         4|2024|    4| 24|2024-04-30 02:19:37|
|  181420| PROMO11|        4|       9000|2024-04-24 07:05:54|         28|         2|2024|    4| 24|2024-04-30 02:19:37|
|  186901| PROMO05|        9|       9000|2024-04-24 04:19:56|         75|        15|2024|    4| 24|2024-04-30 02:19:37|
|  129025| PROMO19|        1|      42000|2024-04-24 01:20:38|          7|         4|2024|    4| 24|2024-04-30 02:19:37|
|  118004| PROMO06|        4|      38000|2024-04-24 03:09:38|         38|        16|2024|    4| 24|2024-04-30 02:19:37|
|  148775| PROMO03|        3|       6000

In [65]:
# Create the target database in the configured catalog if it is not there yet.
target_namespace = ".".join((catalog_name, database_name))
spark.sql("CREATE DATABASE IF NOT EXISTS " + target_namespace)

DataFrame[]


In [66]:
# List the tables currently registered in the target database.
existing_tables = spark.sql(
    "SHOW TABLES IN {}.{};".format(catalog_name, database_name)
)




In [67]:
# Despite the `df_` prefix this is a plain Python list holding the table names.
table_name_rows = existing_tables.select('tableName').collect()
df_existing_tables = [name_row['tableName'] for name_row in table_name_rows]




In [52]:
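# Manual reset (destructive): uncomment to drop the Iceberg table before re-running the initial load.
# spark.sql(f"""DROP TABLE {catalog_name}.{database_name}.{iceberg_table_name}""")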

DataFrame[]


In [68]:
# Create the Iceberg table, partitioned by year / month / day, from the
# "<source_table_name>_initial" temp view in one CTAS statement. Because of
# IF NOT EXISTS, nothing is written when the table is already present.
target_table = f"{catalog_name}.{database_name}.{iceberg_table_name}"
initial_view = f"{source_table_name}_initial"
spark.sql(f"""CREATE TABLE IF NOT EXISTS {target_table}
            USING iceberg 
            PARTITIONED BY (year, month, day)
            as (SELECT * from {initial_view})""")

DataFrame[]
