In [25]:
%session_id_prefix native-iceberg-dataframe-
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg",
  "--JOB_NAME": "cdc_full_iceberg"
}

Setting session ID prefix to native-iceberg-dataframe-


You are already connected to a glueetl session 476466d6-1a93-45d7-9f13-4b9d8e1879e3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 3.0


You are already connected to a glueetl session 476466d6-1a93-45d7-9f13-4b9d8e1879e3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is 60 minutes.
idle_timeout has been set to 60 minutes.


You are already connected to a glueetl session 476466d6-1a93-45d7-9f13-4b9d8e1879e3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions', '--datalake-formats': 'iceberg', '--JOB_NAME': 'cdc_full_iceberg'}


In [117]:
catalog_name = "glue_catalog"
bucket_name = "chiholee-datalake001"
database_name = "ecommerce"

table_name = "product"
last_update_time = 'last_update_time'

source_bucket_prefix = "transaction/initial/raw"
source_path = f"s3://{bucket_name}/{source_bucket_prefix}"
source_table_name = table_name

iceberg_bucket_prefix = "transaction/iceberg/glue"
warehouse_path = f"s3://{bucket_name}/{iceberg_bucket_prefix}"
iceberg_table_name = f"{table_name}_cdc_glue_iceberg"








In [118]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .config(f"spark.sql.catalog.{catalog_name}", "org.apache.iceberg.spark.SparkCatalog") \
    .config(f"spark.sql.catalog.{catalog_name}.warehouse", f"{warehouse_path}") \
    .config(f"spark.sql.catalog.{catalog_name}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog") \
    .config(f"spark.sql.catalog.{catalog_name}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") \
    .config("spark.sql.extensions","org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .getOrCreate()




In [119]:
import sys
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from awsglue.job import Job


glueContext = GlueContext(spark)

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
job = Job(glueContext)
job.init(args['JOB_NAME'], args)





In [120]:
print(f'{source_path}/{database_name}/{source_table_name}/')

s3://chiholee-datalake001/transaction/initial/raw/ecommerce/product/


In [121]:
fullDyf = glueContext.create_dynamic_frame_from_options(
    connection_type='s3',
    connection_options={
        'paths': [f'{source_path}/{database_name}/{source_table_name}/'],
        'groupFiles': 'none',
        'recurse': True
    },
    format='parquet',
    transformation_ctx='fullDyf')




In [122]:
print(f"Count of data after last job bookmark:{fullDyf.count()}")

Count of data after last job bookmark:20


In [123]:
fullDf = fullDyf.toDF()




In [124]:
fullDf.show()

+----------+--------+-----------+--------+-----+-------------------+
|product_id|    name|   img_path|category|price|   last_update_time|
+----------+--------+-----------+--------+-----+-------------------+
|        15|  고추장|img/15.jpeg|    반찬|12000|2023-04-10 14:24:21|
|        11|      쌀|img/11.jpeg|  농산물|35000|2023-04-10 14:23:58|
|        13|  비빔밥|img/13.jpeg|    분식| 9000|2023-04-10 14:24:15|
|        10|  핫도그| img/10.jpg|    분식| 1600|2023-04-07 14:54:42|
|         5|  삼계탕| img/05.jpg|    육류|15000|2023-04-07 14:52:19|
|        14|  깍뚜기|img/14.jpeg|    반찬|10000|2023-04-10 14:24:38|
|        12|    두부|img/12.jpeg|    반찬| 3500|2023-04-10 14:24:08|
|        19|  계란찜|img/19.jpeg|    반찬| 7000|2023-04-10 14:24:52|
|         6|  발효빵| img/03.jpg|      빵| 7000|2023-04-07 14:53:04|
|         4|  삼겹살| img/04.jpg|    육류|11000|2023-04-07 14:51:55|
|         7|  고추전| img/07.jpg|    분식|11000|2023-04-07 14:53:31|
|         2|    김치| img/01.jpg|    반찬| 8000|2023-04-07 03:11:09|
|        16|    게장|im

In [125]:
from datetime import datetime
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import concat, col, lit, to_timestamp

current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
fullDf = fullDf.withColumn(last_update_time,to_timestamp(col(last_update_time)))
fullDf = (fullDf
      .withColumn('year', year(col(last_update_time)))
      .withColumn('month', month(col(last_update_time)))
      .withColumn('day', dayofmonth(col(last_update_time)))
     )
fullDf = fullDf.withColumn('last_applied_date',to_timestamp(lit(current_datetime)))








In [126]:
# Incoming records violate the writer assumption that records are clustered by spec and by partition within each spec. Either cluster the incoming records or switch to fanout writers.
# 아래 insert select 시 위의 에러가 발생하여 파티션 & 정렬함
fullDf = fullDf.repartition("year", "month", "day").sortWithinPartitions("year", "month", "day")




In [127]:
fullDf.createOrReplaceTempView(f"{source_table_name}_initial")




In [128]:
fullDf.show()

+----------+--------+-----------+--------+-----+-------------------+----+-----+---+-------------------+
|product_id|    name|   img_path|category|price|   last_update_time|year|month|day|  last_applied_date|
+----------+--------+-----------+--------+-----+-------------------+----+-----+---+-------------------+
|         5|  삼계탕| img/05.jpg|    육류|15000|2023-04-07 14:52:19|2023|    4|  7|2024-04-30 02:25:34|
|         7|  고추전| img/07.jpg|    분식|11000|2023-04-07 14:53:31|2023|    4|  7|2024-04-30 02:25:34|
|         3|  떡볶이| img/02.jpg|    분식| 4000|2023-04-07 03:22:07|2023|    4|  7|2024-04-30 02:25:34|
|         9|    치킨| img/09.jpg|    육류|18000|2023-04-07 14:54:28|2023|    4|  7|2024-04-30 02:25:34|
|        10|  핫도그| img/10.jpg|    분식| 1600|2023-04-07 14:54:42|2023|    4|  7|2024-04-30 02:25:34|
|         8|    족발| img/08.jpg|    안주|18000|2023-04-07 14:53:59|2023|    4|  7|2024-04-30 02:25:34|
|         6|  발효빵| img/03.jpg|      빵| 7000|2023-04-07 14:53:04|2023|    4|  7|2024-04-30 02

In [129]:
spark.sql(f"CREATE DATABASE IF NOT EXISTS {catalog_name}.{database_name}")

DataFrame[]


In [130]:
existing_tables = spark.sql(f"SHOW TABLES IN {catalog_name}.{database_name};")




In [131]:
df_existing_tables = existing_tables.select('tableName').rdd.flatMap(lambda x:x).collect()




In [132]:
spark.sql(f"""DROP TABLE {catalog_name}.{database_name}.{iceberg_table_name}""")

AnalysisException: Table or view not found for 'DROP TABLE': glue_catalog.ecommerce.product_cdc_glue_iceberg; line 1 pos 0;
'DropTable false, false
+- 'UnresolvedTableOrView [glue_catalog, ecommerce, product_cdc_glue_iceberg], DROP TABLE, true



In [133]:
spark.sql(f"""CREATE TABLE IF NOT EXISTS {catalog_name}.{database_name}.{iceberg_table_name}
            USING iceberg 
            PARTITIONED BY (year, month, day)
            as (SELECT * from {source_table_name}_initial)""")

DataFrame[]
