In [20]:
%session_id_prefix customer_cdc_full_iceberg_01
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg",
  "--JOB_NAME": "customer_cdc_full_iceberg"
}

Setting session ID prefix to customer_cdc_full_iceberg_01


You are already connected to a glueetl session 476466d6-1a93-45d7-9f13-4b9d8e1879e3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 3.0


You are already connected to a glueetl session 476466d6-1a93-45d7-9f13-4b9d8e1879e3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is 60 minutes.
idle_timeout has been set to 60 minutes.


You are already connected to a glueetl session 476466d6-1a93-45d7-9f13-4b9d8e1879e3.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions', '--datalake-formats': 'iceberg', '--JOB_NAME': 'customer_cdc_full_iceberg'}


In [85]:
catalog_name = "glue_catalog"
bucket_name = "chiholee-datalake001"
database_name = "ecommerce"

table_name = "customer"
last_update_time = 'last_update_time'

source_bucket_prefix = "transaction/initial/raw"
source_path = f"s3://{bucket_name}/{source_bucket_prefix}"
source_table_name = table_name

iceberg_bucket_prefix = "transaction/iceberg/glue"
warehouse_path = f"s3://{bucket_name}/{iceberg_bucket_prefix}"
iceberg_table_name = f"{table_name}_cdc_glue_iceberg"








In [86]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .config(f"spark.sql.catalog.{catalog_name}", "org.apache.iceberg.spark.SparkCatalog") \
    .config(f"spark.sql.catalog.{catalog_name}.warehouse", f"{warehouse_path}") \
    .config(f"spark.sql.catalog.{catalog_name}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog") \
    .config(f"spark.sql.catalog.{catalog_name}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") \
    .config("spark.sql.extensions","org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .getOrCreate()




In [87]:
import sys
from awsglue.job import Job
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions


glueContext = GlueContext(spark)

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
job = Job(glueContext)
job.init(args['JOB_NAME'], args)





In [88]:
print(f'{source_path}/{database_name}/{source_table_name}/')

s3://chiholee-datalake001/transaction/initial/raw/ecommerce/customer/


In [89]:
fullDyf = glueContext.create_dynamic_frame_from_options(
    connection_type='s3',
    connection_options={
        'paths': [f'{source_path}/{database_name}/{source_table_name}/'],
        'groupFiles': 'none',
        'recurse': True
    },
    format='parquet',
    transformation_ctx='fullDyf')




In [90]:
print(f"Count of data after last job bookmark:{fullDyf.count()}")

Count of data after last job bookmark:100


In [91]:
fullDf = fullDyf.toDF()




In [92]:
fullDf.show()

+-----------+--------------------+-------------------+------------+-----------+----------+---------+--------------------+--------+---------+-------------------+-------------+---+------+-------------------------------------+-------------------+------+
|customer_id|            password|         last_login|is_superuser|   username|first_name|last_name|               email|is_staff|is_active|        date_joined| phone_number|age|gender|                              address|   last_update_time|  name|
+-----------+--------------------+-------------------+------------+-----------+----------+---------+--------------------+--------+---------+-------------------+-------------+---+------+-------------------------------------+-------------------+------+
|         38|pbkdf2_sha256$260...|2023-04-12 01:14:51|           0|     isubin|      성수|       박|gimyeeun@example.org|       1|        1|2021-06-09 11:53:35| 044-323-7026| 61|    여|    울산광역시 구로구 테헤란9거리 ...|2023-04-08 21:07:22|엄도현|
|         66|pbk

In [93]:
from datetime import datetime
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import concat, col, lit, to_timestamp

current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
fullDf = fullDf.withColumn(last_update_time,to_timestamp(col(last_update_time)))
fullDf = (fullDf
      .withColumn('year', year(col(last_update_time)))
      .withColumn('month', month(col(last_update_time)))
      .withColumn('day', dayofmonth(col(last_update_time)))
     )
fullDf = fullDf.withColumn('last_applied_date',to_timestamp(lit(current_datetime)))








In [94]:
# Incoming records violate the writer assumption that records are clustered by spec and by partition within each spec. Either cluster the incoming records or switch to fanout writers.
# 아래 insert select 시 위의 에러가 발생하여 파티션 & 정렬함
fullDf = fullDf.repartition("year", "month", "day").sortWithinPartitions("year", "month", "day")




In [95]:
fullDf.createOrReplaceTempView(f"{source_table_name}_initial")




In [96]:
fullDf.show()

+-----------+--------------------+-------------------+------------+---------------+----------+---------+--------------------+--------+---------+-------------------+-------------+---+------+-----------------------------------+-------------------+------+----+-----+---+-------------------+
|customer_id|            password|         last_login|is_superuser|       username|first_name|last_name|               email|is_staff|is_active|        date_joined| phone_number|age|gender|                            address|   last_update_time|  name|year|month|day|  last_applied_date|
+-----------+--------------------+-------------------+------------+---------------+----------+---------+--------------------+--------+---------+-------------------+-------------+---+------+-----------------------------------+-------------------+------+----+-----+---+-------------------+
|          2|pbkdf2_sha256$260...|2023-04-13 01:45:01|           1|            AWS|       AWS|      AWS|       aws@gmail.com|       1|  

In [97]:
spark.sql(f"CREATE DATABASE IF NOT EXISTS {catalog_name}.{database_name}")

DataFrame[]


In [98]:
existing_tables = spark.sql(f"SHOW TABLES IN {catalog_name}.{database_name};")




In [99]:
df_existing_tables = existing_tables.select('tableName').rdd.flatMap(lambda x:x).collect()




In [84]:
spark.sql(f"""DROP TABLE {catalog_name}.{database_name}.{iceberg_table_name}""")

DataFrame[]


In [100]:
spark.sql(f"""CREATE TABLE IF NOT EXISTS {catalog_name}.{database_name}.{iceberg_table_name}
            USING iceberg 
            PARTITIONED BY (year, month, day)
            as (SELECT * from {source_table_name}_initial)""")

DataFrame[]
