In [5]:
%session_id_prefix native-iceberg-dataframe-
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
  "--datalake-formats": "iceberg"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.4 
Setting session ID prefix to native-iceberg-dataframe-
Setting Glue version to: 3.0
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions', '--datalake-formats': 'iceberg'}


In [32]:
# Notebook-wide configuration: Iceberg catalog alias, S3 bucket, database and
# table names shared by the cells below.
catalog_name = "glue_catalog"
bucket_name = "chiholee-datalake001"
database_name = "ecommerce"

# Raw (parquet) source location and its logical table name.
source_bucket_prefix = "transaction/initial/raw"
source_path = f"s3://{bucket_name}/{source_bucket_prefix}"
source_table_name = "orders"

# Iceberg warehouse location and the target table for the orders data.
iceberg_bucket_prefix = "transaction/iceberg"
warehouse_path = f"s3://{bucket_name}/{iceberg_bucket_prefix}"
iceberg_table_name = "orders_cdc_iceberg"

# The products example cells further down reference `table_name`, which was
# never defined and therefore raised NameError on a fresh kernel. It is kept
# distinct from `iceberg_table_name` so that the DROP/CREATE statements in
# those cells cannot touch the orders table.
# NOTE(review): "products" is a placeholder name - adjust if another table was intended.
table_name = "products"








In [2]:
from pyspark.sql import SparkSession

# Register an Iceberg SparkCatalog under the alias `catalog_name`, using the
# Glue catalog implementation and S3 file IO, with `warehouse_path` as the
# warehouse location, then obtain the session.
catalog_conf_prefix = f"spark.sql.catalog.{catalog_name}"
spark = (
    SparkSession.builder
    .config(catalog_conf_prefix, "org.apache.iceberg.spark.SparkCatalog")
    .config(f"{catalog_conf_prefix}.warehouse", warehouse_path)
    .config(f"{catalog_conf_prefix}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config(f"{catalog_conf_prefix}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)




In [16]:
from awsglue.context import GlueContext




In [17]:
# GlueContext is constructed from a SparkContext, not a SparkSession. Passing
# the session only worked by accident; hand over the session's underlying
# context explicitly, as the later setup cell in this notebook does.
glueContext = GlueContext(spark.sparkContext)




In [33]:
# Show the S3 prefix that the raw orders data is read from.
raw_orders_prefix = f'{source_path}/{database_name}/{source_table_name}/'
print(raw_orders_prefix)

s3://chiholee-datalake001/transaction/initial/raw/ecommerce/orders/


In [36]:
# Read the parquet files under the raw orders prefix into a Glue DynamicFrame.
source_s3_options = {
    'paths': [f'{source_path}/{database_name}/{source_table_name}/'],
    'groupFiles': 'none',
    'recurse': True,
}
fullDyf = glueContext.create_dynamic_frame_from_options(
    connection_type='s3',
    connection_options=source_s3_options,
    format='parquet',
    transformation_ctx='fullDyf',
)




In [37]:
# Print the number of records loaded into the DynamicFrame.
# NOTE(review): the message mentions a job bookmark, but no Job/bookmark is
# initialised in the visible cells, so this is presumably the full row count - confirm.
print(f"Count of data after last job bookmark:{fullDyf.count()}")

Count of data after last job bookmark:288650


In [38]:
# Convert the Glue DynamicFrame into a Spark DataFrame for the column-level
# transformations that follow.
fullDf = fullDyf.toDF()




In [39]:
# Preview the raw orders rows before any derived columns are added.
fullDf.show()

+--------+--------+---------+-----------+-------------------+-----------+----------+
|order_id|promo_id|order_cnt|order_price|           order_dt|customer_id|product_id|
+--------+--------+---------+-----------+-------------------+-----------+----------+
|  110485| PROMO03|        8|      47000|2024-04-20 11:45:16|         82|        11|
|  181343| PROMO07|        2|      34000|2024-04-22 00:58:27|         38|         8|
|  101099| PROMO09|        3|      43000|2024-04-20 06:52:35|         40|        16|
|  172154| PROMO01|        4|      32000|2024-04-21 20:07:37|         69|        12|
|  146144| PROMO17|        5|      32000|2024-04-21 06:35:34|          6|        20|
|  123105| PROMO08|       10|      24000|2024-04-20 18:23:47|         52|         6|
|  105389| PROMO11|        4|      25000|2024-04-20 09:06:05|         77|         9|
|  163394| PROMO03|        5|      23000|2024-04-21 15:33:58|         64|        14|
|  175538| PROMO03|        6|      19000|2024-04-21 21:53:15|    

In [45]:
from datetime import datetime
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import concat, col, lit, to_timestamp

# Driver-side wall-clock time, stamped on every row as `last_applied_date`.
current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Cast `order_dt` to a timestamp, derive the year/month/day columns used for
# partitioning from it, and append the load timestamp.
order_dt_col = col('order_dt')
fullDf = (
    fullDf
    .withColumn('order_dt', to_timestamp(order_dt_col))
    .withColumn('year', year(order_dt_col))
    .withColumn('month', month(order_dt_col))
    .withColumn('day', dayofmonth(order_dt_col))
    .withColumn('last_applied_date', to_timestamp(lit(current_datetime)))
)








In [72]:
# Error seen from the Iceberg writer:
#   "Incoming records violate the writer assumption that records are clustered by spec
#    and by partition within each spec. Either cluster the incoming records or switch
#    to fanout writers."
# The INSERT ... SELECT below raised the error above, so the rows are repartitioned
# and sorted by the partition columns before being written.
fullDf = fullDf.repartition("year", "month", "day").sortWithinPartitions("year", "month", "day")




In [73]:
# Expose the prepared DataFrame to Spark SQL as the temp view
# "<source_table_name>_initial", which the CTAS / INSERT statements below select from.
fullDf.createOrReplaceTempView(f"{source_table_name}_initial")




In [74]:
# Preview the prepared rows, now including year/month/day and last_applied_date.
fullDf.show()

+--------+--------+---------+-----------+-------------------+-----------+----------+----+-----+---+-------------------+
|order_id|promo_id|order_cnt|order_price|           order_dt|customer_id|product_id|year|month|day|  last_applied_date|
+--------+--------+---------+-----------+-------------------+-----------+----------+----+-----+---+-------------------+
|  281230| PROMO08|        8|      25000|2024-04-24 05:15:36|         96|         1|2024|    4| 24|2024-04-26 07:00:05|
|  277880| PROMO03|        4|      38000|2024-04-24 03:31:24|         71|         4|2024|    4| 24|2024-04-26 07:00:05|
|  280276| PROMO18|        2|      17000|2024-04-24 04:44:53|         22|        12|2024|    4| 24|2024-04-26 07:00:05|
|  286657| PROMO01|        5|       7000|2024-04-24 13:38:47|         99|        14|2024|    4| 24|2024-04-26 07:00:05|
|  272537| PROMO11|        6|      44000|2024-04-24 00:42:58|         29|        12|2024|    4| 24|2024-04-26 07:00:05|
|  277759| PROMO08|        7|      26000

In [47]:
# Create the target database in the Iceberg catalog if it does not exist yet.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {catalog_name}.{database_name}")

DataFrame[]


In [48]:
# List the tables currently registered in the target database of the Iceberg catalog.
existing_tables = spark.sql(f"SHOW TABLES IN {catalog_name}.{database_name};")




In [49]:
# Collect the table names into a plain Python list (despite the `df_` prefix,
# the result is a list of strings, not a DataFrame).
df_existing_tables = [
    table_row[0] for table_row in existing_tables.select('tableName').collect()
]




In [52]:
# Create the orders Iceberg table (table format version 2) from the temp view via
# CTAS. No partition spec is declared in this variant.
spark.sql(f"""CREATE TABLE IF NOT EXISTS {catalog_name}.{database_name}.{iceberg_table_name}
            USING iceberg 
            TBLPROPERTIES ('format-version'='2')
            as (SELECT * from {source_table_name}_initial)""")

DataFrame[]


In [75]:
# Create an empty, partitioned Iceberg table "<iceberg_table_name>4" with an explicit
# schema. The column order matches the temp view so that the positional
# INSERT ... SELECT * in the next cell lines up.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {catalog_name}.{database_name}.{iceberg_table_name}4 (
  order_id int,
  promo_id string,
  order_cnt int,
  order_price int,
  order_dt timestamp,
  customer_id bigint,
  product_id int,
  year int,
  month int,
  day int,
  last_applied_date timestamp
)
USING iceberg 
PARTITIONED BY (year, month, day)""")
# (A CTAS form, "AS SELECT * FROM <source_table_name>_initial", was tried here and left disabled.)

DataFrame[]


In [76]:
# Load the prepared rows into the explicitly-defined table. SELECT * maps columns
# by position, so the view's column order must match the table definition.
spark.sql(f"""
INSERT INTO {catalog_name}.{database_name}.{iceberg_table_name}4
SELECT * from {source_table_name}_initial""")


DataFrame[]


In [83]:
# Create the orders Iceberg table partitioned by year/month/day via CTAS.
# NOTE(review): this targets the same table name as the earlier unpartitioned CTAS.
# Because of IF NOT EXISTS, running the notebook top to bottom makes this a no-op
# unless the table was dropped first (execution counts show the DROP cell below
# was run before this one) - reorder or remove one of the variants.
spark.sql(f"""CREATE TABLE IF NOT EXISTS {catalog_name}.{database_name}.{iceberg_table_name}
            USING iceberg 
            PARTITIONED BY (year, month, day)
            as (SELECT * from {source_table_name}_initial)""")

DataFrame[]


In [82]:
# Drop the orders Iceberg table. IF EXISTS keeps the cell re-runnable: a plain
# DROP TABLE fails once the table is already gone, and the other drop cell in
# this notebook already uses the IF EXISTS form.
spark.sql(f"""DROP TABLE IF EXISTS {catalog_name}.{database_name}.{iceberg_table_name}""")

DataFrame[]


In [8]:
# Drop the example table if it exists so the cells below start from a clean state.
# Requires `table_name` to be defined earlier in the session.
query = f"""
DROP TABLE IF EXISTS {catalog_name}.{database_name}.{table_name}
"""
spark.sql(query)

DataFrame[]


In [4]:
# Create the database if it is missing.
# NOTE(review): unlike the catalog-qualified CREATE DATABASE used for the orders
# flow, this statement has no catalog prefix, so it runs against the session's
# current catalog rather than explicitly against `catalog_name` - confirm intent.
query = f"""
CREATE DATABASE IF NOT EXISTS {database_name}
"""
spark.sql(query)
     

DataFrame[]


In [5]:
from pyspark.sql import Row
import time

# Epoch seconds as a float; stored as-is in the `updated_at` column.
ut = time.time()

# Five sample products, given as tuples in the same order as the column names.
product_columns = ["product_id", "product_name", "price", "category", "updated_at"]
product_rows = [
    ("00001", "Heater", 250, "Electronics", ut),
    ("00002", "Thermostat", 400, "Electronics", ut),
    ("00003", "Television", 600, "Electronics", ut),
    ("00004", "Blender", 100, "Electronics", ut),
    ("00005", "Table", 150, "Furniture", ut),
]

df_products = spark.createDataFrame(product_rows, product_columns)

df_products.show()

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00001|      Heater|  250|Electronics|1.7141134651999865E9|
|     00002|  Thermostat|  400|Electronics|1.7141134651999865E9|
|     00003|  Television|  600|Electronics|1.7141134651999865E9|
|     00004|     Blender|  100|Electronics|1.7141134651999865E9|
|     00005|       Table|  150|  Furniture|1.7141134651999865E9|
+----------+------------+-----+-----------+--------------------+


In [6]:
# Sort rows by category within each Spark partition, then create the table
# "<catalog_name>.<database_name>.<table_name>" from the DataFrame (DataFrameWriterV2).
products_sorted = df_products.sortWithinPartitions("category")
(
    products_sorted
    .writeTo(f"{catalog_name}.{database_name}.{table_name}")
    .create()
)
     




In [7]:
# List the tables in `database_name` through the Spark catalog API.
# NOTE(review): no Iceberg catalog prefix is passed here, so this presumably goes
# through the session's default catalog - confirm.
spark.catalog.listTables(database_name)

[Table(name='accesslog', database='ecommerce', description=None, tableType='EXTERNAL', isTemporary=False), Table(name='orders_cdc_iceberg', database='ecommerce', description=None, tableType=None, isTemporary=False)]


In [9]:
from pyspark.sql import Row
import time

# Epoch seconds as a float; stored as-is in the `updated_at` field.
ut = time.time()

# Five sample products described as dicts; each dict becomes one Row.
product = [
    dict(product_id='00001', product_name='Heater', price=250, category='Electronics', updated_at=ut),
    dict(product_id='00002', product_name='Thermostat', price=400, category='Electronics', updated_at=ut),
    dict(product_id='00003', product_name='Television', price=600, category='Electronics', updated_at=ut),
    dict(product_id='00004', product_name='Blender', price=100, category='Electronics', updated_at=ut),
    dict(product_id='00005', product_name='USB charger', price=50, category='Electronics', updated_at=ut),
]

product_records = (Row(**fields) for fields in product)
df_products = spark.createDataFrame(product_records)
     




In [10]:
# Register the products DataFrame as the temp view "tmp_<table_name>" so the CTAS
# statement below can select from it. Requires `table_name` to be defined.
df_products.createOrReplaceTempView(f"tmp_{table_name}")




In [11]:
# Create the database if it is missing (same statement as the earlier
# un-prefixed CREATE DATABASE cell; it carries no catalog prefix).
query = f"""
CREATE DATABASE IF NOT EXISTS {database_name}
"""
spark.sql(query)

DataFrame[]


In [12]:
# Create the Iceberg table from the "tmp_<table_name>" temp view via CTAS.
# There is no IF NOT EXISTS here, so the statement fails if the table already exists.
query = f"""
CREATE TABLE {catalog_name}.{database_name}.{table_name}
USING iceberg
AS SELECT * FROM tmp_{table_name}
"""
spark.sql(query)

DataFrame[]


In [13]:
# Clean up: drop the example table created above.
# There is no IF EXISTS here, so the statement fails if the table is already gone.
query = f"""
DROP TABLE {catalog_name}.{database_name}.{table_name}
"""
spark.sql(query)

DataFrame[]


# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [7]:
%help


# Available Magic Commands

## Sessions Magic

----
    %help                             Return a list of descriptions and input types for all magic commands. 
    %profile            String        Specify a profile in your aws configuration to use as the credentials provider.
    %region             String        Specify the AWS region in which to initialize a session. 
                                      Default from ~/.aws/config on Linux or macOS, 
                                      or C:\Users\ USERNAME \.aws\config" on Windows.
    %idle_timeout       Int           The number of minutes of inactivity after which a session will timeout. 
                                      Default: 2880 minutes (48 hours).
    %session_id_prefix  String        Define a String that will precede all session IDs in the format 
                                      [session_id_prefix]-[session_id]. If a session ID is not provided,
                                      a random UUID will be generated.
    %status                           Returns the status of the current Glue session including its duration, 
                                      configuration and executing user / role.
    %session_id                       Returns the session ID for the running session. 
    %list_sessions                    Lists all currently running sessions by ID.
    %stop_session                     Stops the current session.
    %glue_version       String        The version of Glue to be used by this session. 
                                      Currently, the only valid options are 2.0, 3.0 and 4.0. 
                                      Default: 2.0.
----

## Selecting Session Types

----
    %streaming          String        Sets the session type to Glue Streaming.
    %etl                String        Sets the session type to Glue ETL.
    %glue_ray           String        Sets the session type to Glue Ray.
    %session_type       String        Specify a session_type to be used. Supported values: streaming, etl and glue_ray. 
----

## Glue Config Magic 
*(common across all session types)*

----

    %%configure         Dictionary    A json-formatted dictionary consisting of all configuration parameters for 
                                      a session. Each parameter can be specified here or through individual magics.
    %iam_role           String        Specify an IAM role ARN to execute your session with.
                                      Default from ~/.aws/config on Linux or macOS, 
                                      or C:\Users\%USERNAME%\.aws\config` on Windows.
    %number_of_workers  int           The number of workers of a defined worker_type that are allocated 
                                      when a session runs.
                                      Default: 5.
    %additional_python_modules  List  Comma separated list of additional Python modules to include in your cluster 
                                      (can be from Pypi or S3).
    %%tags        Dictionary          Specify a json-formatted dictionary consisting of tags to use in the session.
    
    %%assume_role Dictionary, String  Specify a json-formatted dictionary or an IAM role ARN string to create a session 
                                      for cross account access.
                                      E.g. {valid arn}
                                      %%assume_role 
                                      'arn:aws:iam::XXXXXXXXXXXX:role/AWSGlueServiceRole' 
                                      E.g. {credentials}
                                      %%assume_role
                                      {
                                            "aws_access_key_id" : "XXXXXXXXXXXX",
                                            "aws_secret_access_key" : "XXXXXXXXXXXX",
                                            "aws_session_token" : "XXXXXXXXXXXX"
                                       }
----

                                      
## Magic for Spark Sessions (ETL & Streaming)

----
    %worker_type        String        Set the type of instances the session will use as workers. 
    %connections        List          Specify a comma separated list of connections to use in the session.
    %extra_py_files     List          Comma separated list of additional Python files From S3.
    %extra_jars         List          Comma separated list of additional Jars to include in the cluster.
    %spark_conf         String        Specify custom spark configurations for your session. 
                                      E.g. %spark_conf spark.serializer=org.apache.spark.serializer.KryoSerializer
----
                                      
## Magic for Ray Session

----
    %min_workers        Int           The minimum number of workers that are allocated to a Ray session. 
                                      Default: 1.
    %object_memory_head Int           The percentage of free memory on the instance head node after a warm start. 
                                      Minimum: 0. Maximum: 100.
    %object_memory_worker Int         The percentage of free memory on the instance worker nodes after a warm start. 
                                      Minimum: 0. Maximum: 100.
----

## Action Magic

----

    %%sql               String        Run SQL code. All lines after the initial %%sql magic will be passed
                                      as part of the SQL code.  
    %matplot      Matplotlib figure   Visualize your data using the matplotlib library.
                                      E.g. 
                                      import matplotlib.pyplot as plt
                                      # Set X-axis and Y-axis values
                                      x = [5, 2, 8, 4, 9]
                                      y = [10, 4, 8, 5, 2]
                                      # Create a bar chart 
                                      plt.bar(x, y) 
                                      # Show the plot
                                      %matplot plt    
    %plotly            Plotly figure  Visualize your data using the plotly library.
                                      E.g.
                                      import plotly.express as px
                                      #Create a graphical figure
                                      fig = px.line(x=["a","b","c"], y=[1,3,2], title="sample figure")
                                      #Show the figure
                                      %plotly fig

  
                
----



####  Run this cell to set up and start your interactive session.


In [None]:
# %idle_timeout 2880
# %glue_version 4.0
# %worker_type G.1X
# %number_of_workers 5

# import sys
# from awsglue.transforms import *
# from awsglue.utils import getResolvedOptions
# from pyspark.context import SparkContext
# from awsglue.context import GlueContext
# from awsglue.job import Job
  
# sc = SparkContext.getOrCreate()
# glueContext = GlueContext(sc)
# spark = glueContext.spark_session
# job = Job(glueContext)

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [None]:
# dyf = glueContext.create_dynamic_frame.from_catalog(database='database_name', table_name='table_name')
# dyf.printSchema()

#### Example: Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [None]:
# df = dyf.toDF()
# df.show()

#### Example: Visualize data with matplotlib


In [None]:
# import matplotlib.pyplot as plt

# # Set X-axis and Y-axis values
# x = [5, 2, 8, 4, 9]
# y = [10, 4, 8, 5, 2]
  
# # Create a bar chart 
# plt.bar(x, y)
  
# # Show the plot
# %matplot plt

#### Example: Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog


In [None]:
# s3output = glueContext.getSink(
#   path="s3://bucket_name/folder_name",
#   connection_type="s3",
#   updateBehavior="UPDATE_IN_DATABASE",
#   partitionKeys=[],
#   compression="snappy",
#   enableUpdateCatalog=True,
#   transformation_ctx="s3output",
# )
# s3output.setCatalogInfo(
#   catalogDatabase="demo", catalogTableName="populations"
# )
# s3output.setFormat("glueparquet")
# s3output.writeFrame(DyF)

In [5]:
%stop_session

Stopping session: 6bff742e-f05a-4996-aa04-36037a308ac0
Stopped session.


In [1]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

from pyspark.sql.functions import *
from awsglue.dynamicframe import DynamicFrame

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, max, col


from pyspark.conf import SparkConf

Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Session ID: c9770581-873d-4116-a3d5-7bcb49d3bac4
Applying the following default arguments:
--glue_kernel_version 1.0.4
--enable-glue-datacatalog true
Waiting for session c9770581-873d-4116-a3d5-7bcb49d3bac4 to get into ready status...
Session c9770581-873d-4116-a3d5-7bcb49d3bac4 has been created.



In [2]:
# Alias under which the Iceberg catalog is registered in the Spark configuration below.
CATALOG="iceberg_catalog"




In [3]:
# Build a SparkConf that registers an Iceberg SparkCatalog named CATALOG, using the
# Glue catalog implementation, S3 file IO and a DynamoDB-based lock manager, and
# enables the Iceberg SQL extensions.
# NOTE(review): unlike the SparkSession.builder configuration earlier in this notebook,
# no "<catalog>.warehouse" location is set here - confirm whether tables created
# through this catalog need one.
# NOTE(review): a lock manager implementation presumably also needs a lock table
# property; none is set here - verify against the Iceberg AWS documentation.
conf = SparkConf()
conf.set(f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog")
conf.set(f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
conf.set(f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
conf.set(f"spark.sql.catalog.{CATALOG}.lock-impl", "org.apache.iceberg.aws.glue.DynamoLockManager")
conf.set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")



<pyspark.conf.SparkConf object at 0x7f80c8e24e80>


In [7]:
# 3. Set the Spark + Glue context
# SparkContext.getOrCreate() only uses `conf` when it has to create a new context.
# If a context is already active it is returned unchanged and the settings in `conf`
# are silently ignored. (SparkContext(conf=conf) is not an option in that case,
# because a second context cannot be started.)
sc = SparkContext.getOrCreate(conf=conf)
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Re-apply the Iceberg catalog settings on the live session so they take effect even
# when an existing context was reused. Only the "spark.sql.catalog.*" keys are copied:
# they are ordinary runtime SQL settings, whereas "spark.sql.extensions" is a static
# setting that cannot be changed on a running session and must be supplied when the
# session is started (e.g. via the %%configure "--conf" argument).
for _conf_key, _conf_value in conf.getAll():
    if _conf_key.startswith("spark.sql.catalog."):
        spark.conf.set(_conf_key, _conf_value)




In [11]:
# Location of the raw source data: S3 prefix, database folder and table folder.
# Combined below as "<RAW_S3_PATH>/<DATABASE>/<TABLE_NAME>/".
RAW_S3_PATH="s3://chiholee-datalake001/transaction/initial/raw"
DATABASE="ecommerce"
TABLE_NAME="orders"




In [13]:
# Error case:
# if this read fails, check the S3 permissions of the session's role.
# Reads the parquet files under the raw orders prefix into a Glue DynamicFrame.
fullDyf = glueContext.create_dynamic_frame_from_options(
    connection_type='s3',
    connection_options={
        'paths': [f'{RAW_S3_PATH}/{DATABASE}/{TABLE_NAME}/'],
        'groupFiles': 'none',
        'recurse': True
    },
    format='parquet',
    transformation_ctx='fullDyf')




In [14]:
# Number of records read from the raw orders prefix.
fullDyf.count()

288650


In [15]:
# Print a sample of the DynamicFrame records.
fullDyf.show()

{"order_id": 14729, "promo_id": "PROMO18", "order_cnt": 10, "order_price": 22000, "order_dt": 2024-04-18 09:24:40.0, "customer_id": 59, "product_id": 5}
{"order_id": 39050, "promo_id": "PROMO02", "order_cnt": 4, "order_price": 37000, "order_dt": 2024-04-19 06:38:29.0, "customer_id": 70, "product_id": 3}
{"order_id": 34951, "promo_id": "PROMO13", "order_cnt": 5, "order_price": 41000, "order_dt": 2024-04-18 20:02:42.0, "customer_id": 95, "product_id": 8}
{"order_id": 62228, "promo_id": "PROMO05", "order_cnt": 5, "order_price": 27000, "order_dt": 2024-04-20 23:19:08.0, "customer_id": 52, "product_id": 17}
{"order_id": 2873, "promo_id": "PROMO06", "order_cnt": 9, "order_price": 49000, "order_dt": 2024-04-18 03:14:56.0, "customer_id": 41, "product_id": 15}
{"order_id": 31017, "promo_id": "PROMO06", "order_cnt": 5, "order_price": 38000, "order_dt": 2024-04-24 00:03:54.0, "customer_id": 53, "product_id": 20}
{"order_id": 42684, "promo_id": "PROMO10", "order_cnt": 8, "order_price": 26000, "ord

In [None]:
# Example object under the raw orders prefix (kept as a note only; a bare S3 URI
# is not valid Python and made this code cell raise SyntaxError when run):
# s3://chiholee-datalake001/transaction/initial/raw/ecommerce/orders/LOAD00000001.parquet