In [1]:
# Define the AWS env variables if you are using AWS Auth:
%env AWS_REGION= region
%env AWS_ACCESS_KEY_ID= key
%env AWS_SECRET_ACCESS_KEY= secret

env: AWS_REGION=region
env: AWS_ACCESS_KEY_ID=key
env: AWS_SECRET_ACCESS_KEY=secret


In [2]:
import pyspark
from pyspark.sql import SparkSession
import os


conf = (
    pyspark.SparkConf()
        .setAppName('app_name')
    
    # first we will define the packages that we need. Iceberg Spark runtime
        .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.2.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178')
        
    # This property allows us to add any extensions that we want to use
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
    
    # configures a new catalog to a particular implementation of SparkCatalog
        .set('spark.sql.catalog.glue', 'org.apache.iceberg.spark.SparkCatalog')
    
    # particular type of catalog we are using
        .set('spark.sql.catalog.glue.catalog-impl', 'org.apache.iceberg.aws.glue.GlueCatalog')
    
    # engine writes to the warehouse
        .set('spark.sql.catalog.glue.warehouse', 's3://my-bucket/warehouse/')
    
    # changes IO impl of catalog, mainly for changing writing data to object storage
        .set('spark.sql.catalog.glue.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
)

## Start Spark Session
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Running")

:: loading settings :: url = jar:file:/home/docker/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/docker/.ivy2/cache
The jars for the packages stored in: /home/docker/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.3_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e1fec518-8617-4155-a095-5a9e1323c5b4;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.2.0 in central
	found software.amazon.awssdk#bundle;2.17.178 in central
	found software.amazon.eventstream#eventstream;1.0.1 in central
	found software.amazon.awssdk#url-connection-client;2.17.178 in central
	found software.amazon.awssdk#utils;2.17.178 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found software.amazon.awssdk#annotations;2.17.178 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found software.amazon.awssdk#http-client-spi;2.17.178 in central
	found software

23/09/12 19:41:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/09/12 19:41:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
Spark Running


In [55]:
spark.sql('''CREATE TABLE glue.test.inventory_new(
  product_id string,
  product_name string,
  stock_level string,
  price string,
  last_updated string) USING iceberg''')

DataFrame[]

In [67]:
spark.sql('''INSERT INTO glue.test.inventory_new VALUES (1, 'Pasta-thin', 60, 45, '3/25/2023'),
(2, 'Bread-white', 55, 6, '3/10/2023'),
(3, 'Eggs-nonorg', 100, 8, '3/12/2023'),
(4, 'Sausage-pork', 72, 25, '3/29/2023'),
(5, 'Coffee-vanilla', 30, 45, '3/12/2023')
''')

                                                                                

DataFrame[]

In [70]:
spark.sql('''INSERT INTO glue.test.inventory_new VALUES (6, 'Maple Syrup', 20, 85, '3/29/2023'),
(7, 'Protein Bar', 120, 5, '3/15/2023')
''')

                                                                                

DataFrame[]

In [71]:
spark.sql("SELECT * FROM glue.test.inventory_new").toPandas()

                                                                                

Unnamed: 0,product_id,product_name,stock_level,price,last_updated
0,1,Pasta,50,35,3/24/2023
1,6,Maple Syrup,20,85,3/29/2023
2,7,Protein Bar,120,5,3/15/2023
3,2,Bread-brown,87,8,3/25/2023
4,3,Eggs-organic,30,11,3/26/2023
5,4,Sausage-chicken,100,20,3/24/2023
6,1,Pasta-thin,60,45,3/25/2023
7,5,Coffee-arabiaca,45,60,3/18/2023
8,2,Bread-white,55,6,3/10/2023
9,3,Eggs-nonorg,100,8,3/12/2023


In [85]:
spark.sql('''CREATE TABLE glue.test.inventory_new_summary(
    product_id string,
    total_stock string,
    avg_price string) USING iceberg''')

DataFrame[]

In [86]:
spark.sql('''INSERT INTO glue.test.inventory_new_summary 
SELECT 
    product_id,  
    SUM(stock_level) AS total_stock, 
    AVG(price) AS avg_price
FROM glue.test.inventory_new
GROUP BY product_id;
''')

                                                                                

DataFrame[]

In [87]:
spark.sql("SELECT * FROM glue.test.inventory_new_summary").toPandas()

                                                                                

Unnamed: 0,product_id,total_stock,avg_price
0,1,110.0,40.0
1,2,142.0,7.0
2,6,20.0,85.0
3,3,130.0,9.5
4,4,172.0,22.5
5,5,75.0,52.5
6,7,120.0,5.0


In [88]:
spark.sql("SELECT * FROM glue.test.inventory_new").toPandas()

                                                                                

Unnamed: 0,product_id,product_name,stock_level,price,last_updated
0,6,Maple Syrup,20,85,3/29/2023
1,7,Protein Bar,120,5,3/15/2023
2,2,Bread-brown,87,8,3/25/2023
3,1,Pasta,50,35,3/24/2023
4,3,Eggs-organic,30,11,3/26/2023
5,4,Sausage-chicken,100,20,3/24/2023
6,5,Coffee-arabiaca,45,60,3/18/2023
7,1,Pasta-thin,60,45,3/25/2023
8,2,Bread-white,55,6,3/10/2023
9,3,Eggs-nonorg,100,8,3/12/2023


In [90]:
spark.sql('''UPDATE glue.test.inventory_new
SET stock_level = stock_level - 15
WHERE product_name = 'Bread-white' ''')

                                                                                

DataFrame[]

In [91]:
spark.sql("SELECT * FROM glue.test.inventory_new").toPandas()

                                                                                

Unnamed: 0,product_id,product_name,stock_level,price,last_updated
0,1,Pasta,30.0,35,3/24/2023
1,6,Maple Syrup,20.0,85,3/29/2023
2,2,Bread-white,40.0,6,3/10/2023
3,2,Bread-brown,87.0,8,3/25/2023
4,1,Pasta-thin,60.0,45,3/25/2023
5,7,Protein Bar,120.0,5,3/15/2023
6,3,Eggs-organic,30.0,11,3/26/2023
7,4,Sausage-chicken,100.0,20,3/24/2023
8,5,Coffee-arabiaca,45.0,60,3/18/2023
9,3,Eggs-nonorg,100.0,8,3/12/2023


In [92]:
spark.sql("SELECT * FROM glue.test.inventory_new.history").toPandas()

  series = series.astype(t, copy=False)


Unnamed: 0,made_current_at,snapshot_id,parent_id,is_current_ancestor
0,2023-09-12 21:33:42.793,6092371825016876429,,True
1,2023-09-12 21:40:27.836,4816648710583642722,6.092372e+18,True
2,2023-09-12 21:40:38.411,8367599232686988199,4.816649e+18,True
3,2023-09-12 21:45:33.840,5569908650637172278,8.367599e+18,True
4,2023-09-12 21:48:57.585,25292813513278887,5.569909e+18,True
5,2023-09-12 21:50:19.610,3459378582266846364,2.529281e+16,True
6,2023-09-12 22:59:24.421,2959510555509473926,3.459379e+18,True
7,2023-09-12 23:00:54.631,2557325773776943708,2.959511e+18,True


In [95]:
spark.sql(f"CALL glue.system.create_changelog_view( table => 'glue.test.inventory_new', \
                                                options => map('start-snapshot-id','4816648710583642722',\
                                                'end-snapshot-id', '2557325773776943708'),\
                                                changelog_view => 'inventory_changes_new' )" )

DataFrame[changelog_view: string]

In [96]:
spark.sql("SELECT * FROM glue.test.inventory_new.changes").toPandas()

                                                                                

Unnamed: 0,product_id,product_name,stock_level,price,last_updated,_change_type,_change_ordinal,_commit_snapshot_id
0,1,Pasta,50.0,35,6.178942165101334E-5,DELETE,1,4816648710583642722
1,1,Pasta,50.0,35,6.178942165101334E-5,INSERT,0,6092371825016876429
2,2,Bread-brown,87.0,8,3/25/2023,INSERT,3,5569908650637172278
3,3,Eggs-organic,30.0,11,3/26/2023,INSERT,3,5569908650637172278
4,4,Sausage-chicken,100.0,20,3/24/2023,INSERT,3,5569908650637172278
5,5,Coffee-arabiaca,45.0,60,3/18/2023,INSERT,3,5569908650637172278
6,6,Maple Syrup,20.0,85,3/29/2023,INSERT,5,3459378582266846364
7,7,Protein Bar,120.0,5,3/15/2023,INSERT,5,3459378582266846364
8,1,Pasta,50.0,35,3/24/2023,INSERT,2,8367599232686988199
9,1,Pasta,50.0,35,3/24/2023,DELETE,6,2959510555509473926


In [102]:
spark.sql("select * from glue.test.inventory_new.changes").toPandas()

                                                                                

Unnamed: 0,product_id,product_name,stock_level,price,last_updated,_change_type,_change_ordinal,_commit_snapshot_id
0,1,Pasta,30.0,35,3/24/2023,INSERT,6,2959510555509473926
1,1,Pasta,50.0,35,6.178942165101334E-5,INSERT,0,6092371825016876429
2,2,Bread-brown,87.0,8,3/25/2023,INSERT,3,5569908650637172278
3,3,Eggs-organic,30.0,11,3/26/2023,INSERT,3,5569908650637172278
4,4,Sausage-chicken,100.0,20,3/24/2023,INSERT,3,5569908650637172278
5,5,Coffee-arabiaca,45.0,60,3/18/2023,INSERT,3,5569908650637172278
6,6,Maple Syrup,20.0,85,3/29/2023,INSERT,5,3459378582266846364
7,7,Protein Bar,120.0,5,3/15/2023,INSERT,5,3459378582266846364
8,1,Pasta,50.0,35,6.178942165101334E-5,DELETE,1,4816648710583642722
9,2,Bread-white,55.0,6,3/10/2023,DELETE,7,2557325773776943708


In [107]:
spark.sql("SELECT * FROM glue.test.inventory_new_summary").toPandas()

                                                                                

Unnamed: 0,product_id,total_stock,avg_price
0,1,110.0,40.0
1,2,142.0,7.0
2,6,20.0,85.0
3,3,130.0,9.5
4,4,172.0,22.5
5,5,75.0,52.5
6,7,120.0,5.0


In [118]:
spark.sql("SELECT * FROM glue.test.inventory_new_summary").toPandas()

                                                                                

Unnamed: 0,product_id,total_stock,avg_price
0,1,90.0,40.0
1,2,127.0,7.0
2,6,20.0,85.0
3,3,130.0,9.5
4,4,172.0,22.5
5,5,75.0,52.5
6,7,120.0,5.0


In [109]:
spark.sql("""
    CREATE OR REPLACE TEMPORARY VIEW aggregated_changes AS 
    SELECT
        product_id,
        SUM(CASE 
            WHEN _change_type = 'INSERT' THEN stock_level
            WHEN _change_type = 'DELETE' THEN -stock_level
            ELSE 0 END) AS total_stock_change,
        AVG(price) AS new_avg_price
    FROM
        glue.test.inventory_new.changes
    GROUP BY
        product_id
""")


DataFrame[]

In [117]:
spark.sql("""
    MERGE INTO glue.test.inventory_new_summary AS target
    USING aggregated_changes AS source
    ON target.product_id = source.product_id
    WHEN MATCHED THEN 
        UPDATE SET 
            target.total_stock = source.total_stock_change
    WHEN NOT MATCHED THEN 
        INSERT (product_id, total_stock, avg_price)
        VALUES (source.product_id, source.total_stock_change, source.new_avg_price)
""")


                                                                                

DataFrame[]