In [1]:
# Define the AWS env variables if you are using AWS auth.
# `region`, `key` and `secret` below are placeholders: substitute your own values
# before running, and never commit real credentials together with the notebook.
%env AWS_REGION= region
%env AWS_ACCESS_KEY_ID= key
%env AWS_SECRET_ACCESS_KEY= secret

env: AWS_REGION=region
env: AWS_ACCESS_KEY_ID=key
env: AWS_SECRET_ACCESS_KEY=secret


In [3]:
import pyspark
from pyspark.sql import SparkSession
import os


# Maven coordinates Spark resolves at startup: the Iceberg Spark runtime plus the
# AWS SDK bundle and its URL-connection HTTP client.
iceberg_packages = ','.join([
    'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.2.0',
    'software.amazon.awssdk:bundle:2.17.178',
    'software.amazon.awssdk:url-connection-client:2.17.178',
])

# Settings are applied in insertion order, one conf.set(...) call per entry.
spark_settings = {
    # Packages to fetch for this session.
    'spark.jars.packages': iceberg_packages,
    # SQL extensions to load (Iceberg's session extensions).
    'spark.sql.extensions': 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions',
    # Register a catalog named "glue" implemented by Iceberg's SparkCatalog.
    'spark.sql.catalog.glue': 'org.apache.iceberg.spark.SparkCatalog',
    # Concrete catalog implementation behind the "glue" catalog.
    'spark.sql.catalog.glue.catalog-impl': 'org.apache.iceberg.aws.glue.GlueCatalog',
    # Warehouse location the engine writes to.
    'spark.sql.catalog.glue.warehouse': 's3://my-bucket/warehouse/',
    # File IO implementation used by the catalog (object storage via S3FileIO).
    'spark.sql.catalog.glue.io-impl': 'org.apache.iceberg.aws.s3.S3FileIO',
}

conf = pyspark.SparkConf().setAppName('app_name')
for setting_name, setting_value in spark_settings.items():
    conf.set(setting_name, setting_value)

## Start Spark Session
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Running")

:: loading settings :: url = jar:file:/home/docker/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/docker/.ivy2/cache
The jars for the packages stored in: /home/docker/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.3_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-00e84c3c-8393-480a-9da9-09591325e3c8;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.2.0 in central
	found software.amazon.awssdk#bundle;2.17.178 in central
	found software.amazon.eventstream#eventstream;1.0.1 in central
	found software.amazon.awssdk#url-connection-client;2.17.178 in central
	found software.amazon.awssdk#utils;2.17.178 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found software.amazon.awssdk#annotations;2.17.178 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found software.amazon.awssdk#http-client-spi;2.17.178 in central
	found software

23/08/15 20:02:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Spark Running


In [5]:
# Simple (unpartitioned) Iceberg table with five columns.
create_employees_sql = """
    CREATE TABLE glue.test.employees (
        id INT,
        role STRING,
        department STRING,
        salary FLOAT,
        region STRING)
    USING iceberg
"""
spark.sql(create_employees_sql)

DataFrame[]

In [None]:
# Iceberg table partitioned directly on the `department` column.
create_partitioned_sql = """
    CREATE TABLE glue.test.emp_partitioned (
        id INT,
        role STRING,
        department STRING)
    USING iceberg
    PARTITIONED BY (department)
"""
spark.sql(create_partitioned_sql)

In [None]:
# Iceberg table partitioned by a transform expression: months(join_date).
create_month_partitioned_sql = """
    CREATE TABLE glue.test.emp_partitioned_month (
        id INT,
        role STRING,
        department STRING,
        join_date DATE
    )
    USING iceberg
    PARTITIONED BY (months(join_date))
"""
spark.sql(create_month_partitioned_sql)

In [None]:
# CTAS: create a new Iceberg table from the rows of glue.test.sample.
ctas_sql = """
    CREATE TABLE glue.test.employee_ctas
    USING iceberg
    AS SELECT * FROM glue.test.sample
"""
spark.sql(ctas_sql)

In [6]:
# CTAS that also declares a partition column (`category`) and a table property
# (write.format.default set to 'avro').
ctas_with_properties_sql = """
    CREATE TABLE glue.test.emp_ctas_partition
    USING iceberg
    PARTITIONED BY (category)
    TBLPROPERTIES (write.format.default='avro')
    AS SELECT *
    FROM glue.test.sample
"""
spark.sql(ctas_with_properties_sql)

In [None]:
# Rename the table from glue.test.employees to glue.test.emp_renamed.
rename_table_sql = """
    ALTER TABLE glue.test.employees RENAME TO glue.test.emp_renamed
"""
spark.sql(rename_table_sql)

In [7]:
# Set the table property 'write.wap.enabled' to 'true'.
set_properties_sql = """
    ALTER TABLE glue.test.employees SET TBLPROPERTIES ('write.wap.enabled'='true')
"""
spark.sql(set_properties_sql)

In [9]:
# Add a single STRING column named `manager`.
add_column_sql = """
    ALTER TABLE glue.test.employees ADD COLUMN manager STRING
"""
spark.sql(add_column_sql)

In [None]:
# Add two columns in one statement: `details` (STRING) and `manager_id` (INT).
add_columns_sql = """
    ALTER TABLE glue.test.employees ADD COLUMN details STRING, manager_id INT
"""
spark.sql(add_columns_sql)

In [None]:
# Add a bigint column and place it immediately after `department`.
add_positioned_column_sql = """
ALTER TABLE glue.test.employees ADD COLUMN new_column bigint AFTER department
"""
spark.sql(add_positioned_column_sql)

In [None]:
# Rename the `role` column to `title`.
rename_column_sql = """
    ALTER TABLE glue.test.employees RENAME COLUMN role TO title
"""
spark.sql(rename_column_sql)

In [None]:
# Change the type of the `id` column to BIGINT.
alter_type_sql = """
    ALTER TABLE glue.test.employees ALTER COLUMN id TYPE BIGINT
"""
spark.sql(alter_type_sql)

In [None]:
# Move the `salary` column to the first position.
reorder_column_sql = "ALTER TABLE glue.test.employees ALTER COLUMN salary FIRST"
spark.sql(reorder_column_sql)

In [None]:
# Remove the `department` column from the table.
drop_column_sql = """
    ALTER TABLE glue.test.employees DROP COLUMN department
"""
spark.sql(drop_column_sql)

In [None]:
# Add `region` as a partition field (statement provided by the SQL extension).
add_partition_field_sql = """
    ALTER TABLE glue.test.employees ADD PARTITION FIELD region
"""
spark.sql(add_partition_field_sql)

In [None]:
# Drop the `department` partition field (statement provided by the SQL extension).
drop_partition_field_sql = """
    ALTER TABLE glue.test.employees DROP PARTITION FIELD department
"""
spark.sql(drop_partition_field_sql)

In [None]:
# Swap the `region` partition field for `department` (SQL extension statement).
replace_partition_field_sql = """
    ALTER TABLE glue.test.employees REPLACE PARTITION FIELD region WITH department
"""
spark.sql(replace_partition_field_sql)

In [None]:
# Declare the table's write order: ascending by `id`.
write_order_sql = """
    ALTER TABLE glue.test.employees WRITE ORDERED BY id ASC
"""
spark.sql(write_order_sql)

In [None]:
# Declare the table's write distribution: distributed by partition.
write_distribution_sql = """
    ALTER TABLE glue.test.employees WRITE DISTRIBUTED BY PARTITION
"""
spark.sql(write_distribution_sql)

In [None]:
# Mark `id` as the table's identifier field.
set_identifier_sql = """
    ALTER TABLE glue.test.employees SET IDENTIFIER FIELDS id
"""
spark.sql(set_identifier_sql)

In [None]:
# Remove `id` from the table's identifier fields.
drop_identifier_sql = """
    ALTER TABLE glue.test.employees DROP IDENTIFIER FIELDS id
"""
spark.sql(drop_identifier_sql)

In [None]:
# Drop the table (plain DROP TABLE, without the PURGE keyword).
drop_table_sql = "DROP TABLE glue.test.employees"
spark.sql(drop_table_sql)

In [None]:
# Drop the table with the PURGE keyword.
purge_table_sql = "DROP TABLE glue.test.employees PURGE"
spark.sql(purge_table_sql)

In [None]:
# Fetch every column of every row and print the result.
all_employees = spark.sql("SELECT * FROM glue.test.employees")
all_employees.show()

In [10]:
# Keep only rows whose department is 'Marketing' and print them.
marketing_employees = spark.sql("SELECT * FROM glue.test.employees WHERE department = 'Marketing'")
marketing_employees.show()

In [None]:
# Aggregated queries over glue.test.employees; each result is printed with .show().

# Count: total number of rows.
spark.sql("SELECT COUNT(*) FROM glue.test.employees").show()

# Average: mean of the salary column.
spark.sql("SELECT AVG(salary) FROM glue.test.employees").show()

# Sum: total of the salary column.
spark.sql("SELECT SUM(salary) FROM glue.test.employees").show()

# Maximum: highest salary per department. The table's CREATE TABLE statement
# defines no `category` column, so the grouping key is `department`, matching
# the window query below.
spark.sql("SELECT department, MAX(salary) FROM glue.test.employees GROUP BY department").show()

# Window Function (Rank): rank employees by salary (highest first) within
# each department.
spark.sql("""
SELECT * , RANK() OVER (PARTITION BY department ORDER BY salary DESC) as rank
FROM glue.test.employees
""").show()

In [None]:
# Append two literal rows to the table.
insert_rows_sql = """
INSERT INTO glue.test.employees 
VALUES 
(1, 'Software Engineer', 'Engineering', 25000, 'NA'),
(2, 'Director', 'Sales', 22000, 'EMEA')
"""
spark.sql(insert_rows_sql)

In [None]:
# Upsert from `employee_updates` (that table must exist before this cell runs):
#   - matched on id, and the source row is a 'Manager' earning over 100000
#     -> copy the source salary onto the target row
#   - no match on id -> insert the source row as-is
merge_sql = """
MERGE INTO glue.test.employees AS target
USING (SELECT * FROM employee_updates) AS source
ON target.id = source.id
WHEN MATCHED AND source.role = 'Manager' AND source.salary > 100000 THEN
    UPDATE SET target.salary = source.salary
WHEN NOT MATCHED THEN
    INSERT *
"""
spark.sql(merge_sql)

In [None]:
# Static overwrite: the PARTITION clause names the partition to replace
# (region = 'EMEA'); rows come from `employee_source`.
# NOTE(review): with a static PARTITION clause Spark may expect the SELECT list
# to leave out the partition column -- confirm `SELECT *` is accepted here.
static_overwrite_sql = """
INSERT OVERWRITE glue.test.employees
PARTITION (region = 'EMEA')
SELECT * 
FROM employee_source
WHERE region = 'EMEA'
"""
spark.sql(static_overwrite_sql)

In [None]:
# Dynamic overwrite (no PARTITION clause).
# Prerequisites, neither of which this cell does for you:
#   1. spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
#      must have been set in your Spark app.
#   2. The 'employee_source' table must already exist.
dynamic_overwrite_sql = """
INSERT OVERWRITE glue.test.employees
SELECT * FROM employee_source
WHERE region = 'EMEA'
"""
spark.sql(dynamic_overwrite_sql)

In [None]:
# Delete every row whose id is below 3.
delete_sql = "DELETE FROM glue.test.employees WHERE id < 3"
spark.sql(delete_sql)

In [None]:
# Update: set region and salary on the row with id = 6.
# The target is glue.test.employees, the table used by every other cell in this
# notebook (the statement previously named `glue.test.employee`).
spark.sql("""
UPDATE glue.test.employees
SET region = 'APAC', salary = 6000
WHERE id = 6
""")

In [None]:
# Expire Snapshots Procedure:
# Positional arguments are (table, older_than, retain_last). Iceberg's CALL
# syntax takes literal arguments rather than function calls such as
# date_sub(current_date(), 90), so the 90-day cutoff is computed in Python and
# passed as a TIMESTAMP literal at midnight of that day.
from datetime import date, timedelta

expire_before = date.today() - timedelta(days=90)
spark.sql(
    f"CALL glue.system.expire_snapshots('test.employees', TIMESTAMP '{expire_before} 00:00:00.000', 50)"
)

In [None]:
# Run the rewrite_data_files procedure on test.employees via the glue catalog.
rewrite_data_files_call = "CALL glue.system.rewrite_data_files('test.employees')"
spark.sql(rewrite_data_files_call)

In [None]:
# Rewrite Manifests:
# Procedures are addressed as <catalog>.system.<procedure>. The catalog
# configured in this notebook is `glue` (`test` is the database), so the call
# goes through glue.system like the other procedure cells.
spark.sql("CALL glue.system.rewrite_manifests('test.employees')")

In [None]:
# Run remove_orphan_files on test.employees using named arguments, with
# dry_run set to true.
remove_orphans_call = "CALL glue.system.remove_orphan_files(table => 'test.employees', dry_run => true)"
spark.sql(remove_orphans_call)