In [None]:
# Install the Python-side Spark helpers into the kernel's environment.
# NOTE(review): neither package is version-pinned, while the Iceberg runtime
# requested further down is fixed at 0.12.0 -- presumably pyspark should be
# pinned to a Spark release that runtime supports; confirm and pin.
%pip install findspark pyspark

In [None]:
import configparser
import os
import findspark
from pyspark.sql import SparkSession, functions
from pyspark import SparkConf, SparkContext
from py4j.java_gateway import java_import
# NOTE(review): findspark's documented usage is to call init() *before*
# importing pyspark; the pyspark imports above have already run by this point,
# so confirm this call is still needed / the ordering is intended.
findspark.init()

# Read connection settings from the [DEFAULT] section of config.ini
CONFIG_PATH = 'config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that result to fail with a clear
# message instead of a confusing KeyError on the first lookup below.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        "Could not read configuration file: {}".format(os.path.abspath(CONFIG_PATH))
    )

# Report every missing option at once rather than one KeyError per run.
_settings = config['DEFAULT']
_missing = [
    key for key in ('warehouse', 'secret', 'storageAccountName', 'hive_uri')
    if key not in _settings
]
if _missing:
    raise KeyError(
        "Missing option(s) in [DEFAULT] of {}: {}".format(CONFIG_PATH, ", ".join(_missing))
    )

warehouse = _settings['warehouse']            # value for spark_catalog.warehouse
secret = _settings['secret']                  # Azure storage account key -- never print or log it
storageAccountName = _settings['storageAccountName']
hive_uri = _settings['hive_uri']              # value for spark_catalog.uri

# Maven coordinates for the Iceberg runtime and the Azure storage connectors,
# joined into the comma-separated form expected by spark.jars.packages.
ICEBERG_VERSION = "0.12.0"
_maven_packages = [
    "org.apache.iceberg:iceberg-spark3-runtime:{}".format(ICEBERG_VERSION),
    "org.apache.hadoop:hadoop-azure:3.2.0",
    "com.microsoft.azure:azure-storage:7.0.0",
    "org.apache.hadoop:hadoop-azure-datalake:3.2.0",
    # Previously tried, currently disabled:
    # "org.apache.hadoop:hadoop-common:3.2.0",
    # "org.apache.hadoop:hadoop-core:1.2.1",
]
DEPENDENCIES = ",".join(_maven_packages)

conf = SparkConf()

# Extra jars (Iceberg runtime + Azure connectors) resolved when the session starts
conf.set('spark.jars.packages', DEPENDENCIES)

# Arrow-accelerated pandas conversion, using the Spark 3 option name
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Storage-account key for the blob endpoint. The account name comes from the
# config file rather than being hard-coded, and the "spark.hadoop." prefix is
# what makes Spark copy the property into the Hadoop configuration used by the
# filesystem connectors; without it the setting never reaches Hadoop.
conf.set(
    "spark.hadoop.fs.azure.account.key." + storageAccountName + ".blob.core.windows.net",
    secret,
)

# Back Spark's built-in session catalog with Iceberg, using the Hive metastore
conf.set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
conf.set("spark.sql.catalog.spark_catalog.type", "hive")
conf.set("spark.sql.catalog.spark_catalog.uri", hive_uri)
conf.set("spark.sql.catalog.spark_catalog.warehouse", warehouse)
conf.set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")

# getOrCreate() reuses an already-running session; in that case settings that
# are only read at start-up (such as spark.jars.packages) will not be applied,
# so restart the kernel after changing them.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
jvm = sc._gateway.jvm  # py4j view of the driver JVM, used for the Iceberg Java API

# Register the storage-account key directly on the JVM-side Hadoop configuration
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.azure.account.key." + storageAccountName + ".blob.core.windows.net", secret)

# Make the Iceberg Java classes reachable as attributes of `jvm`
for _java_class in (
    "org.apache.iceberg.CatalogUtil",
    "org.apache.iceberg.catalog.TableIdentifier",
    "org.apache.iceberg.Schema",
    "org.apache.iceberg.types.Types",
    "org.apache.iceberg.PartitionSpec",
    "org.apache.iceberg.actions.Actions",
):
    java_import(jvm, _java_class)

In [None]:
# Compact the table's data files through Iceberg's Java API (for performance)
TARGET_FILE_SIZE_BYTES = 500 * 1024 * 1024  # 500 MiB target passed to the rewrite

# Load the Hive-backed Iceberg catalog, reusing the session's Hadoop configuration
catalog_properties = {'uri': hive_uri}
catalog = jvm.CatalogUtil.loadCatalog(
    "org.apache.iceberg.hive.HiveCatalog",
    "spark_catalog",
    catalog_properties,
    sc._jsc.hadoopConfiguration(),
)

# Resolve the table to compact
table_name = jvm.TableIdentifier.parse("default.tickers")
prices_table = catalog.loadTable(table_name)

# Configure and run the data-file rewrite
rewrite_action = jvm.Actions.forTable(prices_table).rewriteDataFiles()
rewrite_action = rewrite_action.targetSizeInBytes(TARGET_FILE_SIZE_BYTES)
# NOTE(review): despite its name, `table` holds whatever execute() returns --
# presumably the rewrite result rather than a table; confirm before reusing it.
table = rewrite_action.execute()