In [1]:
import os
from pyspark.sql import SparkSession

In [6]:
# Location of the Hudi Spark bundle jar, taken from the environment so the
# notebook is not tied to one particular Hudi build.
HUDI_JAR = os.getenv("HUDI_SPARK_BUNDLE")
if not HUDI_JAR:
    # Fail fast: without the bundle the Spark session cannot load Hudi.
    raise EnvironmentError("HUDI_SPARK_BUNDLE environment variable not set")

# Jars needed for s3a:// access, as one comma-separated string.
HADOOP_S3_JAR = ",".join(
    [
        "/opt/spark/jars/hadoop-aws-3.3.4.jar",
        "/opt/spark/jars/aws-java-sdk-bundle-1.12.734.jar",
    ]
)

# Every jar handed to Spark: the Hudi bundle first, then the S3 jars.
ALL_JARS = ",".join((HUDI_JAR, HADOOP_S3_JAR))

In [1]:
def get_spark(app_name="Hudi-Jupyter"):
    """
    Initialize (or reuse) a SparkSession with Hudi extensions and S3A settings.

    Parameters:
    - app_name (str): Optional name for the Spark application.

    Environment variables:
    - HUDI_SPARK_BUNDLE: must be set; checked on every call.
    - S3A_ENDPOINT, S3A_ACCESS_KEY, S3A_SECRET_KEY: optional overrides for the
      object-store connection. When unset, the local-development values
      ("http://minio:9090", "minio", "minio123") are used.

    Returns:
    - SparkSession object

    Raises:
    - EnvironmentError: if HUDI_SPARK_BUNDLE is not set.
    """
    if not os.environ.get("HUDI_SPARK_BUNDLE"):
        raise EnvironmentError("HUDI_SPARK_BUNDLE environment variable not set")

    # Connection settings are overridable through the environment so real
    # credentials never have to be written into the notebook; the fallbacks
    # keep the existing local setup working unchanged.
    s3_endpoint = os.environ.get("S3A_ENDPOINT", "http://minio:9090")
    s3_access_key = os.environ.get("S3A_ACCESS_KEY", "minio")
    s3_secret_key = os.environ.get("S3A_SECRET_KEY", "minio123")

    spark = (
        SparkSession.builder
        # Use the caller's name; it was previously ignored in favour of a
        # hard-coded 'HudiCRUD'.
        .appName(app_name)
        .config("spark.jars", ALL_JARS)
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
        .config("spark.sql.hive.convertMetastoreParquet", "false")
        .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint)
        .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
        .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .config("spark.hadoop.fs.defaultFS", "s3a://warehouse")
        .enableHiveSupport()
        .getOrCreate()
    )

    spark.sparkContext.setLogLevel("ERROR")
    # getOrCreate() can return an already-running session, so report the name
    # the session actually has rather than the one that was requested.
    print(f"SparkSession started with app name: {spark.sparkContext.appName}")
    return spark