In [1]:
!pip install faker

Collecting faker
  Downloading faker-39.0.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-39.0.0-py3-none-any.whl (2.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.0 MB[0m [31m26.2 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-39.0.0


In [2]:
import os

# get the accessKey and secretKey from Environment
accessKey = os.environ['AWS_ACCESS_KEY_ID']
secretKey = os.environ['AWS_SECRET_ACCESS_KEY']

import pyspark
from pyspark.sql import SparkSession

conf = pyspark.SparkConf()

# point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)
conf.setMaster("spark://spark-master:7077")

# set other options as desired
conf.set("spark.executor.memory", "8g")
conf.set("spark.executor.cores", "2")
conf.set("spark.core.connection.ack.wait.timeout", "1200")
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.endpoint", "http://minio-1:9000")
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3a.access.key", accessKey)
conf.set("spark.hadoop.fs.s3a.secret.key", secretKey)
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
conf.set("spark.sql.catalogImplementation", "hive")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
conf.set("spark.sql.catalog.hive", "org.apache.iceberg.spark.SparkCatalog")
conf.set("spark.sql.catalog.spark_catalog.type", "hive")
conf.set("spark.sql.catalog.spark_catalog.uri", "thrift://hive-metastore:9083")
conf.set("spark.sql.catalog.spark_catalog.warehouse", "s3a://admin-bucket/iceberg/warehouse")
conf.set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
conf.set("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.0")
conf.set("spark.sql.legacy.allowNonEmptyLocationInCTAS","true")
conf.set("spark.sql.hive.metastore.jars","builtin")

spark = SparkSession.builder.appName('Jupyter').config(conf=conf).getOrCreate()
spark.sparkContext.setLogLevel("INFO")

sc = spark.sparkContext

In [3]:
%load_ext sql
%sql spark

In [4]:
%%sql
SHOW CATALOGS

Field 1
spark_catalog


In [5]:
%%sql
DROP TABLE IF EXISTS raw_person_xl;

CREATE TABLE
  IF NOT EXISTS raw_person_xl (
    surrogate_key STRING,
    person_id STRING,
    -- identity
    salutation STRING,
    title STRING,
    first_name STRING,
    middle_name STRING,
    last_name STRING,
    suffix STRING,
    gender STRING,
    -- contact
    email STRING,
    phone_mobile STRING,
    phone_home STRING,
    -- address
    street STRING,
    house_number STRING,
    postal_code STRING,
    city STRING,
    state STRING,
    country STRING,
    -- personal
    birth_date DATE,
    nationality STRING,
    marital_status STRING,
    number_of_children INT,
    -- employment
    employment_status STRING,
    job_title STRING,
    employer STRING,
    annual_income DOUBLE,
    -- identifiers
    national_id STRING,
    tax_id STRING,
    -- metadata
    source_system STRING,
    status STRING,
    export_date DATE,
    load_ts TIMESTAMP
  ) USING iceberg PARTITIONED BY (days (export_date)) LOCATION 's3a://warehouse-bucket/warehouse/raw_person_xl';

In [6]:
%%sql
DROP TABLE IF EXISTS dim_person_xl;

CREATE TABLE
  IF NOT EXISTS dim_person_xl (
    surrogate_key STRING,
    person_id STRING,
    -- identity
    salutation STRING,
    title STRING,
    first_name STRING,
    middle_name STRING,
    last_name STRING,
    suffix STRING,
    gender STRING,
    -- contact
    email STRING,
    phone_mobile STRING,
    phone_home STRING,
    -- address
    street STRING,
    house_number STRING,
    postal_code STRING,
    city STRING,
    state STRING,
    country STRING,
    -- personal
    birth_date DATE,
    nationality STRING,
    marital_status STRING,
    number_of_children INT,
    -- employment
    employment_status STRING,
    job_title STRING,
    employer STRING,
    annual_income DOUBLE,
    -- identifiers
    national_id STRING,
    tax_id STRING,
    -- metadata
    source_system STRING,
    record_status STRING,
    -- SCD2 metadata columns
    valid_from TIMESTAMP,
    valid_to TIMESTAMP,
    is_current_version BOOLEAN,
    is_active BOOLEAN,
    source_loaded_at TIMESTAMP,
    created_at TIMESTAMP,
    replaced_at TIMESTAMP,
    -- Additional metadata
    change_type STRING,
    record_hash STRING
  ) USING iceberg PARTITIONED BY (days (valid_from), is_current_version) LOCATION 's3a://warehouse-bucket/warehouse/dim_person_xl';

In [7]:
import random
from datetime import date, timedelta
from faker import Faker

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

from faker import Faker
fake = Faker()
Faker.seed(42)


In [10]:
schema = StructType([
    StructField("surrogate_key", StringType()),
    StructField("person_id", StringType()),

    StructField("salutation", StringType()),
    StructField("title", StringType()),
    StructField("first_name", StringType()),
    StructField("middle_name", StringType()),
    StructField("last_name", StringType()),
    StructField("suffix", StringType()),
    StructField("gender", StringType()),

    StructField("email", StringType()),
    StructField("phone_mobile", StringType()),
    StructField("phone_home", StringType()),

    StructField("street", StringType()),
    StructField("house_number", StringType()),
    StructField("postal_code", StringType()),
    StructField("city", StringType()),
    StructField("state", StringType()),
    StructField("country", StringType()),

    StructField("birth_date", DateType()),
    StructField("nationality", StringType()),
    StructField("marital_status", StringType()),
    StructField("number_of_children", IntegerType()),

    StructField("employment_status", StringType()),
    StructField("job_title", StringType()),
    StructField("employer", StringType()),
    StructField("annual_income", DoubleType()),

    StructField("national_id", StringType()),
    StructField("tax_id", StringType()),

    StructField("source_system", StringType()),
    StructField("status", StringType())
])


In [11]:
import uuid
def generate_person_row(person_id: int):
    return (
        str(uuid.uuid4()),
        str(person_id),
        random.choice(["Mr", "Ms", "Mrs", "Dr"]),
        random.choice(["", "Dr", "Prof"]),
        fake.first_name(),
        fake.first_name() if random.random() < 0.3 else None,
        fake.last_name(),
        random.choice(["", "Jr", "Sr"]),
        random.choice(["M", "Fi", "X"]),

        fake.email(),
        fake.phone_number(),
        fake.phone_number(),

        fake.street_name(),
        str(fake.building_number()),
        fake.postcode(),
        fake.city(),
        fake.state(),
        fake.country_code(),

        fake.date_of_birth(minimum_age=18, maximum_age=90),
        fake.country_code(),
        random.choice(["single", "married", "divorced", "widowed"]),
        random.randint(0, 4),

        random.choice(["employed", "self-employed", "unemployed", "retired"]),
        fake.job(),
        fake.company(),
        round(random.uniform(30_000, 180_000), 2),

        fake.ssn(),
        fake.bothify("??######"),

        random.choice(["CRM", "ERP", "HR"]),
        "ACTIVE"
    )


In [12]:
INITIAL_PERSONS = 200_000   # scale here

rows = [generate_person_row(i) for i in range(INITIAL_PERSONS)]
df = spark.createDataFrame(rows, schema).cache()
next_person_id = INITIAL_PERSONS

In [13]:
UPDATE_RATE = 0.05
INSERT_RATE = 0.01
DELETE_RATE = 0.005


In [14]:
def apply_daily_changes(df, next_person_id):
    updates = df.sample(UPDATE_RATE)
    deletes = df.sample(DELETE_RATE)

    updated = (
        updates
        .withColumn("email", F.when(F.rand() < 0.6, fake.email()).otherwise(F.col("email")))
        .withColumn("street", F.when(F.rand() < 0.4, fake.street_name()).otherwise(F.col("street")))
        .withColumn("job_title", F.when(F.rand() < 0.3, fake.job()).otherwise(F.col("job_title")))
        .withColumn(
            "annual_income",
            F.when(F.rand() < 0.3, F.col("annual_income") * (1 + (F.rand() - 0.5) / 5))
             .otherwise(F.col("annual_income"))
        )
    )

    remaining = df.subtract(deletes).subtract(updates)

    inserts_count = int(df.count() * INSERT_RATE)
    new_rows = [
        generate_person_row(next_person_id + i)
        for i in range(inserts_count)
    ]

    inserts = spark.createDataFrame(new_rows, schema)

    full_export = remaining.unionByName(updated).unionByName(inserts)

    return full_export, next_person_id + inserts_count


In [15]:
start_date = date(2024, 1, 1)
DAYS = 30

spark.sparkContext.setCheckpointDir("s3a://admin-bucket/checkpoints")

for d in range(DAYS):
    export_date = start_date + timedelta(days=d)

    df, next_person_id = apply_daily_changes(df, next_person_id)

    out_df = (
        df
        .withColumn("export_date", F.lit(export_date))
        .withColumn("load_ts", F.current_timestamp())
    )

    out_df.writeTo("raw_person_xl").overwrite(F.expr(f"export_date = DATE '{export_date}'"))

    # üîë break lineage
    df = df.checkpoint(eager=True)
    
    spark.catalog.clearCache()

    print(f"{export_date} | rows={out_df.count()} | next_person_id={next_person_id}")


2024-01-01 | rows=201081 | next_person_id=202000
2024-01-02 | rows=202124 | next_person_id=204010
2024-01-03 | rows=203216 | next_person_id=206031
2024-01-04 | rows=204274 | next_person_id=208063
2024-01-05 | rows=205312 | next_person_id=210105
2024-01-06 | rows=206357 | next_person_id=212158
2024-01-07 | rows=207455 | next_person_id=214221
2024-01-08 | rows=208536 | next_person_id=216295
2024-01-09 | rows=209642 | next_person_id=218380
2024-01-10 | rows=210727 | next_person_id=220476
2024-01-11 | rows=211855 | next_person_id=222583
2024-01-12 | rows=212988 | next_person_id=224701
2024-01-13 | rows=214085 | next_person_id=226830
2024-01-14 | rows=215220 | next_person_id=228970
2024-01-15 | rows=216283 | next_person_id=231122
2024-01-16 | rows=217439 | next_person_id=233284
2024-01-17 | rows=218600 | next_person_id=235458
2024-01-18 | rows=219753 | next_person_id=237644
2024-01-19 | rows=220898 | next_person_id=239841
2024-01-20 | rows=222085 | next_person_id=242049
2024-01-21 | rows=22

## SCD2 Handling

In [17]:
fields = [
    "salutation",
    "title",
    "first_name",
    "middle_name",
    "last_name",
    "suffix",
    "gender",
    "email",
    "phone_mobile",
    "phone_home",
    "street",
    "house_number",
    "postal_code",
    "city",
    "state",
    "country",
    "birth_date",
    "nationality",
    "marital_status",
    "number_of_children",
    "employment_status",
    "job_title",
    "employer",
    "annual_income",
    "national_id",
    "tax_id"
]


In [27]:
def format_values(values, prefix):
    return ", ".join(f"{prefix}{v}" for v in values)

def format_sql(load_date: str, current_timestamp: str, table_name: str, pk_col: str, val_columns: []):
    val_columns_str = format_values(val_columns, "")
    source_val_columns_str = format_values(val_columns, "source.")
    
    stmt = f"""
    WITH changed_records AS (
        SELECT 
            src.*,
            CASE 
                WHEN tgt.{pk_col} IS NULL THEN 'NEW'
                WHEN src.row_hash != tgt.row_hash THEN 'CHANGED'
                ELSE 'UNCHANGED'
            END AS change_classification
        FROM (
        	SELECT *,
     			sha2(concat_ws('||', {val_columns_str}, status), 256) AS row_hash
  			FROM raw_person_xl
        ) src
        LEFT JOIN (
            SELECT 
                surrogate_key,
                {pk_col},
                sha2(concat_ws('||', {pk_col}, {val_columns_str}, CASE WHEN is_active THEN 'ACTIVE' else 'INACTIVE' END), 256) AS row_hash,
                source_loaded_at,
                valid_from
            FROM dim_person_xl
            WHERE is_current_version = true
        ) tgt ON src.{pk_col} = tgt.{pk_col}
        WHERE src.export_date = CAST('{load_date}' as date)
    ),
    records_to_process AS (
        SELECT *
        FROM changed_records
        WHERE change_classification IN ('NEW', 'CHANGED')
    ),
    prepared_source AS (
        -- Original records for matching existing rows (updates)
        SELECT
            surrogate_key,
            {pk_col} AS merge_key,  -- Used for matching
            {pk_col},
            {val_columns_str},
            export_date ,
            status,
            'UPDATE_EXISTING' AS operation_type
        FROM records_to_process
        
        UNION ALL
        
        -- Duplicate records with NULL key for insertions
        SELECT 
            surrogate_key,
            NULL AS merge_key,     -- NULL prevents matching, forces insert
            {pk_col},
            {val_columns_str},
            export_date,
            status,
            'INSERT_NEW_VERSION' AS operation_type
        FROM records_to_process
        WHERE change_classification = 'CHANGED'  -- Only for updates, not new records
    )
    
    MERGE INTO dim_person_xl target
    USING prepared_source source
    ON target.{pk_col} = source.merge_key 
       AND target.is_current_version = true
    -- Close existing current records for updated entities
    WHEN MATCHED 
        AND source.operation_type = 'UPDATE_EXISTING'
        AND source.export_date > target.source_loaded_at
    THEN UPDATE SET
        valid_to = source.export_date - INTERVAL 1 SECOND,
        is_current_version = false,
        change_type = 'SUPERSEDED',
        created_at = CAST('{current_timestamp}' AS TIMESTAMP)
    -- Insert new records (both new entities and new versions)
    WHEN NOT MATCHED 
    THEN INSERT (
        surrogate_key,
        {pk_col},
        {val_columns_str},
        valid_from,
        valid_to,
        is_current_version,
        is_active,
        source_loaded_at,
        created_at,
        replaced_at,
        change_type,
        record_hash
    ) VALUES (
        source.surrogate_key,
        source.{pk_col},
        {source_val_columns_str},
        source.export_date,
        CAST('9999-12-31 23:59:59' AS TIMESTAMP),  -- Far future date
        true,
        CASE WHEN source.status = 'ACTIVE' THEN true ELSE false END,
        source.export_date,
        CAST('{current_timestamp}' AS TIMESTAMP),
        CAST('9999-12-31 23:59:59' AS TIMESTAMP),  -- Far future date
        CASE 
            WHEN source.operation_type = 'UPDATE_EXISTING' THEN 'NEW'
            ELSE 'SUPERSEDED_BY'
        END,
        sha2(concat_ws('|', 
            cast(source.{pk_col} as string),
            {source_val_columns_str},status
        ), 256)
    )

    """
    return stmt

In [33]:
start_date = date(2024, 1, 1)
DAYS = 1

spark.sparkContext.setCheckpointDir("s3a://admin-bucket/checkpoints")

for d in range(DAYS):
    load_date = start_date + timedelta(days=d)

    stmt = format_sql('2024-01-02', '2025-01-07', 'person', 'person_id', fields)
    print(stmt)
    spark.sql(stmt)
        
    spark.catalog.clearCache()

    print(f"{load_date}")



    WITH changed_records AS (
        SELECT 
            src.*,
            CASE 
                WHEN tgt.person_id IS NULL THEN 'NEW'
                WHEN src.row_hash != tgt.row_hash THEN 'CHANGED'
                ELSE 'UNCHANGED'
            END AS change_classification
        FROM (
        	SELECT *,
     			sha2(concat_ws('||', salutation, title, first_name, middle_name, last_name, suffix, gender, email, phone_mobile, phone_home, street, house_number, postal_code, city, state, country, birth_date, nationality, marital_status, number_of_children, employment_status, job_title, employer, annual_income, national_id, tax_id, status), 256) AS row_hash
  			FROM raw_person_xl
        ) src
        LEFT JOIN (
            SELECT 
                surrogate_key,
                person_id,
                sha2(concat_ws('||', person_id, salutation, title, first_name, middle_name, last_name, suffix, gender, email, phone_mobile, phone_home, street, house_number, postal_code, city, state, 

Py4JJavaError: An error occurred while calling o66.sql.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1519.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1519.0 (TID 49170) (192.168.148.17 executor 0): org.apache.spark.SparkRuntimeException: [MERGE_CARDINALITY_VIOLATION] The ON search condition of the MERGE statement matched a single row from the target table with multiple rows of the source table.
This could result in the target row being operated on more than once with an update or delete operation and is not allowed.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.mergeCardinalityViolationError(QueryExecutionErrors.scala:2702)
	at org.apache.spark.sql.execution.datasources.v2.MergeRowsExec$BitmapCardinalityValidator.validate(MergeRowsExec.scala:164)
	at org.apache.spark.sql.execution.datasources.v2.MergeRowsExec$MergeRowIterator.next(MergeRowsExec.scala:203)
	at org.apache.spark.sql.execution.datasources.v2.MergeRowsExec$MergeRowIterator.next(MergeRowsExec.scala:176)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:514)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage15.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:168)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkRuntimeException: [MERGE_CARDINALITY_VIOLATION] The ON search condition of the MERGE statement matched a single row from the target table with multiple rows of the source table.
This could result in the target row being operated on more than once with an update or delete operation and is not allowed.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.mergeCardinalityViolationError(QueryExecutionErrors.scala:2702)
	at org.apache.spark.sql.execution.datasources.v2.MergeRowsExec$BitmapCardinalityValidator.validate(MergeRowsExec.scala:164)
	at org.apache.spark.sql.execution.datasources.v2.MergeRowsExec$MergeRowIterator.next(MergeRowsExec.scala:203)
	at org.apache.spark.sql.execution.datasources.v2.MergeRowsExec$MergeRowIterator.next(MergeRowsExec.scala:176)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:514)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage15.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:168)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)
