# Delta Lake Generated Columns

In [1]:
import pyspark
from delta import *
from py4j.protocol import Py4JJavaError
from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException

# Start from a plain session builder, then add the two settings Delta Lake
# needs: its SQL extension and its catalog implementation.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Let the Delta helper finish configuring the builder before the session is
# created (or an already-running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-220/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6677e20b-619c-4f8e-a096-71efe9713ada;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.2.0 in central
	found io.delta#delta-storage;2.2.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 102ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-core_2.12;2.2.0 from central in [default]
	io.delta#delta-storage;2.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |  

23/04/02 07:20:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/02 07:20:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Create table with generated column

In [16]:
from delta import DeltaTable

In [17]:
# createOrReplace (rather than create) keeps this cell re-runnable: if an
# earlier run stopped before the cleanup cell, the leftover table is replaced
# by a fresh, empty one instead of raising a "table already exists" error.
(
    DeltaTable.createOrReplace(spark)
    .tableName("default.some_people")
    .addColumn("id", "LONG")
    .addColumn("first_name", "STRING")
    .addColumn("last_name", "STRING")
    .addColumn("age", "LONG")
    # Generated column: Delta computes full_name from first_name and last_name.
    .addColumn(
        "full_name", "STRING", generatedAlwaysAs="concat(first_name, ' ', last_name)"
    )
    .execute()
)

<delta.tables.DeltaTable at 0x14db0e1c0>

In [18]:
# Nothing has been inserted yet, so only the column header is printed.
(
    spark.sql("select * from some_people")
    .show()
)

+---+----------+---------+---+---------+
| id|first_name|last_name|age|full_name|
+---+----------+---------+---+---------+
+---+----------+---------+---+---------+



## Insert into table with generated column

In [19]:
# Source rows carry no full_name value; that column is left for Delta to fill.
df = (
    spark.createDataFrame(
        [
            (0, "Bob", "Loblaw", 23),
            (1, "Sue", "Grafton", None),
            (2, "Jim", "Carrey", 61),
        ]
    )
    .toDF("id", "first_name", "last_name", "age")
)

In [20]:
# Preview the rows to be appended; this DataFrame has no full_name column.
df.show()

+---+----------+---------+----+
| id|first_name|last_name| age|
+---+----------+---------+----+
|  0|       Bob|   Loblaw|  23|
|  1|       Sue|  Grafton|null|
|  2|       Jim|   Carrey|  61|
+---+----------+---------+----+



In [21]:
# Inspect the column types Spark inferred for the source rows.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- age: long (nullable = true)



In [22]:
# Append the rows without a full_name column; Delta derives it on write.
(
    df.write
    .format("delta")
    .mode("append")
    .saveAsTable("some_people")
)

                                                                                

In [26]:
# Read the table back to confirm full_name was populated for every row.
(
    DeltaTable.forName(spark, "some_people")
    .toDF()
    .show()
)

+---+----------+---------+----+-----------+
| id|first_name|last_name| age|  full_name|
+---+----------+---------+----+-----------+
|  2|       Jim|   Carrey|  61| Jim Carrey|
|  0|       Bob|   Loblaw|  23| Bob Loblaw|
|  1|       Sue|  Grafton|null|Sue Grafton|
+---+----------+---------+----+-----------+



## Insert into table with generated column when a source column is missing (with and without mergeSchema)

In [27]:
# These rows deliberately omit last_name, one of the inputs to full_name.
df = (
    spark.createDataFrame([(8, "Liam", 66), (9, "Colin", 77)])
    .toDF("id", "first_name", "age")
)

In [28]:
# Expected failure: full_name is generated from last_name, which this DataFrame
# does not have. The error is caught and printed so the demonstration stays
# visible without halting a top-to-bottom run of the notebook.
try:
    df.write.mode("append").format("delta").saveAsTable("some_people")
except AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: Column 'last_name' does not exist. Did you mean one of the following? [first_name, age, id]; line 1 pos 24;
'Project [id#2335L AS id#2370L, first_name#2336 AS first_name#2371, age#2337L AS age#2372L, 'concat(first_name#2336,  , 'last_name) AS full_name#2373]
+- Project [_1#2329L AS id#2335L, _2#2330 AS first_name#2336, _3#2331L AS age#2337L]
   +- LogicalRDD [_1#2329L, _2#2330, _3#2331L], false


In [29]:
# Expected failure: enabling mergeSchema does not help, because the generation
# expression for full_name still references the absent last_name column. The
# error is caught and printed so later cells (including cleanup) still run.
try:
    df.write.option("mergeSchema", "true").mode("append").format("delta").saveAsTable(
        "some_people"
    )
except AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: Column 'last_name' does not exist. Did you mean one of the following? [first_name, age, id]; line 1 pos 24;
'Project [id#2335L AS id#2391L, first_name#2336 AS first_name#2392, age#2337L AS age#2393L, 'concat(first_name#2336,  , 'last_name) AS full_name#2394]
+- Project [_1#2329L AS id#2335L, _2#2330 AS first_name#2336, _3#2331L AS age#2337L]
   +- LogicalRDD [_1#2329L, _2#2330, _3#2331L], false


## What happens when users supply generated column values

In [31]:
# Supply full_name explicitly, with values that do not match
# concat(first_name, ' ', last_name).
df = (
    spark.createDataFrame(
        [
            (21, "Curtis", "Jackson", 47, "50 cent"),
            (22, "Eric", "Wright", None, "easy-e"),
        ]
    )
    .toDF("id", "first_name", "last_name", "age", "full_name")
)

In [32]:
# Expected failure: the supplied full_name values violate the generated-column
# check (full_name <=> concat(first_name, ' ', last_name)). The JVM-side error
# surfaces as Py4JJavaError; catch and print it so the cleanup cell still runs.
try:
    df.write.mode("append").format("delta").saveAsTable("some_people")
except Py4JJavaError as error:
    print(f"Py4JJavaError: {error}")

23/04/02 10:03:38 ERROR Utils: Aborting task
org.apache.spark.sql.delta.schema.DeltaInvariantViolationException: CHECK constraint Generated Column (full_name <=> concat(first_name, ' ', last_name)) violated by row with values:
 - first_name : Curtis
 - full_name : 50 cent
 - last_name : Jackson
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException$.apply(InvariantViolationException.scala:60)
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException$.apply(InvariantViolationException.scala:70)
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException.apply(InvariantViolationException.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.CheckDeltaInvariant_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.delta.constraints.DeltaInvariantCheckerExec.$anonfun$doExecute$3(DeltaInvariantCheckerExec.scala:80)
	

Py4JJavaError: An error occurred while calling o234.saveAsTable.
: org.apache.spark.sql.delta.schema.DeltaInvariantViolationException: CHECK constraint Generated Column (full_name <=> concat(first_name, ' ', last_name)) violated by row with values:
 - first_name : Eric
 - full_name : easy-e
 - last_name : Wright
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException$.apply(InvariantViolationException.scala:60)
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException$.apply(InvariantViolationException.scala:70)
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException.apply(InvariantViolationException.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.CheckDeltaInvariant_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.delta.constraints.DeltaInvariantCheckerExec.$anonfun$doExecute$3(DeltaInvariantCheckerExec.scala:80)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithIterator(FileFormatDataWriter.scala:92)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:331)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1538)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:338)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$21(FileFormatWriter.scala:256)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)


## Cleanup

In [14]:
# IF EXISTS keeps cleanup from failing when the table was already dropped or
# was never created (for example, if an earlier cell did not run).
spark.sql("drop table if exists some_people")

DataFrame[]