# Delta Lake Generated Columns

In [8]:
import pyspark
from delta import DeltaTable, configure_spark_with_delta_pip
from pyspark.sql import functions as F

# Build a Spark session with Delta Lake enabled: the SQL extension and the
# catalog override below are the two settings Delta requires on the session.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# configure_spark_with_delta_pip augments the builder so the session picks up
# the Delta Lake JARs for the pip-installed delta-spark package.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

## Create table with generated column

In [9]:
from delta import DeltaTable

In [10]:
# SQL expression Delta uses to compute full_name from the other columns.
full_name_expr = "concat(first_name, ' ', last_name)"

# Regular, user-supplied columns as (name, type) pairs.
plain_columns = [
    ("id", "LONG"),
    ("first_name", "STRING"),
    ("last_name", "STRING"),
    ("age", "LONG"),
]

people_builder = DeltaTable.create(spark).tableName("default.some_people")
for column_name, column_type in plain_columns:
    people_builder = people_builder.addColumn(column_name, column_type)

# The generated column is declared after the plain columns, so it is the last
# column in the table schema.
people_builder.addColumn(
    "full_name", "STRING", generatedAlwaysAs=full_name_expr
).execute()

<delta.tables.DeltaTable at 0x10783b070>

In [11]:
# Nothing has been written yet, so only the column header (including
# full_name) is printed.
spark.table("some_people").show()

+---+----------+---------+---+---------+
| id|first_name|last_name|age|full_name|
+---+----------+---------+---+---------+
+---+----------+---------+---+---------+



## Insert into table with generated column

In [12]:
# Rows deliberately leave out full_name; age is missing (None) for Sue.
people_rows = [
    (0, "Bob", "Loblaw", 23),
    (1, "Sue", "Grafton", None),
    (2, "Jim", "Carrey", 61),
]
df = spark.createDataFrame(people_rows, ["id", "first_name", "last_name", "age"])

In [13]:
# Preview the rows about to be appended; the DataFrame has no full_name column.
df.show()

+---+----------+---------+----+
| id|first_name|last_name| age|
+---+----------+---------+----+
|  0|       Bob|   Loblaw|  23|
|  1|       Sue|  Grafton|null|
|  2|       Jim|   Carrey|  61|
+---+----------+---------+----+



In [14]:
# Print the column types Spark inferred for the DataFrame (no explicit schema
# was passed to createDataFrame).
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- age: long (nullable = true)



In [15]:
# Append the rows to the Delta table without supplying full_name.
people_writer = df.write.format("delta").mode("append")
people_writer.saveAsTable("some_people")

                                                                                

In [16]:
# Read the table back through the DeltaTable API to view the stored rows,
# including the full_name values.
people_table = DeltaTable.forName(spark, "some_people")
people_table.toDF().show()

+---+----------+---------+----+-----------+
| id|first_name|last_name| age|  full_name|
+---+----------+---------+----+-----------+
|  2|       Jim|   Carrey|  61| Jim Carrey|
|  0|       Bob|   Loblaw|  23| Bob Loblaw|
|  1|       Sue|  Grafton|null|Sue Grafton|
+---+----------+---------+----+-----------+



## Insert into table with generated column, with and without mergeSchema

In [17]:
# These rows have no last_name column, which the full_name expression reads.
rows_without_last_name = [(8, "Liam", 66), (9, "Colin", 77)]
df = spark.createDataFrame(rows_without_last_name, ["id", "first_name", "age"])

In [18]:
from pyspark.sql.utils import AnalysisException

# This append is expected to fail: the DataFrame lacks last_name, which the
# generated full_name expression references. The error is caught and printed
# so the demonstration stays visible while "Restart & Run All" can continue
# past this cell.
try:
    df.write.mode("append").format("delta").saveAsTable("some_people")
except AnalysisException as error:
    print(f"{type(error).__name__}: {error}")

AnalysisException: Column 'last_name' does not exist. Did you mean one of the following? [first_name, age, id]; line 1 pos 24;
'Project [id#2070L AS id#2105L, first_name#2071 AS first_name#2106, age#2072L AS age#2107L, 'concat(first_name#2071,  , 'last_name) AS full_name#2108]
+- Project [_1#2064L AS id#2070L, _2#2065 AS first_name#2071, _3#2066L AS age#2072L]
   +- LogicalRDD [_1#2064L, _2#2065, _3#2066L], false


In [19]:
from pyspark.sql.utils import AnalysisException

# Same append with mergeSchema enabled. It is still expected to fail, because
# the generated full_name expression needs last_name and the DataFrame does
# not provide it. The error is caught and printed so later cells still run
# under "Restart & Run All".
try:
    df.write.option("mergeSchema", "true").mode("append").format("delta").saveAsTable(
        "some_people"
    )
except AnalysisException as error:
    print(f"{type(error).__name__}: {error}")

AnalysisException: Column 'last_name' does not exist. Did you mean one of the following? [first_name, age, id]; line 1 pos 24;
'Project [id#2070L AS id#2126L, first_name#2071 AS first_name#2127, age#2072L AS age#2128L, 'concat(first_name#2071,  , 'last_name) AS full_name#2129]
+- Project [_1#2064L AS id#2070L, _2#2065 AS first_name#2071, _3#2066L AS age#2072L]
   +- LogicalRDD [_1#2064L, _2#2065, _3#2066L], false


## What happens when users supply generated column values?

In [20]:
# Each row supplies its own full_name, and the value does not equal
# first_name + ' ' + last_name.
rows_with_full_name = [
    (21, "Curtis", "Jackson", 47, "50 cent"),
    (22, "Eric", "Wright", None, "easy-e"),
]
df = spark.createDataFrame(
    rows_with_full_name, ["id", "first_name", "last_name", "age", "full_name"]
)

In [21]:
from py4j.protocol import Py4JJavaError

# This append is expected to fail: the supplied full_name values do not match
# concat(first_name, ' ', last_name), so Delta's generated-column CHECK
# constraint rejects the rows. The error is caught and printed so the
# notebook can keep running under "Restart & Run All".
# NOTE(review): Py4JJavaError is the exception type in this notebook's
# recorded output; confirm it is still the type raised by your Spark/Delta
# versions.
try:
    df.write.mode("append").format("delta").saveAsTable("some_people")
except Py4JJavaError as error:
    print(f"{type(error).__name__}: {error}")

23/04/08 14:36:52 ERROR Utils: Aborting task
org.apache.spark.sql.delta.schema.DeltaInvariantViolationException: CHECK constraint Generated Column (full_name <=> concat(first_name, ' ', last_name)) violated by row with values:
 - first_name : Curtis
 - full_name : 50 cent
 - last_name : Jackson
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException$.apply(InvariantViolationException.scala:60)
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException$.apply(InvariantViolationException.scala:70)
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException.apply(InvariantViolationException.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.CheckDeltaInvariant_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.delta.constraints.DeltaInvariantCheckerExec.$anonfun$doExecute$3(DeltaInvariantCheckerExec.scala:80)
	

Py4JJavaError: An error occurred while calling o182.saveAsTable.
: org.apache.spark.sql.delta.schema.DeltaInvariantViolationException: CHECK constraint Generated Column (full_name <=> concat(first_name, ' ', last_name)) violated by row with values:
 - first_name : Curtis
 - full_name : 50 cent
 - last_name : Jackson
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException$.apply(InvariantViolationException.scala:60)
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException$.apply(InvariantViolationException.scala:70)
	at org.apache.spark.sql.delta.schema.DeltaInvariantViolationException.apply(InvariantViolationException.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.CheckDeltaInvariant_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.delta.constraints.DeltaInvariantCheckerExec.$anonfun$doExecute$3(DeltaInvariantCheckerExec.scala:80)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithIterator(FileFormatDataWriter.scala:92)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:331)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1538)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:338)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$21(FileFormatWriter.scala:256)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)


## What happens when generated columns depend on columns with null values?

In [22]:
# One row is missing first_name and the other is missing last_name, so each
# row feeds a null into the full_name expression.
rows_with_null_names = [
    (44, None, "Perkins", 20),
    (55, "Li", None, 30),
]
df = spark.createDataFrame(
    rows_with_null_names, ["id", "first_name", "last_name", "age"]
)

In [23]:
# Append the rows containing null name parts; full_name is again left for
# Delta to compute.
null_rows_writer = df.write.format("delta").mode("append")
null_rows_writer.saveAsTable("some_people")

In [24]:
# Inspect the table after the append to see what full_name holds for the
# rows with a null first_name or last_name.
people_after_nulls = spark.sql("select * from some_people")
people_after_nulls.show()

+---+----------+---------+----+-----------+
| id|first_name|last_name| age|  full_name|
+---+----------+---------+----+-----------+
|  2|       Jim|   Carrey|  61| Jim Carrey|
|  0|       Bob|   Loblaw|  23| Bob Loblaw|
|  1|       Sue|  Grafton|null|Sue Grafton|
| 44|      null|  Perkins|  20|       null|
| 55|        Li|     null|  30|       null|
+---+----------+---------+----+-----------+



In [26]:
# Reproduce the generated-column expression concat(first_name, ' ', last_name)
# with DataFrame functions, including the ' ' separator, to show that concat
# yields null as soon as any input is null.
df.withColumn(
    "whatevs", F.concat(F.col("first_name"), F.lit(" "), F.col("last_name"))
).show()

+---+----------+---------+---+-------+
| id|first_name|last_name|age|whatevs|
+---+----------+---------+---+-------+
| 44|      null|  Perkins| 20|   null|
| 55|        Li|     null| 30|   null|
+---+----------+---------+---+-------+



## Cleanup

In [4]:
# IF EXISTS keeps cleanup safe to run when the table was never created or has
# already been dropped; the unguarded statement raises
# "AnalysisException: Table or view not found" in that situation.
spark.sql("drop table if exists some_people")


In [6]:
import shutil

# Delete the spark-warehouse directory (presumably where Spark stored the
# managed table's files; verify for your configuration). shutil.rmtree works
# without a POSIX shell, and ignore_errors=True mirrors `rm -rf` by not
# failing when the directory is absent.
shutil.rmtree("spark-warehouse", ignore_errors=True)

In [7]:
import shutil

# Delete the local tmp directory if present. shutil.rmtree works without a
# POSIX shell, and ignore_errors=True mirrors `rm -rf` by not failing when
# the directory is absent.
shutil.rmtree("tmp", ignore_errors=True)