
Commit 86ae53b

rahulsmahadev authored and vkorukanti committed
Nullable columns should work when using generated columns
(Cherry-pick of 38945d0)

- There was a bug in the generated columns code `addDefaultExprsOrReturnConstraints` that would not allow null columns in the insert DataFrame to be written, even if the column was nullable.
- Added a unit test.

GitOrigin-RevId: effdb5732e7aeaf0da7fa5e18bc2eda7436ecfbc
1 parent 2ff7cc7 commit 86ae53b
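
To make the fixed behavior concrete, here is a minimal sketch of the scenario this commit addresses. It mirrors the new unit test: the table name tbl, the columns c1/c2/c3, and the use of the DeltaTable builder API are illustrative, and a Spark session with Delta Lake configured is assumed.

import io.delta.tables.DeltaTable
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import scala.collection.JavaConverters._

// Table with a nullable column c2 and a generated column c3 = c1 + 1
// (illustrative names, mirroring the new unit test below).
DeltaTable.create(spark)
  .tableName("tbl")
  .addColumn("c1", "INT")
  .addColumn("c2", "STRING")
  .addColumn(
    DeltaTable.columnBuilder("c3").dataType("INT").generatedAlwaysAs("c1 + 1").build())
  .execute()

// A DataFrame that omits the nullable column c2 entirely.
val df = spark.createDataFrame(
  List(Row(3)).asJava,
  StructType(Seq(StructField("c1", IntegerType, nullable = true))))

// Before this commit the append failed while resolving the absent column c2.
// With the fix (and the new flag left at its default of true) it succeeds,
// and the stored row is (3, NULL, 4).
df.write.format("delta").mode("append").saveAsTable("tbl")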

File tree: 3 files changed (+87, -7 lines)


core/src/main/scala/org/apache/spark/sql/delta/ColumnWithDefaultExprUtils.scala

Lines changed: 13 additions & 5 deletions
@@ -24,7 +24,7 @@ import org.apache.spark.sql.delta.commands.cdc.CDCReader
 import org.apache.spark.sql.delta.constraints.{Constraint, Constraints}
 import org.apache.spark.sql.delta.metering.DeltaLogging
 import org.apache.spark.sql.delta.schema.SchemaUtils
-import org.apache.spark.sql.delta.sources.DeltaSourceUtils
+import org.apache.spark.sql.delta.sources.{DeltaSourceUtils, DeltaSQLConf}

 import org.apache.spark.sql.{Column, DataFrame, Dataset, Encoder}
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
@@ -98,20 +98,28 @@ object ColumnWithDefaultExprUtils extends DeltaLogging {
     lazy val metadataOutputNames = CaseInsensitiveMap(schema.map(f => f.name -> f).toMap)
     val constraints = mutable.ArrayBuffer[Constraint]()
     val track = mutable.Set[String]()
-    var selectExprs = schema.map { f =>
+    var selectExprs = schema.flatMap { f =>
       GeneratedColumn.getGenerationExpression(f) match {
         case Some(expr) =>
           if (topLevelOutputNames.contains(f.name)) {
             val column = SchemaUtils.fieldToColumn(f)
             // Add a constraint to make sure the value provided by the user is the same as the value
             // calculated by the generation expression.
             constraints += Constraints.Check(s"Generated Column", EqualNullSafe(column.expr, expr))
-            column.alias(f.name)
+            Some(column.alias(f.name))
           } else {
-            new Column(expr).alias(f.name)
+            Some(new Column(expr).alias(f.name))
           }
         case None =>
-          SchemaUtils.fieldToColumn(f).alias(f.name)
+          if (topLevelOutputNames.contains(f.name) ||
+              !data.sparkSession.conf.get(DeltaSQLConf.GENERATED_COLUMN_ALLOW_NULLABLE)) {
+            Some(SchemaUtils.fieldToColumn(f).alias(f.name))
+          } else {
+            // We only want to consider columns that are in the data's schema or are generated,
+            // so that a DataFrame with missing nullable columns can still be written.
+            // The actual nullability check on the data is done in DeltaInvariantCheckerExec.
+            None
+          }
       }
     }
     val cdcSelectExprs = CDCReader.CDC_COLUMNS_IN_DATA.flatMap { cdcColumnName =>
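
The essence of the change above is switching selectExprs from map to flatMap: a non-generated column that is absent from the incoming DataFrame (and whose nullable ingest is allowed by the new flag) is simply dropped from the projection instead of becoming an unresolvable column reference. A toy sketch of that pattern, detached from the Delta internals (the Field class, the projection helper, and the example names are purely illustrative):

// Hypothetical stand-in for the real logic: keep a projection entry only for
// fields that are present in the incoming data or that are generated.
case class Field(name: String, generated: Boolean)

def projection(schema: Seq[Field], available: Set[String]): Seq[String] =
  schema.flatMap { f =>
    if (available.contains(f.name)) Some(f.name)         // present in the data
    else if (f.generated) Some(s"<expr for ${f.name}>")  // computed from other columns
    else None                                             // missing nullable column: drop it
  }

// projection(Seq(Field("c1", false), Field("c2", false), Field("c3", true)), Set("c1"))
// returns Seq("c1", "<expr for c3>"): c2 is omitted rather than failing to resolve.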

core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala

Lines changed: 8 additions & 0 deletions
@@ -670,6 +670,14 @@ trait DeltaSQLConfBase {
       .booleanConf
       .createWithDefault(true)

+  val GENERATED_COLUMN_ALLOW_NULLABLE =
+    buildConf("generatedColumn.allowNullableIngest.enabled")
+      .internal()
+      .doc("When enabled this will allow tables with generated columns enabled to be able " +
+        "to write data without providing values for a nullable column via DataFrame.write")
+      .booleanConf
+      .createWithDefault(true)
+
   val DELTA_OPTIMIZE_MIN_FILE_SIZE =
     buildConf("optimize.minFileSize")
       .internal()
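
The new conf defaults to true, so the relaxed behavior is on out of the box. If the stricter pre-fix behavior is needed, the flag can be turned off per session. A sketch, assuming the usual spark.databricks.delta. prefix that DeltaSQLConf adds to its keys:

// Revert to the pre-fix behavior: require every non-generated column to be present.
spark.conf.set("spark.databricks.delta.generatedColumn.allowNullableIngest.enabled", "false")

// Within Delta's own code and tests the key is referenced as
// DeltaSQLConf.GENERATED_COLUMN_ALLOW_NULLABLE.key, as the new suite below does.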

core/src/test/scala/org/apache/spark/sql/delta/GeneratedColumnSuite.scala

Lines changed: 66 additions & 2 deletions
@@ -19,9 +19,12 @@ package org.apache.spark.sql.delta
 // scalastyle:off import.ordering.noEmptyLine
 import java.io.PrintWriter

+import scala.collection.JavaConverters._
+
 import org.apache.spark.sql.delta.commands.cdc.CDCReader
-import org.apache.spark.sql.delta.schema.{InvariantViolationException, SchemaUtils}
+import org.apache.spark.sql.delta.schema.{DeltaInvariantViolationException, InvariantViolationException, SchemaUtils}
 import org.apache.spark.sql.delta.sources.DeltaSourceUtils.GENERATION_EXPRESSION_METADATA_KEY
+import org.apache.spark.sql.delta.sources.DeltaSQLConf
 import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
 import io.delta.tables.DeltaTableBuilder

@@ -35,7 +38,7 @@ import org.apache.spark.sql.functions.{current_timestamp, lit}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.streaming.{StreamingQueryException, Trigger}
 import org.apache.spark.sql.test.SharedSparkSession
-import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, MetadataBuilder, StructField, StructType, TimestampType}
+import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, MetadataBuilder, StringType, StructField, StructType, TimestampType}
 import org.apache.spark.unsafe.types.UTF8String

 trait GeneratedColumnTest extends QueryTest with SharedSparkSession with DeltaSQLCommandTest {
@@ -1672,6 +1675,67 @@ trait GeneratedColumnSuiteBase extends GeneratedColumnTest {
       )
     }
   }
+
+  test("not null should be enforced with generated columns") {
+    withTableName("tbl") { tbl =>
+      createTable(tbl,
+        None, "c1 INT, c2 STRING, c3 INT", Map("c3" -> "c1 + 1"), Seq.empty, Set("c1", "c2", "c3"))
+
+      // Try to write data without c2 in the DataFrame.
+      val schemaWithoutColumnC2 = StructType(
+        Seq(StructField("c1", IntegerType, true)))
+      val data1 = List(Row(3))
+      val df1 = spark.createDataFrame(data1.asJava, schemaWithoutColumnC2)
+
+      val e1 = intercept[DeltaInvariantViolationException] {
+        df1.write.format("delta").mode("append").saveAsTable("tbl")
+      }
+      assert(e1.getMessage.contains("Column c2, which has a NOT NULL constraint," +
+        " is missing from the data being written into the table."))
+    }
+  }
+
+  Seq(true, false).foreach { allowNullInsert =>
+    test("nullable column should work with generated columns - " +
+      "allowNullInsert enabled=" + allowNullInsert) {
+      withTableName("tbl") { tbl =>
+        withSQLConf(DeltaSQLConf.GENERATED_COLUMN_ALLOW_NULLABLE.key -> allowNullInsert.toString) {
+          createTable(
+            tbl, None, "c1 INT, c2 STRING, c3 INT", Map("c3" -> "c1 + 1"), Seq.empty)
+
+          // Create a DataFrame that matches the table's schema.
+          val data1 = List(Row(1, "a1"), Row(2, "a2"))
+          val schema = StructType(
+            Seq(StructField("c1", IntegerType, true), StructField("c2", StringType, true)))
+          val df1 = spark.createDataFrame(data1.asJava, schema)
+          df1.write.format("delta").mode("append").saveAsTable("tbl")
+
+          // Create a DataFrame that does not have c2.
+          val schemaWithoutOptionalColumnC2 = StructType(
+            Seq(StructField("c1", IntegerType, true)))
+
+          val data2 = List(Row(3))
+          val df2 = spark.createDataFrame(data2.asJava, schemaWithoutOptionalColumnC2)
+
+          if (allowNullInsert) {
+            df2.write.format("delta").mode("append").saveAsTable("tbl")
+            // Check correctness: c2 is null and c3 is still generated from c1.
+            val expectedDF = df1
+              .union(df2.withColumn("c2", lit(null).cast(StringType)))
+              .withColumn("c3", 'c1 + 1)
+            checkAnswer(spark.read.table(tbl), expectedDF)
+          } else {
+            // When allow-null insert is not enabled, the write cannot resolve c2.
+            val e = intercept[AnalysisException] {
+              df2.write.format("delta").mode("append").saveAsTable("tbl")
+            }
+            assert(e.getMessage.contains(
+              "A column or function parameter with name `c2` cannot be resolved"))
+          }
+        }
+      }
+    }
+  }
 }

 class GeneratedColumnSuite extends GeneratedColumnSuiteBase
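
For reference, after the allowNullInsert = true writes in the new test, the table holds three rows: (1, "a1", 2), (2, "a2", 3), and (3, NULL, 4). The row written without c2 stores NULL there while c3 is still generated as c1 + 1. A quick way to see this interactively (a sketch, assuming the same tbl table):

// Expected contents of tbl after the allowNullInsert = true branch of the test:
// +---+----+---+
// | c1|  c2| c3|
// +---+----+---+
// |  1|  a1|  2|
// |  2|  a2|  3|
// |  3|null|  4|
// +---+----+---+
spark.read.table("tbl").orderBy("c1").show()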
