Hudi uniform support (#2333)
#### Which Delta project/connector is this regarding?

- [ ] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [x] Other (Uniform)

## Description

- This change adds support for Hudi in UniForm
- The changes were mostly adapted from
[OneTable](https://github.com/onetable-io/onetable), which already has a working
Delta to Hudi conversion

## How was this patch tested?

Some basic tests are added

## Does this PR introduce _any_ user-facing changes?

Yes, this allows users to expose their Delta tables as Hudi tables.
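
For example, the table property shown in the new `hudi/README.md` should also be settable on an existing Delta table; this is only a sketch, and applying it via `ALTER TABLE` (rather than at table creation) is an assumption:

```sql
-- Sketch: enable Hudi UniForm on an existing Delta table.
-- The property name comes from this PR's README; enabling it via ALTER TABLE
-- (instead of at CREATE TABLE time) is assumed to behave like other UniForm formats.
ALTER TABLE delta_table_with_hudi
SET TBLPROPERTIES ('delta.universalFormat.enabledFormats' = 'hudi');
```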
the-other-tim-brown committed Mar 29, 2024
1 parent 9c302b0 commit 9028303
Showing 15 changed files with 1,452 additions and 5 deletions.
53 changes: 52 additions & 1 deletion build.sbt
@@ -484,6 +484,57 @@ lazy val icebergShaded = (project in file("icebergShaded"))
// Make the 'compile' invoke the 'assembly' task to generate the uber jar.
)

lazy val hudi = (project in file("hudi"))
.dependsOn(spark % "compile->compile;test->test;provided->provided")
.settings (
name := "delta-hudi",
commonSettings,
scalaStyleSettings,
releaseSettings,
libraryDependencies ++= Seq(
"org.apache.hudi" % "hudi-java-client" % "0.14.0" % "compile" excludeAll(
ExclusionRule(organization = "org.apache.hadoop"),
ExclusionRule(organization = "org.apache.zookeeper"),
),
"org.apache.spark" %% "spark-avro" % sparkVersion % "test" excludeAll ExclusionRule(organization = "org.apache.hadoop"),
"org.apache.parquet" % "parquet-avro" % "1.12.3" % "compile"
),
assembly / assemblyJarName := s"${name.value}-assembly_${scalaBinaryVersion.value}-${version.value}.jar",
assembly / logLevel := Level.Info,
assembly / test := {},
assembly / assemblyMergeStrategy := {
// Project hudi `dependsOn` spark and accidentally brings it in, along with its
// compile-time dependencies (like delta-storage). We want these excluded from the
// delta-hudi jar.
case PathList("io", "delta", xs @ _*) =>
// - delta-storage will bring in classes: io/delta/storage
// - delta-spark will bring in classes: io/delta/exceptions/, io/delta/implicits,
// io/delta/package, io/delta/sql, io/delta/tables,
MergeStrategy.discard
case PathList("com", "databricks", xs @ _*) =>
// delta-spark will bring in com/databricks/spark/util
MergeStrategy.discard
case PathList("org", "apache", "spark", "sql", "delta", "hudi", xs @ _*) =>
MergeStrategy.first
case PathList("org", "apache", "spark", xs @ _*) =>
MergeStrategy.discard
// Discard `module-info.class` to fix the `different file contents found` error.
// TODO Upgrade SBT to 1.5 which will do this automatically
case "module-info.class" => MergeStrategy.discard
// Discard the unused `parquet.thrift` so that we don't conflict with the file used by the user
case "parquet.thrift" => MergeStrategy.discard
// Hudi metadata writer requires this service file to be present on the classpath
case "META-INF/services/org.apache.hadoop.hbase.regionserver.MetricsRegionServerSourceFactory" => MergeStrategy.first
// Discard the jackson service configs that we don't need. These files are not shaded so
// adding them may conflict with other Jackson versions used by the user.
case PathList("META-INF", "services", xs @ _*) => MergeStrategy.discard
case x =>
MergeStrategy.first
},
// Make the 'compile' invoke the 'assembly' task to generate the uber jar.
Compile / packageBin := assembly.value
)

lazy val hive = (project in file("connectors/hive"))
.dependsOn(standaloneCosmetic)
.settings (
@@ -1120,7 +1171,7 @@ val createTargetClassesDir = taskKey[Unit]("create target classes dir")

// Don't use these groups for any other projects
lazy val sparkGroup = project
.aggregate(spark, contribs, storage, storageS3DynamoDB, iceberg, testDeltaIcebergJar, sharing)
.aggregate(spark, contribs, storage, storageS3DynamoDB, iceberg, testDeltaIcebergJar, sharing, hudi)
.settings(
// crossScalaVersions must be set to Nil on the aggregating project
crossScalaVersions := Nil,
22 changes: 22 additions & 0 deletions hudi/README.md
@@ -0,0 +1,22 @@
# Converting to Hudi with UniForm
## Create a table with Hudi UniForm enabled
Using spark-sql, you can create a table and insert a few records into it. You will need to include the delta-hudi-assembly jar on the classpath with `--jars`.
```
spark-sql --packages io.delta:delta-spark_2.12:3.2.0-SNAPSHOT --jars delta-hudi-assembly_2.12-3.2.0-SNAPSHOT.jar --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"
```
Then you can create a table with Hudi UniForm enabled.
```
CREATE TABLE `delta_table_with_hudi` (col1 INT) USING DELTA TBLPROPERTIES('delta.universalFormat.enabledFormats' = 'hudi') LOCATION '/tmp/delta-table-with-hudi';
```
And insert a record into it.
```
INSERT INTO delta_table_with_hudi VALUES (1);
```
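
After the conversion runs (it may happen asynchronously after the commit), the Hudi metadata is written alongside the Delta log. As a rough sanity check, this being a sketch based on the standard Hudi table layout, the table path should now contain a `.hoodie` directory next to `_delta_log`:
```
ls -a /tmp/delta-table-with-hudi
# expect a .hoodie directory (Hudi metadata) alongside _delta_log (Delta log) and the data files
```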

## Read the table with Hudi
Hudi does not currently support Spark 3.5.x, so you will need to launch a spark-shell with Spark 3.4.x or earlier.
Instructions for launching the spark-shell with Hudi can be found [here](https://hudi.apache.org/docs/quick-start-guide#spark-shellsql).
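For reference, a launch command along these lines should work; this is only a sketch, and the bundle coordinates, versions, and configs are assumptions taken from the Hudi quick start rather than from this PR:
```
# Sketch: spark-shell with the Hudi Spark 3.4 bundle (versions are assumptions)
spark-shell --packages org.apache.hudi:hudi-spark3.4-bundle_2.12:0.14.0 \
  --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer"
```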
After launching the shell, you can read the table by enabling the Hudi metadata table in the reader and loading from the path used in the create table step.
```scala
val df = spark.read.format("hudi").option("hoodie.metadata.enable", "true").load("/tmp/delta-table-with-hudi")
```
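
The resulting DataFrame can then be queried like any other; for example, a minimal check for the record inserted earlier (assuming the table created above):
```scala
// Hudi adds meta columns (e.g. _hoodie_commit_time) alongside the data columns,
// so select just the user column to confirm the row written through Delta.
df.select("col1").show()
```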
