
Commit bbdb61f

neilbest-db and gueniai authored
Add descriptive NamedTransformations to Spark UI (#1223)
* Initial commit

* Add descriptive job group IDs and named transformations

  This makes the Spark UI more developer-friendly when analyzing Overwatch runs. Job group IDs have the form <workspace name>:<OW module name>.

  Any use of `.transform(df => df)` may be replaced with `.transformWithDescription(nt)` after instantiating a `val nt = NamedTransformation(df => df)` as its argument; a minimal usage sketch follows below. This commit contains one such application of the new extension method (see `val jobRunsAppendClusterName` in `WorkflowsTransforms.scala`).

  Some logic in `GoldTransforms` falls through to elements of the special job-run-action form of Job Group ID emitted by the platform, but the impact is minimal relative to the benefit to Overwatch development and troubleshooting. Even so, this form of Job Group ID is still present in the initial Spark events emitted before the OW ETL modules begin to execute.

* Improve TransformationDescriberTest

* Flip transformation names to the beginning of the label for greater visibility in the Spark UI. The `NamedTransformation` type name now appears in the label's second position.

  (cherry picked from commit 2ead752)

* Revert the modified Spark UI Job Group labels

  TODO: enumerate the regressions this would introduce when the labels set by the platform are replaced this way.

---------

Co-authored-by: Guenia <guenia.izquierdo@databricks.com>
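A minimal sketch of the usage described above. The local `SparkSession`, the `keepFoo` transformation, and the input frame are illustrative only; `NamedTransformation` and `transformWithDescription` come from the `TransformationDescriber` utilities added in this commit.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}
import com.databricks.labs.overwatch.utils.TransformationDescriber._

val spark = SparkSession.builder.master("local[*]").getOrCreate()
import spark.implicits._

// Hypothetical transformation; any Dataset[T] => Dataset[U] function works.
val keepFoo = NamedTransformation { (df: DataFrame) => df.select($"foo") }

val in = Seq(("a", "b")).toDF("foo", "bar")

// Equivalent to in.transform(keepFoo.transformation), but also sets the
// Spark job description to "keepFoo: NamedTransformation" and the call
// site, so the step is easy to spot in the Spark UI.
val out = in.transformWithDescription(keepFoo)
```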
1 parent 3055a22 commit bbdb61f

File tree

6 files changed: +207 -25 lines


build.sbt

Lines changed: 2 additions & 1 deletion
@@ -18,6 +18,7 @@ libraryDependencies += "com.databricks" % "dbutils-api_2.12" % "0.0.5" % Provide
 libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.11.595" % Provided
 libraryDependencies += "io.delta" % "delta-core_2.12" % "1.0.0" % Provided
 libraryDependencies += "org.scalaj" %% "scalaj-http" % "2.4.2"
+libraryDependencies += "com.lihaoyi" %% "sourcecode" % "0.4.1"
 
 //libraryDependencies += "org.apache.hive" % "hive-metastore" % "2.3.9"
 
@@ -51,4 +52,4 @@ assemblyMergeStrategy in assembly := {
   case PathList("META-INF", xs @ _*) => MergeStrategy.discard
   case x => MergeStrategy.first
 }
-assembly / assemblyOption := (assembly / assemblyOption).value.copy(includeScala = false)
+assembly / assemblyOption := (assembly / assemblyOption).value.copy(includeScala = false)

src/main/scala/com/databricks/labs/overwatch/pipeline/ETLDefinition.scala

Lines changed: 12 additions & 1 deletion
@@ -27,7 +27,18 @@ class ETLDefinition(
 
     val transformedDF = transforms.foldLeft(verifiedSourceDF) {
       case (df, transform) =>
-        df.transform(transform)
+        /*
+         * reverting Spark UI Job Group labels for now
+         *
+         * TODO: enumerate the regressions this would introduce
+         * when the labels set by the platform are replaced
+         * this way.
+         *   df.sparkSession.sparkContext.setJobGroup(
+         *     s"${module.pipeline.config.workspaceName}:${module.moduleName}",
+         *     transform.toString)
+         */
+
+        df.transform( transform)
     }
     write(transformedDF, module)
   }

src/main/scala/com/databricks/labs/overwatch/pipeline/SilverTransforms.scala

Lines changed: 3 additions & 1 deletion
@@ -12,6 +12,7 @@ import org.apache.spark.sql.{Column, DataFrame}
 
 trait SilverTransforms extends SparkSessionWrapper {
 
+  import TransformationDescriber._
   import spark.implicits._
 
   private val logger: Logger = Logger.getLogger(this.getClass)
@@ -1461,7 +1462,8 @@ trait SilverTransforms extends SparkSessionWrapper {
 
     // caching before structifying
     jobRunsDeriveRunsBase(jobRunsLag30D, etlUntilTime)
-      .transform(jobRunsAppendClusterName(jobRunsLookups))
+      .transformWithDescription(
+        jobRunsAppendClusterName( jobRunsLookups))
       .transform(jobRunsAppendJobMeta(jobRunsLookups))
      .transform(jobRunsStructifyLookupMeta(optimalCacheParts))
      .transform(jobRunsAppendTaskAndClusterDetails)

src/main/scala/com/databricks/labs/overwatch/pipeline/WorkflowsTransforms.scala

Lines changed: 30 additions & 22 deletions
@@ -12,6 +12,7 @@ import org.apache.spark.sql.{Column, DataFrame}
 
 object WorkflowsTransforms extends SparkSessionWrapper {
 
+  import TransformationDescriber._
   import spark.implicits._
 
   /**
@@ -991,31 +992,38 @@ object WorkflowsTransforms extends SparkSessionWrapper {
   }
 
   /**
-   * looks up the cluster_name based on id first from job_status_silver and if not present there fallback to latest
-   * snapshot prior to the run
-   */
-  def jobRunsAppendClusterName(lookups: Map[String, DataFrame])(df: DataFrame): DataFrame = {
-
-    val runsWClusterNames1 = if (lookups.contains("cluster_spec_silver")) {
-      df.toTSDF("timestamp", "organization_id", "clusterId")
-        .lookupWhen(
-          lookups("cluster_spec_silver")
-            .toTSDF("timestamp", "organization_id", "clusterId")
-        ).df
-    } else df
-
-    val runsWClusterNames2 = if (lookups.contains("clusters_snapshot_bronze")) {
-      runsWClusterNames1
-        .toTSDF("timestamp", "organization_id", "clusterId")
-        .lookupWhen(
-          lookups("clusters_snapshot_bronze")
-            .toTSDF("timestamp", "organization_id", "clusterId")
-        ).df
-    } else runsWClusterNames1
+   * Look up the cluster_name based on id first from
+   * `job_status_silver`. If not present there fallback to latest
+   * snapshot prior to the run
+   */
+
+  val jobRunsAppendClusterName = (lookups: Map[String,DataFrame]) => NamedTransformation {
+
+    (df: DataFrame) => {
+
+      val runsWClusterNames1 = if (lookups.contains("cluster_spec_silver")) {
+        df.toTSDF("timestamp", "organization_id", "clusterId")
+          .lookupWhen(
+            lookups("cluster_spec_silver")
+              .toTSDF("timestamp", "organization_id", "clusterId")
+          ).df
+      } else df
+
+      val runsWClusterNames2 = if (lookups.contains("clusters_snapshot_bronze")) {
+        runsWClusterNames1
+          .toTSDF("timestamp", "organization_id", "clusterId")
+          .lookupWhen(
+            lookups("clusters_snapshot_bronze")
+              .toTSDF("timestamp", "organization_id", "clusterId")
+          ).df
+      } else runsWClusterNames1
+
+      runsWClusterNames2
+    }
 
-    runsWClusterNames2
   }
 
+
   /**
    * looks up the job name based on id first from job_status_silver and if not present there fallback to latest
    * snapshot prior to the run
src/main/scala/com/databricks/labs/overwatch/utils/TransformationDescriber.scala

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+package com.databricks.labs.overwatch.utils
+
+import org.apache.spark.sql.Dataset
+
+// TODO: implement this as a `trait`. Initial attempts would not
+// compile because of the dependencies among other `trait`s and
+// `object`s that would have to be refactored.
+
+object TransformationDescriber {
+
+
+  class NamedTransformation[T,U](
+    val transformation: Dataset[T] => Dataset[U])(
+    implicit _name: sourcecode.Name) {
+
+    final val name: String = _name.value
+
+    override def toString = s"${_name.value}: NamedTransformation"
+
+  }
+
+
+  object NamedTransformation {
+
+    def apply[T,U](
+      transformation: Dataset[T] => Dataset[U])(
+      implicit name: sourcecode.Name) =
+      new NamedTransformation( transformation)( name)
+
+  }
+
+
+  implicit class TransformationDescriber[T,U]( ds: Dataset[T]) {
+
+    def transformWithDescription[U](
+      namedTransformation: NamedTransformation[T,U])(
+      implicit
+      // enclosing: sourcecode.Enclosing,
+      name: sourcecode.Name,
+      fileName: sourcecode.FileName,
+      line: sourcecode.Line
+    ): Dataset[U] = {
+
+      // println( s"Inside TransformationDescriber.transformWithDescription: $enclosing")
+
+      val callSite = s"${name.value} at ${fileName.value}:${line.value}"
+
+      val sc = ds.sparkSession.sparkContext
+      sc.setJobDescription( namedTransformation.toString)
+      sc.setCallSite( callSite)
+
+      ds.transform( namedTransformation.transformation)
+
+    }
+
+  }
+
+}
src/test/scala/com/databricks/labs/overwatch/utils/TransformationDescriberTest.scala

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+package com.databricks.labs.overwatch.utils
+
+import com.databricks.labs.overwatch.SparkSessionTestWrapper
+import org.apache.spark.sql.DataFrame
+import org.scalatest.funspec.AnyFunSpec
+import org.scalatest.GivenWhenThen
+import java.io.ByteArrayOutputStream
+
+class TransformationDescriberTest
+    extends AnyFunSpec
+    with GivenWhenThen
+    with SparkSessionTestWrapper {
+
+  import TransformationDescriber._
+  import spark.implicits._
+  spark.conf.set("spark.sql.session.timeZone", "UTC")
+
+  val t = (df: DataFrame) => df.select( $"foo")
+
+  val nt = NamedTransformation( t)
+
+  // TODO: replace use of `s` and `Console.withOut` with an abstraction
+
+  val s = new ByteArrayOutputStream
+
+  describe( "A NamedTransformation") {
+
+    it( "wraps a function literal") {
+
+      info( s"nt.transformation: ${nt.transformation}")
+
+      assert( nt.transformation === t)
+
+    }
+
+    it( "knows its own name") {
+
+      info( s"`nt.name`: ${nt.name}")
+      info( s"`nt.toString`: ${nt.toString}")
+
+      assert( nt.name === "nt")
+      assert( nt.toString === "nt: NamedTransformation")
+
+    }
+
+    Given( "a Spark `Dataset` (including `DataFrame`s)")
+
+    val in = Seq( ("foo", "bar")).toDF( "foo", "bar")
+
+    Console.withOut( s) {
+      in.show(numRows= 1, truncate= 0, vertical= true)
+    }
+    // info( s.toString)
+    s.toString.linesIterator.foreach( info(_))
+    s.reset
+
+    When( "a `NamedTransformation` is applied")
+
+    val out = in.transformWithDescription( nt)
+
+    // val s = new ByteArrayOutputStream
+    Console.withOut( s) {
+      out.show(numRows= 1, truncate= 0, vertical= true)
+    }
+    // info( s.toString)
+    s.toString.linesIterator.foreach( info(_))
+
+
+
+    Then( "the resulting Spark jobs have a matching description (pending)")
+
+    // info( s"""spark.jobGroup.id: ${out.sparkSession.sparkContext.getLocalProperty( "spark.jobGroup.id")}""")
+
+    val sjd = out.sparkSession.sparkContext.getLocalProperty( "spark.job.description")
+
+    info( s"spark.job.description: ${sjd}")
+
+    assert( sjd === "nt: NamedTransformation")
+
+    // info( s"""spark.callSite.short: ${out.sparkSession.sparkContext.getLocalProperty( "spark.callSite.short")}""")
+    // info( s"""spark.callSite.long: ${out.sparkSession.sparkContext.getLocalProperty( "spark.callSite.long")}""")
+
+
+
+
+
+
+    And( "the result of the transformation is correct")
+
+    assertResult( "`foo` STRING") {
+      out.schema.toDDL
+    }
+
+    assertResult( "foo") {
+      out.first.getString(0)
+    }
+
+
+  }
+
+
+}
