In [1]:
import org.apache.spark.sql.SparkSession
import sys.env

val spark = SparkSession.builder()
    .appName("MyAppScala")
    .master("spark://192.168.0.144:7077")
    .config("spark.hadoop.hive.metastore.uris", "thrift://192.168.0.144:9083")
    .config("spark.hadoop.javax.jdo.option.ConnectionURL", "jdbc:mysql://192.168.0.144:3306/metastore_db")
    .config("spark.hadoop.javax.jdo.option.ConnectionDriverName", "com.mysql.cj.jdbc.Driver")
    .config("spark.hadoop.javax.jdo.option.ConnectionUserName", "lh")
    .config("spark.hadoop.javax.jdo.option.ConnectionPassword", env.getOrElse("MYSQL", "Default_Value"))
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.HDFSLogStore")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars", "/usr/local/spark/jars/delta_2.12-3.2.0.jar,/usr/local/spark/jars/delta-storage-3.2.0.jar,/usr/local/spark/jars/delta-spark_2.12-3.2.0.jar")    
    .config("spark.executor.memory", "9g")
    .config("spark.executor.cores", "3")
    .config("spark.driver.memory", "19g")
    .config("spark.driver.maxResultSize", "2g") 
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.shuffle.partitions", "1000")
    .config("spark.hadoop.fs.defaultFS", "hdfs://192.168.0.144:9000")
    .config("spark.databricks.delta.clusteredTable.enableClusteringTablePreview", "true")
    .enableHiveSupport()
    .getOrCreate()


spark = org.apache.spark.sql.SparkSession@50e6aa69


org.apache.spark.sql.SparkSession@50e6aa69

In [3]:
spark.stop()

In [3]:
spark.sql("USE zorder")
val tablesDf = spark.sql("SHOW TABLES").show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|   zorder|        campaign_dim|      false|
|   zorder|        customer_dim|      false|
|   zorder|      department_dim|      false|
|   zorder|        location_dim|      false|
|   zorder|         product_dim|      false|
|   zorder|zorder_eventid_ac...|      false|
|   zorder|zorder_eventid_ac...|      false|
|   zorder|zorder_eventid_ac...|      false|
|   zorder|zorder_eventid_ta...|      false|
|   zorder|zorder_eventid_ta...|      false|
|   zorder|zorder_eventid_ta...|      false|
+---------+--------------------+-----------+



tablesDf = ()


()

In [25]:
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ListBuffer
import scala.util.{Try, Success, Failure}


spark.sql("USE zorder")

val tables = Seq("zorder_eventid_table_2")
val numExecutions = 10

def measureExecutionTime(query: String): Double = {
  spark.catalog.clearCache()
  val startTime = System.nanoTime()
  spark.sql(query).show()
  val endTime = System.nanoTime()
  (endTime - startTime) / 1e9
}

val executionTimes = tables.map(table => table -> ListBuffer[Double]()).toMap

for (table <- tables) {
  for (_ <- 1 to numExecutions) {
    val query =
      s"""
        |SELECT
        |    product_dim.product_name,
        |    location_dim.city,
        |    location_dim.state,
        |    location_dim.country,
        |    SUM($table.value) AS total_value,
        |    COUNT($table.event_id) AS event_count
        |FROM
        |    $table
        |JOIN
        |    product_dim ON $table.product_id = product_dim.product_id
        |JOIN
        |    location_dim ON $table.location_id = location_dim.location_id
        |WHERE
        |    $table.event_id BETWEEN 459999 AND 999999
        |    AND $table.actor_id IN (5001, 5002, 5003)
        |GROUP BY
        |    product_dim.product_name,
        |    location_dim.city,
        |    location_dim.state,
        |    location_dim.country
        |ORDER BY
        |    total_value DESC;
      """.stripMargin

    val executionTime = Try(measureExecutionTime(query)) match {
      case Success(time) => 
        println(s"$table: $time seconds")
        executionTimes(table) += time
      case Failure(e) =>
        println(s"Error for table $table: ${e.getMessage}")
    }
  }
}

val averageTimes = executionTimes.mapValues(times => times.sum / numExecutions)

import spark.implicits._
val df = averageTimes.toSeq.toDF("Table", "Average Execution Time")
df.show()


+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546515|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393486|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983132|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883303|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609752|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333423|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552868|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725005|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952989|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536646|      31985|
+------------+------------+-----+-------+----------

tables: Seq[String] = List(zorder_eventid_table_2)
numExecutions: Int = 10
measureExecutionTime: (query: String)Double
executionTimes: scala.collection.immutable.Map[String,scala.collection.mutable.ListBuffer[Double]] = Map(zorder_eventid_table_2 -> ListBuffer(23.022801957, 22.734479329, 22.798117126, 22.484545177, 23.115967342, 22.956612631, 23.741194325, 23.222408678, 22.326311432, 22.240243592))
averageTimes: scala.collection.immutable.Map[String,Double] = Map(zorder_eventid_table_2 -> 22.8642681589)
df: org.apache.spark.sql.DataFrame = [Table: string, Average Execution Time: double]


In [26]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import scala.collection.mutable.ListBuffer
import scala.util.{Try, Success, Failure}



spark.sql("USE zorder")

val tables = Seq("zorder_eventid_table_2")
val numExecutions = 10

def measureExecutionTime(df: org.apache.spark.sql.DataFrame): Double = {
  spark.catalog.clearCache()
  val startTime = System.nanoTime()
  df.show()
  val endTime = System.nanoTime()
  (endTime - startTime) / 1e9
}

val executionTimes = tables.map(table => table -> ListBuffer[Double]()).toMap

for (table <- tables) {
  val df = spark.table(table)
    .join(spark.table("product_dim"), col(s"$table.product_id") === col("product_dim.product_id"))
    .join(spark.table("location_dim"), col(s"$table.location_id") === col("location_dim.location_id"))
    .filter(col(s"$table.event_id").between(459999, 999999) && col(s"$table.actor_id").isin(5001, 5002, 5003))
    .groupBy("product_dim.product_name", "location_dim.city", "location_dim.state", "location_dim.country")
    .agg(
      sum(col(s"$table.value")).alias("total_value"),
      count(col(s"$table.event_id")).alias("event_count")
    )
    .orderBy(desc("total_value"))


  for (_ <- 1 to numExecutions) {
    Try(measureExecutionTime(df)) match {
      case Success(time) => 
        println(s"$table: $time seconds")
        executionTimes(table) += time
      case Failure(e) =>
        println(s"Error for table $table: ${e.getMessage}")
    }
  }
}

val averageTimes = executionTimes.mapValues(times => times.sum / numExecutions)

import spark.implicits._
val df = averageTimes.toSeq.toDF("Table", "Average Execution Time")


+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546513|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393485|      32485|
|   Product B| Los Angeles|   CA|    USA| 16260.50695098313|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883309|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609754|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333418|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552868|      32367|
|   Product J|    San Jose|   CA|    USA| 16205.66811772501|      32434|
|   Product D|     Houston|   TX|    USA| 16115.73627295299|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536652|      31985|
+------------+------------+-----+-------+----------

tables: Seq[String] = List(zorder_eventid_table_2)
numExecutions: Int = 10
measureExecutionTime: (df: org.apache.spark.sql.DataFrame)Double
executionTimes: scala.collection.immutable.Map[String,scala.collection.mutable.ListBuffer[Double]] = Map(zorder_eventid_table_2 -> ListBuffer(21.751158897, 23.635414218, 22.433264602, 22.483855331, 22.216420412, 22.870801321, 23.12979216, 22.197630428, 22.480985652, 22.802940655))
averageTimes: scala.collection.immutable.Map[String,Double] = Map(zorder_eventid_table_2 -> 22.600226367599998)
df: org.apache.spark.sql.DataFrame = [Table: string, Average Execution T...


In [2]:
import org.apache.spark.sql.functions._


val numRows = 1000000
val numPartitions = 10

val df = spark.range(numRows).repartition(numPartitions)
  .withColumn("value", col("id") * 2)

val startTime = System.nanoTime()

val result = df
  .filter(col("value") % 2 === 0)
  .groupBy("value")
  .count()
  .collect()

df.show()

val endTime = System.nanoTime()
val duration = (endTime - startTime) / 1e9 

println(s"Scala execution time: $duration seconds")


+-----+------+
|   id| value|
+-----+------+
|29077| 58154|
|30650| 61300|
|27330| 54660|
|38002| 76004|
|12380| 24760|
| 9533| 19066|
|45961| 91922|
|58194|116388|
|62814|125628|
|28900| 57800|
|38297| 76594|
| 2469|  4938|
|15480| 30960|
|56232|112464|
|29772| 59544|
|63957|127914|
|63725|127450|
|  652|  1304|
|51374|102748|
|34207| 68414|
+-----+------+
only showing top 20 rows

Scala execution time: 11.861760716 seconds


numRows = 1000000
numPartitions = 10
df = [id: bigint, value: bigint]
startTime = 64579970230837
result = Array([92620,1], [90398,1], [123688,1], [38108,1], [54184,1], [13518,1], [60882,1], [699492,1], [783848,1], [749836,1], [797856,1], [725018,1], [727744,1], [715112,1], [1400632,1], [1381588,1], [1387108,1], [1391936,1], [1335392,1], [1333766,1], [1394200,1], [595768,1], [538090,1], [618626,1], [617410,1], [659334,1], [604110,1], [559532,1], [590390,1], [1266854,1], [1297432,1], [1255270,1], [1274588,1], [1284128,1], [1281372,1], [1283356,1], [1294130,1], [1261808,1], [1288982,1], [269650,1], [327812,1], [392944,1], [328380,1], [287904,1], [342820,1], [319112,1], [...


Array([92620,1], [90398,1], [123688,1], [38108,1], [54184,1], [13518,1], [60882,1], [699492,1], [783848,1], [749836,1], [797856,1], [725018,1], [727744,1], [715112,1], [1400632,1], [1381588,1], [1387108,1], [1391936,1], [1335392,1], [1333766,1], [1394200,1], [595768,1], [538090,1], [618626,1], [617410,1], [659334,1], [604110,1], [559532,1], [590390,1], [1266854,1], [1297432,1], [1255270,1], [1274588,1], [1284128,1], [1281372,1], [1283356,1], [1294130,1], [1261808,1], [1288982,1], [269650,1], [327812,1], [392944,1], [328380,1], [287904,1], [342820,1], [319112,1], [...