In [1]:
// Evaluate the kernel-provided SparkSession handle so the notebook echoes its value.
spark

Initializing Scala interpreter ...

Spark Web UI available at http://12f90e88fb0f:4041
SparkContext available as 'sc' (version = 3.5.5, master = local[*], app id = local-1751376125011)
SparkSession available as 'spark'


res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@688a87ad


In [52]:
import org.apache.spark.sql.SparkSession

// Session handle used by the rest of the notebook. getOrCreate() returns the session that
// is already running when there is one, rather than building a second one.
val session: SparkSession = SparkSession.builder().appName("DatasetAPI").getOrCreate()

/**
 * One event record.
 *
 * The two id fields are wrapped in `Option` so that a missing (NULL) value is represented
 * as `None` instead of a null reference; the remaining fields are plain strings.
 */
case class Event(
  user_id: Option[Integer],   // None when the user id is missing
  device_id: Option[Integer], // None when the device id is missing
  referrer: String,
  host: String,
  url: String,
  event_time: String
)

import org.apache.spark.sql.SparkSession
session: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@688a87ad
defined class Event


In [51]:
// Hand-built sample: a one-element list holding a single fully populated Event.
val dummyData: List[Event] =
  Event(
    user_id = Some(1),
    device_id = Some(2),
    referrer = "LinkedIn",
    host = "eczachly.com",
    url = "/signup",
    event_time = "2023-01-01"
  ) :: Nil

dummyData: List[Event] = List(Event(Some(1),Some(2),LinkedIn,eczachly.com,/signup,2023-01-01))


In [10]:
/**
 * One device record: its id plus the browser, operating-system and device-type labels.
 */
case class Device(
  device_id: Integer,
  browser_type: String,
  os_type: String,
  device_type: String
)

defined class Device


In [11]:
/**
 * Flattened result of joining an event with its device: the event's fields and the
 * device's descriptive fields side by side in a single record.
 */
case class EventWithDevice(
  // identifiers
  user_id: Integer,
  device_id: Integer,
  // device attributes
  browser_type: String,
  os_type: String,
  device_type: String,
  // event attributes
  referrer: String,
  host: String,
  url: String,
  event_time: String
)

defined class EventWithDevice


In [54]:
import org.apache.spark.sql.Dataset

import session.implicits._

// Read the events CSV (header row, inferred column types) and bind every row to the Event
// case class straight away, so later steps work with typed fields and Options rather than
// untyped rows.
val events: Dataset[Event] = session.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/events.csv")
  .as[Event]

// Read the devices CSV with the same reader settings and bind each row to Device.
val devices: Dataset[Device] = session.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/home/iceberg/data/devices.csv")
  .as[Device]

// Register both datasets as temp views so the Spark SQL query below can refer to them by name.
events.createOrReplaceTempView("events")
devices.createOrReplaceTempView("devices")

// The same filter -- keep only rows where both ids are present -- written three ways.
// For a transformation this simple the three read almost the same; the Dataset version
// additionally works against the typed Event fields.

// 1) Dataset API: an ordinary Scala predicate over Event
val filteredViaDataset = events.filter((evt: Event) => evt.user_id.nonEmpty && evt.device_id.nonEmpty)

// 2) DataFrame API: column expressions
val filteredViaDataFrame = events.toDF().filter($"user_id".isNotNull and $"device_id".isNotNull)

// 3) Spark SQL: a query against the "events" temp view
val filteredViaSparkSQL = session.sql("select * from events where user_id is not null and device_id is not null")


import org.apache.spark.sql.Dataset
import session.implicits._
events: org.apache.spark.sql.Dataset[Event] = [user_id: int, device_id: int ... 4 more fields]
devices: org.apache.spark.sql.Dataset[Device] = [device_id: int, browser_type: string ... 2 more fields]
filteredViaDataset: org.apache.spark.sql.Dataset[Event] = [user_id: int, device_id: int ... 4 more fields]
filteredViaDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: int, device_id: int ... 4 more fields]
filteredViaSparkSQL: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 4 more fields]


In [55]:
// Typed join: pair every filtered event with its device, then flatten each
// (Event, Device) pair into a single EventWithDevice row with an upper-cased browser type.
val combinedViaDataset = filteredViaDataset
  .joinWith(
    devices,
    // Build the condition from the dataset that is actually being joined.
    filteredViaDataset("device_id") === devices("device_id"),
    "inner")
  .flatMap { case (event: Event, device: Device) =>
    // user_id is an Option: a row without one yields no output instead of failing on .get.
    // The upstream filter already removes such rows, so this only matters if that filter
    // is ever changed.
    event.user_id.toList.map { userId =>
      EventWithDevice(
        user_id = userId,
        device_id = device.device_id,
        // Upper-case the browser type; going through Option keeps a missing value as null
        // instead of throwing a NullPointerException.
        browser_type = Option(device.browser_type).map(_.toUpperCase).orNull,
        os_type = device.os_type,
        device_type = device.device_type,
        referrer = event.referrer,
        host = event.host,
        url = event.url,
        event_time = event.event_time
      )
    }
  }

combinedViaDataset: org.apache.spark.sql.Dataset[EventWithDevice] = [user_id: int, device_id: int ... 7 more fields]


In [56]:
// DataFrame version of the join. Columns are addressed by name through the "e"/"d"
// aliases, so there is no static typing (and less editor assistance) compared with the
// Dataset version.
val combinedViaDataFrame = {
  val eventSide = filteredViaDataFrame.as("e")
  val deviceSide = devices.as("d")

  eventSide
    .join(deviceSide, $"e.device_id" === $"d.device_id", "inner")
    .select(
      "e.user_id",
      "d.device_id",
      "d.browser_type",
      "d.os_type",
      "d.device_type",
      "e.referrer",
      "e.host",
      "e.url",
      "e.event_time"
    )
}

combinedViaDataFrame: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 7 more fields]


In [57]:
// Creating temp views is a good strategy when leveraging Spark SQL: registering the
// filtered result lets the join query below refer to it by name.
filteredViaSparkSQL.createOrReplaceTempView("filtered_events")

// Plain triple-quoted string (no f-interpolator): nothing is interpolated here, and with
// f"..." any '%' later added to the SQL (e.g. in a LIKE pattern) would be read as a format
// directive. Uses `session`, the same handle the other cells in this notebook use.
val combinedViaSparkSQL = session.sql("""
    select
        e.user_id,
        d.device_id,
        d.browser_type,
        d.os_type,
        d.device_type,
        e.referrer,
        e.host,
        e.url,
        e.event_time
    from filtered_events e
    join devices d
    on e.device_id = d.device_id
""")

combinedViaSparkSQL: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 7 more fields]


In [49]:
// Fetch five joined rows as a local array and print them one per line.
val rows = combinedViaDataset.take(5)
for (row <- rows) println(row)

// Preview the first five rows produced by each of the three approaches, in the same order
// the approaches were defined.
combinedViaDataset.show(numRows = 5)

combinedViaDataFrame.show(numRows = 5)

combinedViaSparkSQL.show(numRows = 5)

EventWithDevice(1037710827,532630305,OTHER,Other,Other,null,www.zachwilson.tech,/,2021-03-08 17:27:24.241)
EventWithDevice(925588856,532630305,OTHER,Other,Other,null,www.eczachly.com,/,2021-05-10 11:26:21.247)
EventWithDevice(-1180485268,532630305,OTHER,Other,Other,null,admin.zachwilson.tech,/,2021-02-17 16:19:30.738)
EventWithDevice(-1044833855,532630305,OTHER,Other,Other,null,www.zachwilson.tech,/,2021-09-24 15:53:14.466)
EventWithDevice(747494706,532630305,OTHER,Other,Other,null,www.zachwilson.tech,/,2021-09-26 16:03:17.535)
+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+
|    user_id|device_id|browser_type|os_type|device_type|referrer|                host|url|          event_time|
+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+
| 1037710827|532630305|       OTHER|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-03-08 17:27:...|
|  925588856|53263

rows: Array[EventWithDevice] = Array(EventWithDevice(1037710827,532630305,OTHER,Other,Other,null,www.zachwilson.tech,/,2021-03-08 17:27:24.241), EventWithDevice(925588856,532630305,OTHER,Other,Other,null,www.eczachly.com,/,2021-05-10 11:26:21.247), EventWithDevice(-1180485268,532630305,OTHER,Other,Other,null,admin.zachwilson.tech,/,2021-02-17 16:19:30.738), EventWithDevice(-1044833855,532630305,OTHER,Other,Other,null,www.zachwilson.tech,/,2021-09-24 15:53:14.466), EventWithDevice(747494706,532630305,OTHER,Other,Other,null,www.zachwilson.tech,/,2021-09-26 16:03:17.535))
