## Scala Spark: Load to Data Lake

In [8]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}

import spark.implicits._

// --- Source locations -------------------------------------------------------
// Yellow-taxi trip files for pickup year 2018, every month partition.
val yellowSourcePath = "wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/yellow/puYear=2018/puMonth=*/*.parquet"
// Raw CSV lookup of taxi zones.
val taxiZoneSourcePath = "abfss://demo@datakickstartadls.dfs.core.windows.net/nyctaxi/lookups/taxi_zone_lookup.csv"

// --- Data lake destinations -------------------------------------------------
val taxiZonePath = "abfss://demo@datakickstartadls.dfs.core.windows.net/nyctaxi/lookups/taxi_zone"
val taxiRatePath = "abfss://demo@datakickstartadls.dfs.core.windows.net/nyctaxi/lookups/taxi_rate_code"
val yellowDeltaPath = "abfss://demo@datakickstartadls.dfs.core.windows.net/nyctaxi/tripdata/yellow_delta"

// Pattern handed to to_date when deriving the pickup/dropoff date columns.
val dateFormat = "yyyy-MM-dd HH:mm:ss"

// Explicit schema for the taxi zone lookup CSV. This is one of several ways to
// define a schema; StructType.add creates nullable fields, the same default
// StructField(name, dataType) uses.
val taxiZoneSchema = {
    new StructType()
        .add("LocationID", IntegerType)
        .add("Borough", StringType)
        .add("Zone", StringType)
        .add("ServiceZone", StringType)
}

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import spark.implicits._
yellowSourcePath: String = wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/yellow/puYear=2018/puMonth=*/*.parquet
taxiZoneSourcePath: String = abfss://demo@datakickstartadls.dfs.core.windows.net/nyctaxi/lookups/taxi_zone_lookup.csv
taxiZonePath: String = abfss://demo@datakickstartadls.dfs.core.windows.net/nyctaxi/lookups/taxi_zone
taxiRatePath: String = abfss://demo@datakickstartadls.dfs.core.windows.net/nyctaxi/lookups/taxi_rate_code
yellowDeltaPath: String = abfss://demo@datakickstartadls.dfs.core.windows.net/nyctaxi/tripdata/yellow_delta
dateFormat: String = yyyy-MM-dd HH:mm:ss
taxiZoneSchema: org.apache.spark.sql.types.StructType = StructType(StructField(LocationID,IntegerType,true), StructField(Borough,StringType,true), StructField(Zone,StringType,true), StructField(ServiceZone,StringType,true))

In [10]:
// Load the taxi zone lookup CSV with the explicit schema; the file's first row is a header.
val zoneDF = {
    spark.read
        .schema(taxiZoneSchema)
        .option("header", "true")
        .csv(taxiZoneSourcePath)
}

// Persist the lookup in Delta format, replacing whatever was saved there before.
zoneDF.write.mode("overwrite").format("delta").save(taxiZonePath)

// Preview the loaded rows.
zoneDF.show()

zoneDF: org.apache.spark.sql.DataFrame = [LocationID: int, Borough: string ... 2 more fields]
+----------+-------------+--------------------+-----------+
|LocationID|      Borough|                Zone|ServiceZone|
+----------+-------------+--------------------+-----------+
|         1|          EWR|      Newark Airport|        EWR|
|         2|       Queens|         Jamaica Bay|  Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|  Boro Zone|
|         4|    Manhattan|       Alphabet City|Yellow Zone|
|         5|Staten Island|       Arden Heights|  Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|  Boro Zone|
|         7|       Queens|             Astoria|  Boro Zone|
|         8|       Queens|        Astoria Park|  Boro Zone|
|         9|       Queens|          Auburndale|  Boro Zone|
|        10|       Queens|        Baisley Park|  Boro Zone|
|        11|     Brooklyn|          Bath Beach|  Boro Zone|
|        12|    Manhattan|        Battery Park|Yellow Zone|
|     

In [14]:
// Read the yellow-taxi trip parquet files matched by yellowSourcePath.
val inputDF = spark.read.parquet(yellowSourcePath)

// Take your pick on how to transform, withColumn or SQL Expressions. Only one of these is needed.

// Option A (Column API; the functions take Column arguments and String patterns in Scala)
// val transformedDF = {
//     inputDF
//      .withColumn("yearMonth", regexp_replace(substring(col("tpepPickupDatetime"), 1, 7), "-", "_"))
//      .withColumn("pickupDt", to_date(col("tpepPickupDatetime"), dateFormat))
//      .withColumn("dropoffDt", to_date(col("tpepDropoffDatetime"), dateFormat))
//      .withColumn("tipPct", col("tipAmount") / col("totalAmount"))
// }

// Option B (SQL expressions): keep every source column and add four derived ones.
//   yearMonth - first 7 characters of the pickup timestamp with '-' replaced by '_';
//               used below as the partition column
//   pickupDt  - pickup timestamp converted with to_date
//   dropoffDt - dropoff timestamp converted with to_date
//   tipPct    - tipAmount divided by totalAmount
val transformedDF = inputDF.selectExpr(
                  "*",
                  "replace(left(tpepPickupDatetime, 7),'-','_') as yearMonth",
                  s"to_date(tpepPickupDatetime, '$dateFormat') as pickupDt",
                  s"to_date(tpepDropoffDatetime, '$dateFormat') as dropoffDt",
                  "tipAmount/totalAmount as tipPct")

// Re-read the zone lookup from the Delta copy rather than the raw CSV.
val zoneDF = spark.read.format("delta").load(taxiZonePath)

// Join to bring in Taxi Zone data for the pickup location.
// Left join keeps trips whose pickup location has no match in the lookup.
// The lookup key is dropped after the join, and the remaining lookup columns
// get a "Pickup" prefix. withColumnRenamed silently does nothing when the
// source column does not exist, so the name must match the schema exactly
// ("Borough").
val tripDF = {
    transformedDF.as("t")
        .join(zoneDF.as("z"), expr("t.PULocationID == z.LocationID"), joinType = "left")
        .drop("LocationID")
        .withColumnRenamed("Borough", "PickupBorough")
        .withColumnRenamed("Zone", "PickupZone")
        .withColumnRenamed("ServiceZone", "PickupServiceZone")
}

// Save as Delta, one partition directory per yearMonth value.
// NOTE(review): if a table already exists at yellowDeltaPath with a different
// column set (e.g. an un-renamed "Borough" column), this overwrite may be rejected
// by Delta schema enforcement unless .option("overwriteSchema", "true") is added - confirm.
tripDF.write.mode("overwrite").partitionBy("yearMonth").format("delta").save(yellowDeltaPath)

inputDF: org.apache.spark.sql.DataFrame = [vendorID: string, tpepPickupDateTime: timestamp ... 19 more fields]
transformedDF: org.apache.spark.sql.DataFrame = [vendorID: string, tpepPickupDateTime: timestamp ... 23 more fields]
zoneDF: org.apache.spark.sql.DataFrame = [LocationID: int, Borough: string ... 2 more fields]
tripDF: org.apache.spark.sql.DataFrame = [vendorID: string, tpepPickupDateTime: timestamp ... 26 more fields]

## Test read
Simple test read of the Delta-formatted data that was just saved.

In [16]:
// Read back the Delta data at yellowDeltaPath, keeping only 20 rows for a quick check.
val testDF = {
    spark.read
        .format("delta")
        .load(yellowDeltaPath)
        .limit(20)
}

// Display a handful of columns from those rows.
testDF
    .select("VendorID", "tpepPickupDatetime", "tpepDropoffDatetime", "passengerCount")
    .show()

testDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [vendorID: string, tpepPickupDateTime: timestamp ... 26 more fields]
+--------+-------------------+-------------------+--------------+
|VendorID| tpepPickupDatetime|tpepDropoffDatetime|passengerCount|
+--------+-------------------+-------------------+--------------+
|       2|2018-03-06 09:12:00|2018-03-06 09:29:46|             1|
|       2|2018-03-24 17:42:04|2018-03-25 01:04:03|             4|
|       2|2018-03-31 05:57:24|2018-03-31 06:19:42|             1|
|       2|2018-03-02 10:34:37|2018-03-02 10:39:59|             1|
|       2|2018-03-05 22:26:30|2018-03-05 22:42:32|             1|
|       2|2018-03-02 00:37:08|2018-03-02 00:45:17|             1|
|       2|2018-03-31 11:21:32|2018-03-31 11:29:36|             1|
|       1|2018-03-02 03:27:16|2018-03-02 03:29:38|             1|
|       2|2018-03-05 20:19:56|2018-03-05 20:26:06|             2|
|       1|2018-03-02 05:43:37|2018-03-02 05:48:21|             1|
|       