## Batch load to Data Lake using C# Spark

A good reference for additional syntax examples: https://github.com/dotnet/spark/blob/master/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs


In [1]:
// from pyspark.sql.functions import col, desc, regexp_replace, substring, to_date, from_json, explode, expr
// from pyspark.sql.types import StructType, StringType
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using static Microsoft.Spark.Sql.Functions;


// Storage locations (ADLS Gen2, abfss scheme) used by the cells in this notebook.
string taxi_zone_path = "abfss://demo@dvtrainingadls.dfs.core.windows.net/nyctaxi/lookups/taxi_zone";
string taxi_rate_path = "abfss://demo@dvtrainingadls.dfs.core.windows.net/nyctaxi/lookups/taxi_rate_code";
string yellow_delta_path = "abfss://demo@dvtrainingadls.dfs.core.windows.net/nyctaxi/tripdata/yellow_delta";

// Pattern handed to ToDate when the pickup/dropoff timestamp strings are parsed.
string date_format = "yyyy-MM-dd HH:mm:ss";

// Explicit schema for the yellow-taxi trip CSV files, so Spark does not have to infer it.
// The two timestamp columns are deliberately read as strings; they are parsed later.
StructType trip_schema = new StructType(new StructField[]
{
    // Trip identification and timing
    new StructField("VendorID", new IntegerType()),
    new StructField("tpep_pickup_datetime", new StringType()),
    new StructField("tpep_dropoff_datetime", new StringType()),

    // Trip details
    new StructField("passenger_count", new IntegerType()),
    new StructField("trip_distance", new DoubleType()),
    new StructField("RatecodeID", new IntegerType()),
    new StructField("store_and_fwd_flag", new StringType()),
    new StructField("PULocationID", new IntegerType()),
    new StructField("DOLocationID", new IntegerType()),

    // Payment breakdown
    new StructField("payment_type", new IntegerType()),
    new StructField("fare_amount", new DoubleType()),
    new StructField("extra", new DoubleType()),
    new StructField("mta_tax", new DoubleType()),
    new StructField("tip_amount", new DoubleType()),
    new StructField("tolls_amount", new DoubleType()),
    new StructField("improvement_surcharge", new DoubleType()),
    new StructField("total_amount", new DoubleType())
});

In [3]:
// Read the taxi zone lookup CSV (first row is the header; column types are inferred by Spark).
DataFrame input_df = spark.Read()
    .Format("csv")
    .Option("inferSchema", "true")
    .Option("header", "true")
    .Load("abfss://demo@dvtrainingadls.dfs.core.windows.net/nyctaxi/lookups/taxi_zone_lookup.csv");

// Rename service_zone so the column name matches the PascalCase name used by the join cell.
DataFrame df = input_df.WithColumnRenamed("service_zone", "ServiceZone");

// Persist the lookup as a Delta table, replacing whatever is already stored at taxi_zone_path.
df.Write()
    .Mode("overwrite")
    .Format("delta")
    .Save(taxi_zone_path);

In [4]:
// Optional cleanup: to delete the trips table before starting, uncomment the line below.
// NOTE(review): the line is written in Python syntax (carried over from the PySpark version)
// and must be adapted to a C# file-system call before it can run in this notebook.
// dbutils.fs.rm(yellow_delta_path,recurse=True)

// Set the number of shuffle partitions for this session.
spark.Conf().Set("spark.sql.shuffle.partitions", "18");

// Read every 2019 yellow-taxi CSV file using the explicit trip_schema.
// To let Spark infer the types instead, drop .Schema(...) and add .Option("inferSchema", "true").
DataFrame input_df = spark.Read()
    .Format("csv")
    .Schema(trip_schema)
    .Option("header", "true")
    .Load("abfss://demo@dvtrainingadls.dfs.core.windows.net/nyctaxi/tripdata/yellow/2019/yellow_tripdata_2019-*");

// Two equivalent ways to add the derived columns: Column expressions (Option A) or
// SQL expressions (Option B). Only one of them is needed.

// Option A: build each derived column as a named expression, then attach them in order.
// year_month: first 7 characters of the pickup timestamp ("yyyy-MM") with "-" replaced by "_".
Column year_month_col = RegexpReplace(Substring(Col("tpep_pickup_datetime"), 1, 7), "-", "_");
// pickup_dt / dropoff_dt: timestamp strings parsed to dates using date_format.
Column pickup_dt_col = ToDate(Col("tpep_pickup_datetime"), date_format);
Column dropoff_dt_col = ToDate(Col("tpep_dropoff_datetime"), date_format);
// tip_pct: tip as a fraction of the total amount.
Column tip_pct_col = Col("tip_amount") / Col("total_amount");

DataFrame transformed_df = input_df
    .WithColumn("year_month", year_month_col)
    .WithColumn("pickup_dt", pickup_dt_col)
    .WithColumn("dropoff_dt", dropoff_dt_col)
    .WithColumn("tip_pct", tip_pct_col);

// Option B: the same derived columns written as SQL expressions.
// var transformed_df = input_df.SelectExpr(
//                   "*",
//                   "replace(left(tpep_pickup_datetime, 7),\"-\",\"_\") as year_month",
//                   $"to_date(tpep_pickup_datetime, \"{date_format}\") as pickup_dt",
//                   $"to_date(tpep_dropoff_datetime, \"{date_format}\") as dropoff_dt",
//                   $"tip_amount/total_amount as tip_pct");

// Load the taxi zone lookup that was saved as a Delta table at taxi_zone_path.
var zone_df = spark.Read().Format("delta").Load(taxi_zone_path);

// Left-join the trips to the zone lookup on the pickup location so that every trip is kept
// even when no matching zone exists. The lookup key (LocationID) is dropped after the join
// because it duplicates PULocationID, and the remaining lookup columns get a "Pickup" prefix.
//
// The lookup's column is spelled "Borough". WithColumnRenamed silently does nothing when the
// source column does not exist, so the previous misspelling ("Burough") left an unprefixed
// "Borough" column in the output instead of renaming it.
var trip_df = transformed_df
     .Join(zone_df, transformed_df["PULocationID"] == zone_df["LocationID"], "left").Drop("LocationID")
     .WithColumnRenamed("Borough", "PickupBorough")
     .WithColumnRenamed("Zone", "PickupZone")
     .WithColumnRenamed("ServiceZone", "PickupServiceZone");

// Write the enriched trips as a Delta table partitioned by year_month, replacing any existing data.
trip_df.Write().Mode("overwrite").PartitionBy("year_month").Format("delta").Save(yellow_delta_path);


## Test read
A simple test read of the Delta-formatted trip data that was just saved to `yellow_delta_path`.


In [2]:
// Load the Delta table at yellow_delta_path and keep only 20 rows for a quick sanity check.
DataFrame test_df = spark.Read()
    .Format("delta")
    .Load(yellow_delta_path)
    .Limit(20);

// Print a handful of columns to confirm the saved data is readable.
test_df
    .Select("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count")
    .Show();

+--------+--------------------+---------------------+---------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|
+--------+--------------------+---------------------+---------------+
|       1| 2019-01-01 00:46:40|  2019-01-01 00:53:20|              1|
|       1| 2019-01-01 00:59:47|  2019-01-01 01:18:59|              1|
|       1| 2019-01-01 00:21:28|  2019-01-01 00:28:37|              1|
|       1| 2019-01-01 00:32:01|  2019-01-01 00:45:39|              1|
|       1| 2019-01-01 00:57:32|  2019-01-01 01:09:32|              2|
|       1| 2019-01-01 00:24:04|  2019-01-01 00:47:06|              2|
|       1| 2019-01-01 00:21:59|  2019-01-01 00:28:24|              1|
|       1| 2019-01-01 00:45:21|  2019-01-01 01:31:05|              1|
|       1| 2019-01-01 00:43:19|  2019-01-01 01:07:42|              1|
|       1| 2019-01-01 00:58:24|  2019-01-01 01:15:18|              1|
|       2| 2019-01-01 00:23:14|  2019-01-01 00:25:40|              1|
|       2| 2019-01-0