# Set up local SparkSession

In [1]:
from datetime import date

from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F

from delta.tables import DeltaTable

In [2]:
# Settings for the local session: fetch the Delta Lake package, register its
# SQL extension and catalog implementation, and enable eager evaluation in the
# REPL/notebook with the displayed row count capped at 10.
spark_settings = {
    "spark.jars.packages": "io.delta:delta-core_2.12:1.2.1",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.repl.eagerEval.maxNumRows": 10,
}

# "local[*]" runs Spark inside this process rather than on a cluster.
session_builder = SparkSession.builder.master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

22/05/20 13:31:09 WARN Utils: Your hostname, HF-LPT-1115A.local resolves to a loopback address: 127.0.0.1; using 10.10.128.183 instead (on interface en0)
22/05/20 13:31:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/guido.kosloffgancedo/Git/pp/jupyter-demo/.venv/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/guido.kosloffgancedo/.ivy2/cache
The jars for the packages stored in: /Users/guido.kosloffgancedo/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ef2e95ba-20ad-4dfa-89c4-9f97f934ef7c;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.2.1 in central
	found io.delta#delta-storage;1.2.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in spark-list
:: resolution report :: resolve 1590ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-core_2.12;1.2.1 from central in [default]
	io.delta#delta-storage;1.2.1 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from spark-list in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       co

# Read flights data from CSV

In [3]:
# Load flights.csv, taking column names from the header line and letting Spark
# infer the column types; the resulting DataFrame is marked for caching.
df = spark.read.csv("flights.csv", header=True, inferSchema=True).cache()

# Leave the DataFrame as the last expression so the notebook displays it.
df

year,month,day,departure_time,arrival_time,carrier,flight_number,aircraft_registration,flight_time,airport_origin,airport_destination,distance,is_cancelled,is_diverted
2008,1,3,1343,1451,WN,588,N240WN,68,HOU,LIT,393,0,0
2008,1,3,1125,1247,WN,1343,N523SW,82,HOU,MAF,441,0,0
2008,1,3,2009,2136,WN,3841,N280WN,87,HOU,MAF,441,0,0
2008,1,3,903,1203,WN,3,N308SA,120,HOU,MCO,848,0,0
2008,1,3,1423,1726,WN,25,N462WN,123,HOU,MCO,848,0,0
2008,1,3,2024,2325,WN,51,N483WN,121,HOU,MCO,848,0,0
2008,1,3,1753,2053,WN,940,N493WN,120,HOU,MCO,848,0,0
2008,1,3,622,935,WN,2621,N266WN,133,HOU,MCO,848,0,0
2008,1,3,1944,2210,WN,389,N266WN,146,HOU,MDW,937,0,0
2008,1,3,1453,1716,WN,519,N514SW,143,HOU,MDW,937,0,0


# Perform some simple aggregations and transformations

In [4]:
# Aggregate the raw flights into one row per (date, origin, destination):
# the number of flights and their average flight_time truncated to an int.
# Cancelled and diverted flights are excluded.
#
# NOTE: this cell rebinds `df` to the aggregated result, which no longer has
# the year/month/day columns, so it cannot be re-run on its own -- re-run the
# CSV read first.
df = (
    df
    # year, month and day are separate columns: join them as "y-m-d" and parse.
    .withColumn("date", F.to_date(F.concat("year", F.lit("-"), "month", F.lit("-"), "day")))
    .where("is_cancelled = 0 and is_diverted = 0")
    .groupBy("date", "airport_origin", "airport_destination")
    # Name the aggregates explicitly instead of renaming Spark's generated
    # "count(1)" / "avg(flight_time)" columns afterwards: withColumnRenamed
    # silently does nothing when the column it is asked to rename is absent.
    .agg(
        F.count("*").alias("count"),
        F.avg("flight_time").cast("int").alias("avg_flight_time"),
    )
)

# Show the groups with the most flights first.
df.orderBy("count", ascending=False)

date,airport_origin,airport_destination,count,avg_flight_time
2008-01-03,LAX,OAK,20,80
2008-01-03,LAS,PHX,18,64
2008-01-03,LAX,LAS,14,63
2008-01-03,LAS,LAX,14,67
2008-01-03,LAS,SAN,14,63
2008-01-03,LAS,BUR,13,62
2008-01-03,LAX,PHX,13,66
2008-01-03,LAX,SJC,13,64
2008-01-03,LAS,RNO,13,74
2008-01-03,LAS,OAK,13,95


# Write to partitioned Delta table

In [5]:
# Save the aggregated DataFrame as the Delta table "flights", partitioned by
# date. "overwrite" mode replaces the table contents if it already exists.
flights_writer = (
    df
    .coalesce(5)  # reduce to at most 5 partitions before writing
    .write
    .format("delta")
    .mode("overwrite")
    .partitionBy("date")
)
flights_writer.saveAsTable("flights")

                                                                                

# Get Delta table

In [6]:
# Obtain a DeltaTable handle for the table named "flights"; the update, delete
# and merge operations in the following cells go through this handle.
flights_table = DeltaTable.forName(sparkSession=spark, tableOrViewName="flights")

# Perform some Delta operations on the table: update, delete

In [9]:
# 1) UPDATE: set avg_flight_time to NULL on every row where it equals 80.
cleared_average = {"avg_flight_time": "null"}
flights_table.update(condition="avg_flight_time = 80", set=cleared_average)

# 2) DELETE: remove every row that starts or ends at LAS.
flights_table.delete(condition="airport_origin = 'LAS' or airport_destination = 'LAS'")

# Display the table after both operations, highest count first.
flights_table.toDF().sort("count", ascending=False)

date,airport_origin,airport_destination,count,avg_flight_time
2008-01-03,LAX,OAK,20,
2008-01-03,LAX,PHX,13,66.0
2008-01-03,LAX,SJC,13,64.0
2008-01-03,MCI,MDW,12,81.0
2008-01-03,MDW,MCI,12,79.0
2008-01-03,HOU,MSY,11,58.0
2008-01-03,MHT,BWI,11,79.0
2008-01-03,LAX,SMF,10,71.0
2008-01-03,MDW,STL,10,59.0
2008-01-03,HRL,HOU,9,54.0


# Perform some Delta operations on the table: merge

In [10]:
# Rows to merge into the table: one for a route/date the flights data already
# contains (LAX -> OAK on 2008-01-03) and one for a new route/date.
incoming_rows = [
    dict(date=date(2008, 1, 3), airport_origin="LAX", airport_destination="OAK", count=21, avg_flight_time=83),
    dict(date=date(2008, 1, 4), airport_origin="JFK", airport_destination="EZE", count=22, avg_flight_time=660),
]
incoming_df = spark.createDataFrame(incoming_rows)

# Columns that identify a row, and the columns overwritten when a row matches.
key_columns = ["date", "airport_origin", "airport_destination"]
metric_columns = ["count", "avg_flight_time"]

# "existing.<col> = incoming.<col>" for every key column, joined with AND.
match_condition = " AND ".join(f"existing.{name} = incoming.{name}" for name in key_columns)
# Target column -> source expression maps for the two merge clauses.
update_assignments = {name: f"incoming.{name}" for name in metric_columns}
insert_values = {name: f"incoming.{name}" for name in key_columns + metric_columns}

# Upsert: matched rows get the incoming metrics, unmatched rows are inserted.
merge_builder = flights_table.alias("existing").merge(incoming_df.alias("incoming"), match_condition)
merge_builder = merge_builder.whenMatchedUpdate(set=update_assignments)
merge_builder = merge_builder.whenNotMatchedInsert(values=insert_values)
merge_builder.execute()

# Display the merged table, highest count first.
flights_table.toDF().orderBy("count", ascending=False)

                                                                                

date,airport_origin,airport_destination,count,avg_flight_time
2008-01-04,JFK,EZE,22,660
2008-01-03,LAX,OAK,21,83
2008-01-03,LAX,PHX,13,66
2008-01-03,LAX,SJC,13,64
2008-01-03,MCI,MDW,12,81
2008-01-03,MDW,MCI,12,79
2008-01-03,MHT,BWI,11,79
2008-01-03,HOU,MSY,11,58
2008-01-03,MDW,STL,10,59
2008-01-03,LAX,SMF,10,71
