# Set up local SparkSession

In [4]:
from datetime import date

from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F

from delta.tables import DeltaTable

In [5]:
# Settings applied to the session builder: the Delta Lake package together
# with its SQL extension and catalog, plus REPL eager evaluation (capped at
# 10 rows) so that a bare DataFrame expression renders as a table.
session_settings = {
    "spark.jars.packages": "io.delta:delta-core_2.12:1.2.1",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.repl.eagerEval.maxNumRows": 10,
}

# Local master; each setting is applied in the order listed above.
session_builder = SparkSession.builder.master("local[*]")
for setting_name, setting_value in session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Read weather data from CSV

In [6]:
# Load weather.csv, taking column names from the header row and letting
# Spark infer the column types; the resulting DataFrame is marked for caching.
df = spark.read.csv("weather.csv", header=True, inferSchema=True).cache()

# Perform some simple transformations

In [7]:
# Parse the "date" column using the M/d/yyyy pattern, then keep only the
# columns of interest and derive the year/month columns from the parsed date.
parsed_date = F.to_date(F.col("date"), "M/d/yyyy")

df = df.select(
    parsed_date.alias("date"),
    "temperature_mean_f",
    "wind_speed_mean_mph",
    "cloud_cover",
    F.year(parsed_date).alias("year"),
    F.month(parsed_date).alias("month"),
)
df

date,temperature_mean_f,wind_speed_mean_mph,cloud_cover,year,month
2014-01-01,29,9,5,2014,1
2014-01-02,26,18,8,2014,1
2014-01-03,14,20,3,2014,1
2014-01-04,17,7,3,2014,1
2014-01-05,36,7,7,2014,1
2014-01-06,37,21,7,2014,1
2014-01-07,13,23,1,2014,1
2014-01-08,17,13,3,2014,1
2014-01-09,28,7,4,2014,1
2014-01-10,31,4,8,2014,1


# Write to partitioned Delta table

In [8]:
# Reduce to at most 5 partitions, then overwrite the "weather" table in Delta
# format, partitioned on disk by year and month.
df.coalesce(5).write.saveAsTable(
    "weather",
    format="delta",
    mode="overwrite",
    partitionBy=["year", "month"],
)

                                                                                

# Perform some Delta operations on the table (update, delete, merge...)

In [9]:
weather_table = DeltaTable.forName(spark, "weather")

# Update: set cloud_cover to null wherever it currently equals 5.
weather_table.update(condition="cloud_cover == 5", set={"cloud_cover": "null"})

# Delete: remove every row dated before 2016-06-25.
weather_table.delete(condition="date < '2016-06-25'")

# Merge: upsert two incoming rows into the table.
incoming_rows = [
    {"date": date(2016, 6, 30), "temperature_mean_f": 30, "wind_speed_mean_mph": 10, "cloud_cover": 1, "year": 2016, "month": 6},
    {"date": date(2021, 1, 1), "temperature_mean_f": 22, "wind_speed_mean_mph": 12, "cloud_cover": 5, "year": 2021, "month": 1},
]
incoming_df = spark.createDataFrame(incoming_rows)

# Rows are matched on date; year and month (the table's partition columns)
# are part of the join condition as well.
merge_condition = (
    "existing.date = incoming.date "
    "AND existing.year = incoming.year "
    "AND existing.month = incoming.month"
)

# Columns refreshed from the incoming row when a match is found.
measure_updates = {
    "temperature_mean_f": "incoming.temperature_mean_f",
    "wind_speed_mean_mph": "incoming.wind_speed_mean_mph",
    "cloud_cover": "incoming.cloud_cover",
}

# A new row takes every column from the incoming row.
new_row_values = {
    "date": "incoming.date",
    **measure_updates,
    "year": "incoming.year",
    "month": "incoming.month",
}

merge_builder = weather_table.alias("existing").merge(
    incoming_df.alias("incoming"),
    merge_condition,
)
merge_builder = merge_builder.whenMatchedUpdate(set=measure_updates)
merge_builder = merge_builder.whenNotMatchedInsert(values=new_row_values)
merge_builder.execute()

# Show the resulting table, newest date first.
weather_table.toDF().sort(F.desc("date"))

22/05/20 09:53:45 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


date,temperature_mean_f,wind_speed_mean_mph,cloud_cover,year,month
2021-01-01,22,12,5.0,2021,1
2016-06-30,30,10,1.0,2016,6
2016-06-29,80,8,,2016,6
2016-06-28,73,11,8.0,2016,6
2016-06-27,74,16,,2016,6
2016-06-26,73,10,4.0,2016,6
2016-06-25,73,9,1.0,2016,6
