# Set up local SparkSession

In [14]:
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
from delta.tables import DeltaTable

In [2]:
# Local SparkSession with Delta Lake support. The settings are listed as data
# and applied to the builder one at a time before the session is created.
_spark_settings = [
    # Delta Lake package, plus the SQL extension and catalog classes it provides.
    ("spark.jars.packages", "io.delta:delta-core_2.12:1.1.0"),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    # Eager evaluation: a bare DataFrame at the end of a cell is rendered
    # directly, capped at 10 rows.
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.maxNumRows", 10),
]

# "local[*]" runs Spark in-process on this machine.
_session_builder = SparkSession.builder.master("local[*]")
for _option, _setting in _spark_settings:
    _session_builder = _session_builder.config(_option, _setting)

spark = _session_builder.getOrCreate()

21/12/21 19:36:53 WARN Utils: Your hostname, HF-LPT-1115A.local resolves to a loopback address: 127.0.0.1; using 192.168.1.49 instead (on interface en0)
21/12/21 19:36:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/gk/Git/jupyter-template/.venv/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/gk/.ivy2/cache
The jars for the packages stored in: /Users/gk/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1347a5c1-633d-4819-a425-dd1bb78b129d;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 129ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;1.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default

# Read weather data from CSV

In [3]:
# Load weather.csv: the first line is treated as the header and column types
# are inferred from the data. The resulting DataFrame is marked for caching.
df = spark.read.csv("weather.csv", header=True, inferSchema=True).cache()

# Perform some simple transformations

In [4]:
# Parse the "date" column using the M/d/yyyy pattern, derive year and month
# from the parsed value, and keep only the columns used further down.
parsed_date = F.to_date(F.col("date"), "M/d/yyyy")

df = df.select(
    parsed_date.alias("date"),
    "temperature_mean_f",
    "wind_speed_mean_mph",
    "cloud_cover",
    F.year(parsed_date).alias("year"),
    F.month(parsed_date).alias("month"),
)
df

date,temperature_mean_f,wind_speed_mean_mph,cloud_cover,year,month
2014-01-01,29,9,5,2014,1
2014-01-02,26,18,8,2014,1
2014-01-03,14,20,3,2014,1
2014-01-04,17,7,3,2014,1
2014-01-05,36,7,7,2014,1
2014-01-06,37,21,7,2014,1
2014-01-07,13,23,1,2014,1
2014-01-08,17,13,3,2014,1
2014-01-09,28,7,4,2014,1
2014-01-10,31,4,8,2014,1


# Write to partitioned Delta table

In [6]:
# Coalesce to at most 5 partitions, then overwrite the "weather" table in
# Delta format, partitioned by year and month.
df.coalesce(5).write.saveAsTable(
    "weather",
    format="delta",
    mode="overwrite",
    partitionBy=["year", "month"],
)

                                                                                

# Perform some Delta operations on the table (update, delete, merge...)

In [15]:
# Handle to the "weather" Delta table, looked up by name.
weather_table = DeltaTable.forName(spark, "weather")

# For rows where cloud_cover equals 5, set cloud_cover to cloud_cover + 1.
# The condition and the new value are given as SQL expression strings.
weather_table.update(
    condition="cloud_cover == 5",
    set={"cloud_cover": "cloud_cover + 1"},
)
# TODO continue