In [1]:
import os
import shutil
import pyspark

from delta import *

import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# IMPORTANT: https://docs.delta.io/latest/quick-start.html#pyspark-shell
# Build a local 4-thread session with the Delta Lake SQL extension and catalog enabled.

builder = (
    pyspark.sql.SparkSession.builder
    .appName("SparkSQLExampleApp")
    .master("local[4]")
    # The key is plural ("partitions"). The singular spelling is not a Spark
    # setting, so the requested value of 4 was never applied.
    .config("spark.sql.shuffle.partitions", 4)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    # Pattern-based date parsing further down relies on the legacy parser policy.
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
)

# configure_spark_with_delta_pip wires the pip-installed Delta Lake package
# into the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


:: loading settings :: url = jar:file:/Users/emif/Documents/spark-lab/.venv/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/emif/.ivy2/cache
The jars for the packages stored in: /Users/emif/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-632bce36-bffd-4ff1-b226-8b695cf85990;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.2.1 in central
	found io.delta#delta-storage;1.2.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 303ms :: artifacts dl 20ms
	:: modules in use:
	io.delta#delta-core_2.12;1.2.1 from central in [default]
	io.delta#delta-storage;1.2.1 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evic

In [3]:
# Column list ("name TYPE" pairs) handed to the CSV reader via .schema(schema).
# CallDate, WatchDate and AvailableDtTm are declared as STRING here and are
# converted to date/timestamp types in a later cell.
schema = """CallNumber INT,
            UnitID STRING,                     
            IncidentNumber INT,           
            CallType STRING,                   
            CallDate STRING,                   
            WatchDate STRING,                  
            CallFinalDisposition STRING,       
            AvailableDtTm STRING,              
            Address STRING,                    
            City STRING,                       
            Zipcode  INT,                   
            Battalion STRING,                  
            StationArea INT,                
            Box INT,                        
            OriginalPriority INT,           
            Priority INT,                   
            FinalPriority INT,              
            ALSUnit BOOLEAN,                    
            CallTypeGroup STRING,              
            NumAlarms INT,                
            UnitType STRING,                   
            UnitSequenceInCallDispatch INT, 
            FirePreventionDistrict INT,     
            SupervisorDistrict INT,         
            Neighborhood STRING,               
            Location STRING,                   
            RowID STRING,                      
            Delay FLOAT
"""

In [4]:
# Path to file. Set the SF_FIRE_CALLS_CSV environment variable to read the
# dataset from another location; otherwise the original location is used.
csv_file = os.environ.get(
    "SF_FIRE_CALLS_CSV",
    "/Users/emif/Documents/spark-lab/datasets/sf-fire-calls.csv",
)

# Read the CSV with the explicit schema; the first line of the file is a header.
# (No temporary view is registered here - the result is only bound to `df`.)
df = (
    spark
    .read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load(csv_file)
)

In [5]:
# Keep a single row per CallNumber, then discard the Location and RowID columns.
df_cleansed = df.dropDuplicates(["CallNumber"]).drop("Location", "RowID")

In [6]:
# Parse the date/time strings, derive CallYear from CallDate, drop the rows
# whose CallYear is null and stamp each row with the current timestamp.
df_final = (
    df_cleansed
    .withColumn("CallDate", F.to_date("CallDate", "dd/MM/yyyy"))
    .withColumn("WatchDate", F.to_date("WatchDate", "dd/MM/yyyy"))
    .withColumn("AvailableDtTm", F.to_timestamp("AvailableDtTm", "dd/MM/yyyy HH:mm:ss"))
    .withColumn("CallYear", F.year(F.col("CallDate")).cast("int"))
    .dropna(subset=["CallYear"])
    .withColumn("creation_date", F.current_timestamp())
)

In [7]:
# Print the column names, types and nullability of the transformed DataFrame.
df_final.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: date (nullable = true)
 |-- WatchDate: date (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: timestamp (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: integer (nullable = true)
 |-- Box: integer (nullable = true)
 |-- OriginalPriority: integer (nullable = true)
 |-- Priority: integer (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: integer (nullable = tru

In [8]:
# Location of the Delta table, under the current working directory.
# os.path.join picks the platform separator and avoids a doubled slash when
# the working directory is the filesystem root.
path = os.path.join(os.getcwd(), "storage", "sf_fire_calls")

In [9]:
# delete old directory data so the table is rebuilt from scratch
try:
    shutil.rmtree(path)
except FileNotFoundError:
    # First run (or already cleaned up): there is nothing to remove.
    print("there is no directory to delete")

In [10]:
# Write the base Delta table: data collapsed to a single partition first,
# then stored partitioned by CallYear, replacing whatever is at `path`.
df_final.repartition(1).write.save(
    path,
    format="delta",
    mode="overwrite",
    partitionBy="CallYear",
)

22/06/16 15:36:54 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [11]:
# Derive a batch of "new" rows from the 2018 calls: CallNumber is doubled,
# CallYear is shifted by one and creation_date is refreshed.
df_new_data = (
    df_final
    .where(F.col("CallYear") == F.lit(2018))
    .withColumn("CallNumber", F.col("CallNumber") * 2)
    .withColumn("CallYear", F.col("CallYear") + 1)
    .withColumn("creation_date", F.current_timestamp())
)

In [12]:
# Handle to the Delta table stored at `path`; used as the merge target below.
tgt_deltaTable = DeltaTable.forPath(spark, path)

In [13]:
# Upsert df_new_data into the Delta table, pairing rows on CallNumber:
# matched rows are updated with all source columns, the rest are inserted.
(
    tgt_deltaTable.alias("tgt")
    .merge(
        source=df_new_data.alias("src"),
        condition="src.CallNumber = tgt.CallNumber",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

                                                                                

In [14]:
# Build an updated version of the 2010 calls: Delay is doubled and
# creation_date is refreshed; every other column is left as is.
df_updated_data_2010 = (
    df_final
    .where(F.col("CallYear") == F.lit(2010))
    .withColumn("Delay", F.col("Delay") * F.lit(2))
    .withColumn("creation_date", F.current_timestamp())
)

In [15]:
# Second handle to the same Delta table at `path`, used for the 2010 merge below.
tgt_deltaTable2 = DeltaTable.forPath(spark, path)

In [16]:
# Upsert the modified 2010 rows into the Delta table, pairing rows on
# CallNumber: matched rows are updated, unmatched rows are inserted.
(
    tgt_deltaTable2.alias("tgt")
    .merge(
        source=df_updated_data_2010.alias("src"),
        condition="src.CallNumber = tgt.CallNumber",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

                                                                                

In [22]:
# check versions
# The commit history is read through the DeltaTable API instead of formatting
# `path` into a quoted SQL string, which fails when the path contains a quote.
DeltaTable.forPath(spark, path).history().show(vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 version             | 2                                                                                                                                                                                                                                                                                        
 timestamp           | 2022-06-16 15:37:45.246                                                                                                                                                                                                                                                                  
 userId              | null                                                          

22/06/18 02:55:50 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 952738 ms exceeds timeout 120000 ms
22/06/18 02:55:50 WARN SparkContext: Killing executors is not supported by current scheduler.


In [17]:
# Time travel: load the table as of version 1 and list the 2010 calls,
# ordered by ascending CallNumber, without truncating the cell values.
(
    spark.read
    .load(path, format="delta", versionAsOf=1)
    .where(F.col("CallYear") == F.lit(2010))
    .select("CallNumber", "CallYear", "delay", "creation_date")
    .orderBy("CallNumber")
    .show(truncate=False)
)

                                                                                

+----------+--------+---------+-----------------------+
|CallNumber|CallYear|delay    |creation_date          |
+----------+--------+---------+-----------------------+
|100010030 |2010    |42.983334|2022-06-16 15:36:56.917|
|100010087 |2010    |21.433332|2022-06-16 15:36:56.917|
|100010098 |2010    |7.616667 |2022-06-16 15:36:56.917|
|100010109 |2010    |23.883333|2022-06-16 15:36:56.917|
|100010120 |2010    |5.35     |2022-06-16 15:36:56.917|
|100010145 |2010    |2.1166666|2022-06-16 15:36:56.917|
|100010147 |2010    |19.5     |2022-06-16 15:36:56.917|
|100010163 |2010    |2.8833334|2022-06-16 15:36:56.917|
|100010171 |2010    |1.2166667|2022-06-16 15:36:56.917|
|100010187 |2010    |3.05     |2022-06-16 15:36:56.917|
|100010203 |2010    |4.6      |2022-06-16 15:36:56.917|
|100010205 |2010    |2.65     |2022-06-16 15:36:56.917|
|100010209 |2010    |1.7666667|2022-06-16 15:36:56.917|
|100010229 |2010    |2.9      |2022-06-16 15:36:56.917|
|100010245 |2010    |2.4666667|2022-06-16 15:36: