

# Delta.io with fake Customer Data

In [1]:
# Evaluate the `spark` session object so the notebook renders it.
# NOTE(review): `spark` is not created in the visible cells - presumably the
# kernel/launcher injects it (with Delta Lake configured); confirm before a fresh run.
spark

In [2]:
# Start the demo from a clean slate: remove any Delta table left by a previous run.
# shutil is used instead of `!rm -rf` so the cell does not depend on a Unix shell;
# ignore_errors=True keeps this a no-op when the directory does not exist yet.
import shutil

shutil.rmtree("mock_delta", ignore_errors=True)

In [3]:
# Load every JSON file under mock/ into a single DataFrame (one file may hold
# several customer records), then persist the result as a Delta table.
# (Existing Parquet data could alternatively be converted to Delta in place.)
# The local filesystem keeps the demo simple; S3 or another object store would
# work as the location too.
customersClassic = spark.read.json("mock/")
customersClassic.write.save("mock_delta", format="delta")

In [4]:
# Load the customer data back, this time through the Delta reader
customers = spark.read.load("mock_delta", format="delta")

In [5]:
# Number of rows in the Delta table
customers.count()

30

In [6]:
# Print column names, types and nullability. No schema was supplied when the
# JSON was read, so these types were inferred by Spark.
customers.printSchema()

root
 |-- car_make: string (nullable = true)
 |-- car_year: long (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)



In [7]:
# Peek at the first five rows only
customers.show(n=5)

+--------+--------+--------------------+----------+---+----------+
|car_make|car_year|               email|first_name| id| last_name|
+--------+--------+--------------------+----------+---+----------+
|    Jeep|    2008|  creddihoughk@de.vu|     Charo| 21|Reddihough|
|   Lexus|    2006|mbrewinl@yolasite...|    Maddie| 22|    Brewin|
|   Dodge|    1997|cblaskettm@siteme...|   Chelsea| 23|  Blaskett|
|Cadillac|    1994|kdullardn@indiego...|     Karee| 24|   Dullard|
|    Saab|    2006|etunsleyo@reuters...|    Eduard| 25|   Tunsley|
+--------+--------+--------------------+----------+---+----------+
only showing top 5 rows



# Check for classic Audis (built before 2002)

In [8]:
# Audis built before 2002: one combined predicate instead of two chained filters
customers.where(
    (customers["car_year"] < 2002) & (customers["car_make"] == "Audi")
).show()

+--------+--------+--------------------+----------+---+----------+
|car_make|car_year|               email|first_name| id| last_name|
+--------+--------+--------------------+----------+---+----------+
|    Audi|    1991|binkin5@washingto...|    Briana|  6|     Inkin|
|    Audi|    1986|fbuckthorpe6@syma...|   Findley|  7|Buckthorpe|
|    Audi|    1987|  cmoehled@imgur.com|     Camel| 14|    Moehle|
+--------+--------+--------------------+----------+---+----------+



In [9]:
# Register the DataFrame as a temporary view named "customers" so that it can
# be referenced by name in spark.sql(...) statements.
customers.createOrReplaceTempView("customers")

In [10]:
# With the temp view registered, the same "Audi built before 2002" query can be
# written in SQL; only the email and car_make columns are selected.
spark.sql('SELECT email,car_make FROM customers WHERE car_make == "Audi" AND car_year < 2002 ').show()

+--------------------+--------+
|               email|car_make|
+--------------------+--------+
|binkin5@washingto...|    Audi|
|fbuckthorpe6@syma...|    Audi|
|  cmoehled@imgur.com|    Audi|
+--------------------+--------+



In [11]:
# NOTE: plain Spark SQL sources have no UPDATE; it works here because the view
# is backed by a Delta table.
# The update runs as an ACID transaction. The affected rows may be spread over
# several Parquet data files (the JSON files under _delta_log are the
# transaction log, not the data).
spark.sql('UPDATE customers SET email = "hello@world.com" WHERE car_make = "Audi" ')

DataFrame[]

In [12]:
# Verify the update. Only Audis built before 2002 are listed here, although the
# UPDATE statement changed the email of every Audi.
spark.sql('SELECT email, car_make FROM customers WHERE car_make == "Audi" AND car_year < 2002 ').show()

+---------------+--------+
|          email|car_make|
+---------------+--------+
|hello@world.com|    Audi|
|hello@world.com|    Audi|
|hello@world.com|    Audi|
+---------------+--------+



In [13]:
# Second UPDATE: every Audi becomes a Porsche. Each UPDATE is recorded as a new
# table version (this one shows up as version 2 in the history output below).
spark.sql('UPDATE customers SET car_make = "Porsche" WHERE car_make = "Audi" ')

DataFrame[]

In [14]:
# Time travel: load the table as it was at a given version.
# Versions 0, 1 and 2 exist at this point - change versionAsOf to compare them.
df = spark.read.load("mock_delta", format="delta", versionAsOf=2)
df.show(30)


+-----------+--------+--------------------+----------+---+------------+
|   car_make|car_year|               email|first_name| id|   last_name|
+-----------+--------+--------------------+----------+---+------------+
|       Jeep|    2008|  creddihoughk@de.vu|     Charo| 21|  Reddihough|
|      Lexus|    2006|mbrewinl@yolasite...|    Maddie| 22|      Brewin|
|      Dodge|    1997|cblaskettm@siteme...|   Chelsea| 23|    Blaskett|
|   Cadillac|    1994|kdullardn@indiego...|     Karee| 24|     Dullard|
|       Saab|    2006|etunsleyo@reuters...|    Eduard| 25|     Tunsley|
|     Nissan|    1992|dyakobovitzp@beha...|    Drusie| 26|  Yakobovitz|
|       Ford|    1984|  tvondraq@google.de|      Tome| 27|      Vondra|
|      Mazda|    1992|hmcasterr@census.gov|  Halimeda| 28|     McAster|
|      Dodge|    2006|     coneils@sun.com|    Cassie| 29|      O'Neil|
|      Acura|    1996|avoset@amazonaws.com|     Angie| 30|        Vose|
|      Eagle|    1994|jmissona@craigsli...|     James| 11|      

In [15]:
# Explicit import: DeltaTable is the only name from delta.tables used here.
from delta.tables import DeltaTable
# NOTE(review): nothing in the visible cells uses pyspark.sql.functions, and the
# wildcard shadows Python builtins such as sum/min/max. It is kept because later
# cells may rely on it - consider `from pyspark.sql import functions as F`.
from pyspark.sql.functions import *

# Open the Delta table stored at this path through the DeltaTable API
deltaTable = DeltaTable.forPath(spark, "mock_delta")
# Show the commit history (at most 5 entries, newest first): one row per table version
deltaTable.history().show(5)

# Tip: the history output is very wide; to narrow it use
# deltaTable.history().select("version", "operation", "operationParameters").show()

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+
|      2|2021-06-07 16:04:...|  null|    null|   UPDATE|{predicate -> (ca...|null|    null|     null|          1|          null|        false|{numRemovedFiles ...|        null|
|      1|2021-06-07 16:04:...|  null|    null|   UPDATE|{predicate -> (ca...|null|    null|     null|          0|          null|        false|{numRemovedFiles ...|        null|
|      0|2021-06-07 16:04:...|  null|    null|    WRITE|{mode -> ErrorIfE...|null|    null|     null|       null|  

In [16]:
# List the table directory: Parquet data files plus the _delta_log folder
! ls mock_delta 

[34m_delta_log[m[m
part-00000-68110d7c-63ac-497b-96a4-0752a032d4df-c000.snappy.parquet
part-00000-a35ec775-7be2-4073-9cff-bd9f2a105525-c000.snappy.parquet
part-00000-c1d830a7-2d97-4c31-9585-f2ce6b9a1c86-c000.snappy.parquet
part-00001-05a49402-ae89-4b11-939b-5877ad01204b-c000.snappy.parquet
part-00001-311f4dfa-144d-4141-9efa-3debd6f0efae-c000.snappy.parquet
part-00001-319d629f-ed3d-4eed-a771-12797bb66ae8-c000.snappy.parquet
part-00002-e02b2d25-991a-467d-bdf3-fcc0242f069f-c000.snappy.parquet


In [17]:
# Inspect the transaction log: one numbered JSON commit file per table version
! ls -la mock_delta/_delta_log

total 24
drwxr-xr-x   5 frank.munz  staff   160 Jun  7 16:04 [34m.[m[m
drwxr-xr-x  17 frank.munz  staff   544 Jun  7 16:04 [34m..[m[m
-rw-r--r--   1 frank.munz  staff  1484 Jun  7 16:04 00000000000000000000.json
-rw-r--r--   1 frank.munz  staff  1080 Jun  7 16:04 00000000000000000001.json
-rw-r--r--   1 frank.munz  staff  1080 Jun  7 16:04 00000000000000000002.json


In [18]:
# Read the table pinned to version 2 once more and print up to 30 rows
versioned_reader = spark.read.format("delta").options(versionAsOf=2)
df = versioned_reader.load("mock_delta")
df.show(30)

+-----------+--------+--------------------+----------+---+------------+
|   car_make|car_year|               email|first_name| id|   last_name|
+-----------+--------+--------------------+----------+---+------------+
|       Jeep|    2008|  creddihoughk@de.vu|     Charo| 21|  Reddihough|
|      Lexus|    2006|mbrewinl@yolasite...|    Maddie| 22|      Brewin|
|      Dodge|    1997|cblaskettm@siteme...|   Chelsea| 23|    Blaskett|
|   Cadillac|    1994|kdullardn@indiego...|     Karee| 24|     Dullard|
|       Saab|    2006|etunsleyo@reuters...|    Eduard| 25|     Tunsley|
|     Nissan|    1992|dyakobovitzp@beha...|    Drusie| 26|  Yakobovitz|
|       Ford|    1984|  tvondraq@google.de|      Tome| 27|      Vondra|
|      Mazda|    1992|hmcasterr@census.gov|  Halimeda| 28|     McAster|
|      Dodge|    2006|     coneils@sun.com|    Cassie| 29|      O'Neil|
|      Acura|    1996|avoset@amazonaws.com|     Angie| 30|        Vose|
|      Eagle|    1994|jmissona@craigsli...|     James| 11|      

In [19]:
# vacuum() without an argument uses Delta's default retention period (7 days
# per the Delta Lake docs; an explicit retention would be passed in hours).
# The files written a moment ago are younger than that, so no effect is
# visible immediately.
deltaTable.vacuum()

DataFrame[]

# misc


In [20]:
# A bare expression is only rendered when it is the last line of a cell, so both
# catalog listings are passed to display() explicitly (display is provided by
# the IPython kernel).
# SQL alternative for the database list: spark.sql('show databases').show()
display(
    spark.catalog.listDatabases(),
    spark.catalog.listTables('default'),
)

[Table(name='customers', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [21]:
# Show the line and cell magics this IPython kernel offers
%lsmagic


Available line magics:
%alias  %alias_magic  %autoawait  %autocall  %automagic  %autosave  %bookmark  %cat  %cd  %clear  %colors  %conda  %config  %connect_info  %cp  %debug  %dhist  %dirs  %doctest_mode  %ed  %edit  %env  %gui  %hist  %history  %killbgscripts  %ldir  %less  %lf  %lk  %ll  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %ls  %lsmagic  %lx  %macro  %magic  %man  %matplotlib  %mkdir  %more  %mv  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %pip  %popd  %pprint  %precision  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %rep  %rerun  %reset  %reset_selective  %rm  %rmdir  %run  %save  %sc  %set_env  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %who  %who_ls  %whos  %xdel  %xmode

Available cell magics:
%%!  %%HTML  %%SVG  %%bash  %%capture  %%debug  %%file  %%html  %%javascript  %%js  %%latex  %%markdown  %%perl  %%prun  %%pypy  %%