In [0]:
%run ../Includes/Copy-Datasets

In [0]:
%sql
-- Count the orders whose timestamp is on or after 2020-01-01.
SELECT
  COUNT(*)
FROM
  orders_silver
WHERE
  order_timestamp >= '2020-01-01'

count(1)
4000


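Check whether all orders have a timestamp on or after 2020-01-01 (the query uses `>=`). If the count above equals the table's total row count, every existing order satisfies the condition.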

In [0]:
%sql
-- Enforce order_timestamp >= '2020-01-01' on orders_silver through a named CHECK constraint.
ALTER TABLE orders_silver ADD CONSTRAINT timestamp_within_range CHECK (order_timestamp >= '2020-01-01');

Look at the constraint under table properties 

In [0]:
%sql
-- Inspect the table metadata; the constraint added above is expected under the table properties.
DESCRIBE EXTENDED orders_silver

col_name,data_type,comment
order_id,string,
order_timestamp,timestamp,
customer_id,string,
quantity,bigint,
total,bigint,
books,array>,
,,
# Delta Statistics Columns,,
Column Names,"quantity, order_id, order_timestamp, customer_id, total, books",
Column Selection Method,first-32,


The following job will fail because one of the rows we are trying to insert (order '2', dated 2019-05-01) violates the constraint we have just set.

In [0]:
%sql
-- Expected to fail: order '2' is dated 2019-05-01, which is earlier than the
-- 2020-01-01 bound enforced by the timestamp_within_range constraint.
INSERT INTO orders_silver
VALUES ('1', '2022-02-01 00:00:00.000', 'C00001', 0, 0, NULL),
       ('2', '2019-05-01 00:00:00.000', 'C00001', 0, 0, NULL),
       ('3', '2023-01-01 00:00:00.000', 'C00001', 0, 0, NULL)

Note that the insert is all-or-nothing: if the statement fails, NONE of the records are added to the table, not even the ones that satisfy every constraint. Verify this:

In [0]:
%sql
-- Look up the three order ids used in the failed insert; no rows are expected back.
SELECT
  *
FROM
  orders_silver
WHERE
  order_id IN ('1', '2', '3')

order_id,order_timestamp,customer_id,quantity,total,books


In [0]:
%sql
-- Try to enforce quantity > 0 on orders_silver through a named CHECK constraint.
-- NOTE(review): presumably this fails while the table still holds rows with
-- quantity <= 0 (listed by the query further down) — confirm.
ALTER TABLE orders_silver ADD CONSTRAINT valid_quantity CHECK (quantity > 0);

In [0]:
%sql
-- Inspect the table metadata again to see which constraints are currently set.
DESCRIBE EXTENDED orders_silver

col_name,data_type,comment
order_id,string,
order_timestamp,timestamp,
customer_id,string,
quantity,bigint,
total,bigint,
books,array>,
,,
# Delta Statistics Columns,,
Column Names,"quantity, order_id, order_timestamp, customer_id, total, books",
Column Selection Method,first-32,


The `valid_quantity` constraint cannot be added while existing rows violate it, so those rows must be dealt with first. Let's look at which rows these are:

In [0]:
%sql
-- List the existing rows that break the quantity > 0 rule.
SELECT
  *
FROM
  orders_silver
WHERE
  quantity <= 0

order_id,order_timestamp,customer_id,quantity,total,books
4650,2022-02-20T11:03:00Z,C00419,0,0,List()
4150,2022-01-09T17:01:00Z,C00204,0,0,List()
4750,2022-02-28T04:06:00Z,C00403,0,0,List()
4800,2022-03-03T10:02:00Z,C00420,0,0,List()
3700,2021-12-04T13:09:00Z,C00076,0,0,List()
3950,2021-12-25T00:08:00Z,C00126,0,0,List()
4200,2022-01-14T03:08:00Z,C00225,0,0,List()
4300,2022-01-21T18:02:00Z,C00275,0,0,List()
4600,2022-02-16T08:05:00Z,C00376,0,0,List()
5050,2022-03-22T23:03:00Z,C00495,0,0,List()


One way to deal with this is to filter out the data that violates the constraint before streaming it from the bronze layer to the silver table. Then the constraint can be set on the silver table without issue.

In [0]:
from pyspark.sql import functions as F

# Schema used to parse the JSON payload held in the bronze `value` column.
json_schema = "order_id STRING, order_timestamp Timestamp, customer_id STRING, quantity BIGINT, total BIGINT, books ARRAY<STRUCT<book_id STRING, quantity BIGINT, subtotal BIGINT>>"

# Stage 1: read the bronze stream, keep the orders topic, and expand the parsed
# JSON struct into top-level columns.
orders_parsed = (
    spark.readStream.table("bronze")
    .filter("topic = 'orders'")
    .select(F.from_json(F.col("value").cast("string"), json_schema).alias("v"))
    .select("v.*")
)

# Stage 2: discard rows that would violate the quantity > 0 check before they
# are written to the silver table.
orders_valid = orders_parsed.filter("quantity > 0")

# Stage 3: write everything currently available to orders_silver, then stop.
query = (
    orders_valid.writeStream
    .option("checkpointLocation", "dbfs:/mnt/demo_pro/checkpoints/orders_silver")
    .trigger(availableNow=True)
    .table("orders_silver")
)

# Block until the availableNow batch has finished.
query.awaitTermination()

How to remove a constraint:

In [0]:
%sql
-- Remove the timestamp check from orders_silver.
-- IF EXISTS keeps this cell re-runnable: running it again, or running it when
-- the constraint was never added, is a no-op instead of an error.
ALTER TABLE orders_silver DROP CONSTRAINT IF EXISTS timestamp_within_range;

In [0]:
%sql
-- Inspect the table metadata to confirm timestamp_within_range is no longer listed.
DESCRIBE EXTENDED orders_silver

col_name,data_type,comment
order_id,string,
order_timestamp,timestamp,
customer_id,string,
quantity,bigint,
total,bigint,
books,array>,
,,
# Delta Statistics Columns,,
Column Names,"quantity, order_id, order_timestamp, customer_id, total, books",
Column Selection Method,first-32,


We will apply more transformations to the data before streaming it, so we drop the table here and then delete its checkpoint. This is not recommended in practice.

In [0]:
%sql
-- Drop the silver table so it can be rebuilt from scratch.
-- IF EXISTS keeps this cell re-runnable when the table is already gone.
DROP TABLE IF EXISTS orders_silver

In [0]:
%sql
-- Confirm the drop without raising an error: an empty result means the table is gone.
-- Selecting from the dropped table would fail with a table-not-found error and
-- stop a "Run all" before the checkpoint cleanup in the next cell.
SHOW TABLES LIKE 'orders_silver'

In [0]:
# Checkpoint directory used by the orders_silver streaming write; remove it
# recursively (second argument True).
orders_silver_checkpoint = "dbfs:/mnt/demo_pro/checkpoints/orders_silver"
dbutils.fs.rm(orders_silver_checkpoint, True)

True