In [0]:
df = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load("/Volumes/workspace/pyspark/filedata/ecommerce_orders_large.csv")

In [0]:
df.display()

order_id,user_id,order_date,product_id,product_category,product_name,quantity,price_per_unit,payment_method,order_status
1001,U188,2025-04-20,P940,Fashion,Sneakers,2,58.53,PayPal,Cancelled
1002,U062,2025-04-16,P794,Fashion,T-Shirt,3,83.76,UPI,Returned
1003,U058,2025-04-18,P326,Fashion,Sunglasses,2,78.85,PayPal,Processing
1004,U011,2025-04-10,P574,Fashion,Sunglasses,5,46.49,PayPal,Delivered
1005,U003,2025-04-19,P988,Home Decor,Photo Frame,2,78.61,PayPal,Returned
1006,U017,2025-04-15,P328,Kitchen,Knife Set,4,53.51,Credit Card,Returned
1007,U129,2025-04-23,P786,Home Decor,Wall Clock,5,12.71,Credit Card,Returned
1008,U102,2025-04-15,P101,Home Decor,Photo Frame,1,46.6,Debit Card,Cancelled
1009,U040,2025-04-04,P610,Kitchen,Toaster,4,35.87,Credit Card,Processing
1010,U186,2025-04-29,P354,Kitchen,Microwave,1,30.95,Credit Card,Processing


#### TempView

In [0]:
df.createOrReplaceTempView("orders_temp")

#### Managed Table

In [0]:
%sql
create catalog tbl_catalog;

In [0]:
%sql
create schema tbl_catalog.core;

In [0]:
%sql
create table tbl_catalog.core.orders_managed as
select * from orders_temp;

num_affected_rows,num_inserted_rows


#### Query

In [0]:
query = spark.sql("select * from tbl_catalog.core.orders_managed")
query.display()

order_id,user_id,order_date,product_id,product_category,product_name,quantity,price_per_unit,payment_method,order_status
1001,U188,2025-04-20,P940,Fashion,Sneakers,2,58.53,PayPal,Cancelled
1002,U062,2025-04-16,P794,Fashion,T-Shirt,3,83.76,UPI,Returned
1003,U058,2025-04-18,P326,Fashion,Sunglasses,2,78.85,PayPal,Processing
1004,U011,2025-04-10,P574,Fashion,Sunglasses,5,46.49,PayPal,Delivered
1005,U003,2025-04-19,P988,Home Decor,Photo Frame,2,78.61,PayPal,Returned
1006,U017,2025-04-15,P328,Kitchen,Knife Set,4,53.51,Credit Card,Returned
1007,U129,2025-04-23,P786,Home Decor,Wall Clock,5,12.71,Credit Card,Returned
1008,U102,2025-04-15,P101,Home Decor,Photo Frame,1,46.6,Debit Card,Cancelled
1009,U040,2025-04-04,P610,Kitchen,Toaster,4,35.87,Credit Card,Processing
1010,U186,2025-04-29,P354,Kitchen,Microwave,1,30.95,Credit Card,Processing


#### Filter

In [0]:
%sql
select * from tbl_catalog.core.orders_managed
where product_category = 'Fashion';

order_id,user_id,order_date,product_id,product_category,product_name,quantity,price_per_unit,payment_method,order_status
1001,U188,2025-04-20,P940,Fashion,Sneakers,2,58.53,PayPal,Cancelled
1002,U062,2025-04-16,P794,Fashion,T-Shirt,3,83.76,UPI,Returned
1003,U058,2025-04-18,P326,Fashion,Sunglasses,2,78.85,PayPal,Processing
1004,U011,2025-04-10,P574,Fashion,Sunglasses,5,46.49,PayPal,Delivered
1012,U148,2025-04-24,P315,Fashion,Sunglasses,5,69.14,Credit Card,Processing
1013,U140,2025-05-03,P516,Fashion,Sneakers,5,90.64,Credit Card,Cancelled
1015,U184,2025-04-11,P930,Fashion,Sunglasses,1,61.0,UPI,Cancelled
1026,U198,2025-04-05,P793,Fashion,Sneakers,4,77.54,Credit Card,Processing
1028,U063,2025-04-22,P347,Fashion,T-Shirt,5,42.37,PayPal,Cancelled
1039,U020,2025-04-19,P834,Fashion,Sneakers,4,60.17,UPI,Returned


#### Agg

In [0]:
%sql
select 
  month(order_date) as order_month,
  product_category,
  count(order_id) as total_orders
from tbl_catalog.core.orders_managed
group by order_month, product_category
order by order_month, total_orders desc;

order_month,product_category,total_orders
4,Electronics,196
4,Kitchen,177
4,Books,174
4,Home Decor,172
4,Fashion,167
5,Electronics,27
5,Kitchen,26
5,Home Decor,21
5,Fashion,21
5,Books,19


#### Subquery

In [0]:
%sql
select * from (
  select 
    month(order_date) as order_month,
    product_category,
    count(order_id) as total_orders
  from tbl_catalog.core.orders_managed
  group by order_month, product_category
  order by order_month, total_orders desc
) where product_category = 'Home Decor';

order_month,product_category,total_orders
4,Home Decor,172
5,Home Decor,21


#### Conditionals

In [0]:
%sql
select
  *,
  case when (payment_method like '%Card') and order_status in ('Cancelled', 'Returned') then 'Card'
    when (payment_method in ('PayPal', 'UPI')) and order_status in ('Cancelled', 'Returned') then 'Cash'
    else 'n/a'
  end as payment_flag
from tbl_catalog.core.orders_managed;

order_id,user_id,order_date,product_id,product_category,product_name,quantity,price_per_unit,payment_method,order_status,payment_flag
1001,U188,2025-04-20,P940,Fashion,Sneakers,2,58.53,PayPal,Cancelled,Cash
1002,U062,2025-04-16,P794,Fashion,T-Shirt,3,83.76,UPI,Returned,Cash
1003,U058,2025-04-18,P326,Fashion,Sunglasses,2,78.85,PayPal,Processing,
1004,U011,2025-04-10,P574,Fashion,Sunglasses,5,46.49,PayPal,Delivered,
1005,U003,2025-04-19,P988,Home Decor,Photo Frame,2,78.61,PayPal,Returned,Cash
1006,U017,2025-04-15,P328,Kitchen,Knife Set,4,53.51,Credit Card,Returned,Card
1007,U129,2025-04-23,P786,Home Decor,Wall Clock,5,12.71,Credit Card,Returned,Card
1008,U102,2025-04-15,P101,Home Decor,Photo Frame,1,46.6,Debit Card,Cancelled,Card
1009,U040,2025-04-04,P610,Kitchen,Toaster,4,35.87,Credit Card,Processing,
1010,U186,2025-04-29,P354,Kitchen,Microwave,1,30.95,Credit Card,Processing,


#### Distinct

In [0]:
%sql
select distinct payment_method from tbl_catalog.core.orders_managed;

payment_method
Debit Card
Credit Card
PayPal
UPI


#### CTE

In [0]:
%sql
with t1 as (
  select 
    month(order_date) as order_month,
    product_category,
    count(order_id) as total_orders
  from tbl_catalog.core.orders_managed
  group by order_month, product_category
  order by order_month, total_orders desc
)
select * from t1 where product_category = 'Home Decor'

order_month,product_category,total_orders
4,Home Decor,172
5,Home Decor,21


#### Window

In [0]:
%sql
select 
  price_per_unit,
  row_number() over (order by price_per_unit desc) as rn,
  rank() over (order by price_per_unit desc) as rank,
  dense_rank() over (order by price_per_unit desc) as dense_rank
from tbl_catalog.core.orders_managed

price_per_unit,rn,rank,dense_rank
99.93,1,1,1
99.91,2,2,2
99.8,3,3,3
99.65,4,4,4
99.5,5,5,5
99.43,6,6,6
99.4,7,7,7
99.39,8,8,8
99.22,9,9,9
99.14,10,10,10


#### Window Agg

In [0]:
%sql
select 
  price_per_unit,
  sum(price_per_unit) over (order by price_per_unit desc rows between unbounded preceding and unbounded following) as total,
  sum(price_per_unit) over (order by price_per_unit desc rows between unbounded preceding and current row) as cum_sum
from tbl_catalog.core.orders_managed

price_per_unit,total,cum_sum
99.93,55205.35999999995,99.93
99.91,55205.35999999995,199.84
99.8,55205.35999999995,299.64
99.65,55205.35999999995,399.29
99.5,55205.35999999995,498.79
99.43,55205.35999999995,598.22
99.4,55205.35999999995,697.62
99.39,55205.35999999995,797.01
99.22,55205.35999999995,896.23
99.14,55205.35999999995,995.37


#### Merge/ Upsert

In [0]:
df = spark.read.table("tbl_catalog.core.orders_managed")
df.createOrReplaceTempView("orders_src")

In [0]:
%sql
merge into tbl_catalog.core.orders_managed as dst
using orders_src as src
on dst.order_id = src.order_id
when matched then 
  update set *
when not matched then
  insert *;

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
1000,1000,0,0


### **Functions**

#### Scalar Functions (UDF)

In [0]:
%sql
create or replace function tbl_catalog.core.dis_price(price decimal(10, 2))
returns decimal(10, 2)
language sql
return price * 0.90;

In [0]:
%sql
select 
    price_per_unit,
    tbl_catalog.core.dis_price(price_per_unit) as discounted_price
from tbl_catalog.core.orders_managed

price_per_unit,discounted_price
58.53,52.68
83.76,75.38
78.85,70.97
46.49,41.84
78.61,70.75
53.51,48.16
12.71,11.44
46.6,41.94
35.87,32.28
30.95,27.86


#### Table Functions (UDTF)

In [0]:
%sql
create or replace function tbl_catalog.core.filter_orders(cat string)
returns table
language sql
return
(select * from tbl_catalog.core.orders_managed where product_category = cat);
    
select * from tbl_catalog.core.filter_orders('Home Decor');


order_id,user_id,order_date,product_id,product_category,product_name,quantity,price_per_unit,payment_method,order_status
1005,U003,2025-04-19,P988,Home Decor,Photo Frame,2,78.61,PayPal,Returned
1007,U129,2025-04-23,P786,Home Decor,Wall Clock,5,12.71,Credit Card,Returned
1008,U102,2025-04-15,P101,Home Decor,Photo Frame,1,46.6,Debit Card,Cancelled
1034,U016,2025-04-23,P948,Home Decor,Wall Clock,3,54.5,UPI,Returned
1035,U150,2025-04-10,P242,Home Decor,Cushion Cover,2,34.04,Credit Card,Cancelled
1036,U171,2025-05-02,P585,Home Decor,Cushion Cover,2,17.22,Credit Card,Processing
1040,U172,2025-04-08,P644,Home Decor,Wall Art,2,97.17,Debit Card,Processing
1051,U049,2025-04-25,P219,Home Decor,Wall Art,3,78.9,UPI,Processing
1058,U090,2025-04-16,P250,Home Decor,Wall Clock,5,72.65,UPI,Cancelled
1061,U069,2025-04-06,P463,Home Decor,Wall Clock,1,96.35,PayPal,Delivered


#### Dynamic Masking

In [0]:
%sql
create or replace function tbl_catalog.core.mask_pii(user_id string)
returns string
language sql
return
case when is_account_group_member('Admin') then user_id else '*****' end;

alter table tbl_catalog.core.orders_managed
alter column user_id set mask tbl_catalog.core.mask_pii;

In [0]:
%sql
select * from tbl_catalog.core.orders_managed;

order_id,user_id,order_date,product_id,product_category,product_name,quantity,price_per_unit,payment_method,order_status
1001,*****,2025-04-20,P940,Fashion,Sneakers,2,58.53,PayPal,Cancelled
1002,*****,2025-04-16,P794,Fashion,T-Shirt,3,83.76,UPI,Returned
1003,*****,2025-04-18,P326,Fashion,Sunglasses,2,78.85,PayPal,Processing
1004,*****,2025-04-10,P574,Fashion,Sunglasses,5,46.49,PayPal,Delivered
1005,*****,2025-04-19,P988,Home Decor,Photo Frame,2,78.61,PayPal,Returned
1006,*****,2025-04-15,P328,Kitchen,Knife Set,4,53.51,Credit Card,Returned
1007,*****,2025-04-23,P786,Home Decor,Wall Clock,5,12.71,Credit Card,Returned
1008,*****,2025-04-15,P101,Home Decor,Photo Frame,1,46.6,Debit Card,Cancelled
1009,*****,2025-04-04,P610,Kitchen,Toaster,4,35.87,Credit Card,Processing
1010,*****,2025-04-29,P354,Kitchen,Microwave,1,30.95,Credit Card,Processing


#### Row-level Security

In [0]:
%sql
-- Mapping Table
create table tbl_catalog.core.mapping_table (
  payment_category string,
  email string
);

insert into tbl_catalog.core.mapping_table values 
('Credit Card', 'phyominnthwin@gmail.com'),
('Debit Card', 'phyominnthwin@gmail.com'),
('PayPal', 'other-phyominnthwin@gmail.com'),
('UPI', 'other-phyominnthwin@gmail.com');

num_affected_rows,num_inserted_rows
4,4


In [0]:
%sql
create function tbl_catalog.core.row_security(payment_method string)
returns boolean
language sql
return
(exists(
  select 1 from tbl_catalog.core.mapping_table
  where payment_category = payment_method and email = current_user()
));

In [0]:
%sql
alter table tbl_catalog.core.orders_managed
set row filter tbl_catalog.core.row_security on (payment_method);

In [0]:
%sql
select * from tbl_catalog.core.orders_managed;

order_id,user_id,order_date,product_id,product_category,product_name,quantity,price_per_unit,payment_method,order_status
1006,*****,2025-04-15,P328,Kitchen,Knife Set,4,53.51,Credit Card,Returned
1007,*****,2025-04-23,P786,Home Decor,Wall Clock,5,12.71,Credit Card,Returned
1008,*****,2025-04-15,P101,Home Decor,Photo Frame,1,46.6,Debit Card,Cancelled
1009,*****,2025-04-04,P610,Kitchen,Toaster,4,35.87,Credit Card,Processing
1010,*****,2025-04-29,P354,Kitchen,Microwave,1,30.95,Credit Card,Processing
1012,*****,2025-04-24,P315,Fashion,Sunglasses,5,69.14,Credit Card,Processing
1013,*****,2025-05-03,P516,Fashion,Sneakers,5,90.64,Credit Card,Cancelled
1014,*****,2025-04-18,P111,Books,Data Engineering 101,1,93.91,Credit Card,Cancelled
1018,*****,2025-04-15,P650,Electronics,Wireless Mouse,2,18.72,Debit Card,Processing
1019,*****,2025-05-03,P973,Kitchen,Blender,3,35.68,Debit Card,Delivered


#### DML

In [0]:
%sql
update tbl_catalog.core.orders_source
set product_category = 'GenZ Fashion'
where product_category = 'Fashion';

num_affected_rows
0


### Delta Lake

#### Describe

In [0]:
%sql
describe tbl_catalog.core.orders_managed;

col_name,data_type,comment
order_id,int,
user_id,string,
order_date,date,
product_id,string,
product_category,string,
product_name,string,
quantity,int,
price_per_unit,double,
payment_method,string,
order_status,string,


In [0]:
%sql
create or replace table tbl_catalog.core.orders_source as 
select * from tbl_catalog.core.orders_managed;

num_affected_rows,num_inserted_rows


#### Table History

In [0]:
%sql
describe history tbl_catalog.core.orders_source;

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2025-08-10T16:17:23.000Z,78362573187593,phyominnthwin@gmail.com,UPDATE,"Map(predicate -> [""(product_category#17470 = Fashion)""])",,,0810-150244-hqtnfft0-v2n,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 409, numDeletionVectorsUpdated -> 0, scanTimeMs -> 407, numAddedFiles -> 0, numUpdatedRows -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/17.0.x-aarch64-photon-scala2.13
1,2025-08-10T16:16:47.000Z,78362573187593,phyominnthwin@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,,0810-150244-hqtnfft0-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 10849, numOutputRows -> 475, numOutputBytes -> 10849)",,Databricks-Runtime/17.0.x-aarch64-photon-scala2.13
0,2025-08-10T16:16:06.000Z,78362573187593,phyominnthwin@gmail.com,CREATE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,,0810-150244-hqtnfft0-v2n,,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 475, numOutputBytes -> 10849)",,Databricks-Runtime/17.0.x-aarch64-photon-scala2.13


#### Time-traveling

In [0]:
%sql
restore table tbl_catalog.core.orders_source to version as of 1;

table_size_after_restore,num_of_files_after_restore,num_removed_files,num_restored_files,removed_files_size,restored_files_size
10849,1,0,0,0,0
