## Spark Overview



<img src="https://spark.apache.org/docs/3.5.1/img/cluster-overview.png" width="1200" />









<img src="https://github.com/dbrownems/SparkDataEngineeringForSQLServerProfessionals/blob/main/cluster_overview2.png?raw=true" width="1200" />

# Introduction to Notebooks



Notebooks are the primary development tool for Spark.

 - An interactive development and data analysis tool
 - They can also be saved and run as part of a job
 - In addition to code, they support markdown, so you can embed rich documentation in your jobs

<details>
Additional resources:

Develop, execute, and manage Microsoft Fabric notebooks
https://learn.microsoft.com/en-us/fabric/data-engineering/author-execute-notebook

Python for beginners
https://learn.microsoft.com/en-us/training/paths/beginner-python/

Spark docs
https://spark.apache.org/docs/latest/

Delta docs
https://docs.delta.io/latest/index.html



</details>

# Python code in Notebooks

In [2]:
%%pyspark

# Names bound at the top level of a notebook cell have session scope,
# so later cells can read them and rebind them.
msg = "hello from python"
a = 2


def print_message():
    """Print the session-level ``msg`` as it is when the function is called."""
    msg2 = msg  # msg2 is local to this function and does not exist outside it
    print(msg)


StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 4, Finished, Available)

In [3]:
%%pyspark

# print the session variable defined in an earlier cell
print(msg)

# run the function
print_message()

# change the value of the session variable
msg = "hello again"

# print_message reads msg when it is called, so it prints the changed value
print_message()

# what is the print_message object? a function is an ordinary object
print(print_message)

# assign the function to another variable
f = print_message

# run it through the new name
f()

print(f)

# msg2 isn't defined here; it's a local variable in the print_message function.
# The error is caught and printed so the demonstration does not abort "Run all".
try:
    print(msg2)
except NameError as err:
    print(f"NameError: {err}")

# notice that every statement before the failing one ran: python is an
# "interpreted" language, so the bad name is only found when that line executes

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 5, Finished, Available)

hello from python
hello from python
hello again
<function print_message at 0x7f3a6f419990>
hello again
<function print_message at 0x7f3a6f419990>


NameError: name 'msg2' is not defined

# Working with Data

## Dataframe basics

In [5]:
%%pyspark
# Read the Sales_Customers Delta table, keep the customers whose name starts
# with "A", and render the result.
df = (
    spark.read.format("Delta")
    .load("Tables/Sales_Customers")
    .where("CustomerName like 'A%'")
)
display(df)

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 7, Finished, Available)

SynapseWidget(Synapse.DataFrame, 3ffdaba3-80ab-45b0-952b-f1e6907559b6)

In [6]:
%%pyspark

# the dataframe object has an API to transform the dataframe
# and you can easily do things like rename all the columns

def fix_col_name(name):
    """Convert an underscore-separated column name to PascalCase.

    The abbreviations ``cust_`` and ``addr_`` are expanded to ``customer_``
    and ``address_`` first, then each underscore-separated word is
    capitalized and the words are joined, e.g. ``CUST_NAME`` -> ``CustomerName``.
    """
    expanded = (
        name.lower()
        .replace("cust_", "customer_")
        # keep the trailing underscore so the word after "addr" is still capitalized
        .replace("addr_", "address_")
    )
    return "".join(word.capitalize() for word in expanded.split("_"))

# build a one-row sample frame with source-system style column names
df = spark.sql("select 1 ID, 'Ann' CUST_NAME, '123 Garden Way' CUST_ADDRESS")

# show the original column names
display(df)

# rename every column. The loop variable is deliberately not called `col`:
# a top-level loop variable stays in the session, where it would shadow the
# `col` function that `from pyspark.sql.functions import *` provides.
for column_name in df.columns:
    df = df.withColumnRenamed(column_name, fix_col_name(column_name))

# show the renamed columns
display(df)

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 8, Finished, Available)

SynapseWidget(Synapse.DataFrame, 97941480-b79a-41ed-b44b-fe3c123fc3fb)

SynapseWidget(Synapse.DataFrame, 565d5e55-a60f-491a-b531-eb3c73fc6723)

In [7]:
-- list the tables available to Spark SQL
show tables

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 9, Finished, Available)

<Spark SQL result set with 31 rows and 3 fields>

In [8]:
%%pyspark
# spark.sql() runs a SQL statement from Python and hands back a DataFrame
df = spark.sql(
    "SELECT * FROM WideWorldImporters_bronze.Sales_Customers LIMIT 1000"
)
display(df)

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 10, Finished, Available)

SynapseWidget(Synapse.DataFrame, 4398dfe3-8e72-4c5b-807a-200dd73f1f4a)

## Loading a Dimension

### Generating Dimension Keys

https://spark.apache.org/docs/latest/api/sql/index.html

In [9]:
-- Option 1: build the key from a hash of the business key and a source-system tag
SELECT xxhash64(CustomerId, "CRM") AS ID, *
FROM WideWorldImporters_bronze.Sales_Customers
LIMIT 10;

-- Option 2: give every row a GUID
SELECT uuid() AS ID, *
FROM WideWorldImporters_bronze.Sales_Customers
LIMIT 10;

StatementMeta(, , -1, Finished, Available)

<Spark SQL result set with 10 rows and 31 fields>

<Spark SQL result set with 10 rows and 31 fields>

In [10]:
-- Option 3: monotonically_increasing_id()
-- https://spark.apache.org/docs/latest/api/sql/index.html#monotonically_increasing_id
-- Caveat: the sequence has big gaps when processing across multiple worker nodes
SELECT monotonically_increasing_id() AS ID, *
FROM WideWorldImporters_gold.Dimension_Customer_by_postalcode

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 13, Finished, Available)

<Spark SQL result set with 403 rows and 12 fields>

In [11]:
--or use SQL analytic functions to assign monotonically increasing keys
--  * customers already in the dimension keep their existing CustomerKey
--  * new customers have no match, so c.CustomerKey is null; they share one
--    partition, are numbered 1..n by row_number(), and are offset by the
--    highest existing key
--coalesce(max(...), 0) keeps that offset at 0 when the dimension is empty;
--without it max() is null and every new key would be null as well
select  coalesce(c.CustomerKey,
                 coalesce(max(c.CustomerKey) over (), 0)
                   + row_number() over (partition by c.CustomerKey order by s.CustomerID)) CustomerKey,
        c.CustomerKey ExistingDimKey,
        s.CustomerID
from WideWorldImporters_bronze.Sales_Customers s
left join WideWorldImporters_gold.Dimension_Customer c
  on s.CustomerID = c.WWICustomerID
order by CustomerKey;

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 14, Finished, Available)

<Spark SQL result set with 663 rows and 3 fields>

### Temporary Views and Temporary Tables

In [12]:
--temporary views are very cool
--like Common Table Expressions or subqueries, but much more powerful
--They have session lifetime, rather than statement lifetime
--
--Existing customers keep their CustomerKey; new ones get the highest existing
--key plus a row number. coalesce(max(...), 0) makes new keys start at 1 when
--the dimension is empty; without it max() is null and every new CustomerKey
--would be null.
create or replace temp view CustomerKeys as
select  coalesce(c.CustomerKey,
                 coalesce(max(c.CustomerKey) over (), 0)
                   + row_number() over (partition by c.CustomerKey order by s.CustomerID)) CustomerKey,
        s.CustomerID
from WideWorldImporters_bronze.Sales_Customers s
left join WideWorldImporters_gold.Dimension_Customer c
  on s.CustomerID = c.WWICustomerID

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 15, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [13]:
-- show the query plan for reading the temp view
explain select * from CustomerKeys

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 16, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>

In [14]:
-- but temp views can be cached, at which point they are essentially temp tables
-- the data is cached on the executor VMs, so this is useful for Delta tables too
cache table CustomerKeys

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 17, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [15]:
-- same query after the cache statement: compare this plan with the earlier one
explain  select * from CustomerKeys

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 18, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>

### Merging the dimension


In [16]:
-- look at a few rows of the target dimension before building the merge source
select * from WideWorldImporters_gold.Dimension_Customer limit 10

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 19, Finished, Available)

<Spark SQL result set with 10 rows and 11 fields>

In [17]:
-- Handy while writing this view (uncomment to inspect the schemas):
-- describe WideWorldImporters_gold.Dimension_Customer;
-- describe WideWorldImporters_bronze.Sales_Customers;

-- Source rows for the dimension merge: one row per bronze customer, with the
-- surrogate key from CustomerKeys and the lookup IDs resolved to names.
CREATE OR REPLACE TEMP VIEW CustomerMergeSource AS
SELECT k.CustomerKey            AS CustomerKey,
       c.CustomerID             AS WWICustomerID,
       c.CustomerName           AS Customer,
       bc.CustomerName          AS BillToCustomer,
       cat.CustomerCategoryName AS Category,
       bg.BuyingGroupName       AS BuyingGroup,
       p.FullName               AS PrimaryContact,
       c.PostalPostalCode       AS PostalCode,
       CAST(0 AS INT)           AS LineageKey,
       c.ValidFrom,
       c.ValidTo
FROM WideWorldImporters_bronze.Sales_Customers AS c
LEFT JOIN CustomerKeys AS k
  ON k.CustomerID = c.CustomerID
-- the bill-to customer is another row of the same customers table
LEFT JOIN WideWorldImporters_bronze.Sales_Customers AS bc
  ON c.BillToCustomerID = bc.CustomerID
LEFT JOIN WideWorldImporters_bronze.Sales_CustomerCategories AS cat
  ON cat.CustomerCategoryID = c.CustomerCategoryID
LEFT JOIN WideWorldImporters_bronze.Sales_BuyingGroups AS bg
  ON c.BuyingGroupId = bg.BuyingGroupID
LEFT JOIN WideWorldImporters_bronze.Application_People AS p
  ON p.PersonID = c.PrimaryContactPersonID
        



StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 20, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

### Validate the data

In [18]:
%%pyspark 

# Stop if any source customer did not receive a CustomerKey
ids = spark.sql("select WWICustomerID from CustomerMergeSource where CustomerKey is null").collect()
if ids:
    raise ValueError(f"Invalid CustomerKey values for {len(ids)} keys example: {ids[0]}")

# Stop if the same CustomerKey was assigned to more than one row
ids = spark.sql("select CustomerKey from CustomerMergeSource group by CustomerKey having count(*)>1").collect()
if ids:
    raise ValueError(f"Duplicate CustomerKey values for {len(ids)} keys example: {ids[0]}")

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 21, Finished, Available)

### Upsert the Dimension

In [19]:
-- Upsert the dimension: rows matched on the source-system customer ID are
-- updated from the source, unmatched source rows are inserted
MERGE INTO WideWorldImporters_gold.Dimension_Customer AS dest
USING CustomerMergeSource AS src
  ON src.WWICustomerID = dest.WWICustomerID
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *


StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 22, Finished, Available)

<Spark SQL result set with 1 rows and 4 fields>

## Bring in Unstructured Data

In [20]:
%%sql

-- extend the dimension with two columns to hold each customer's coordinates
alter table WideWorldImporters_gold.Dimension_Customer add columns( Latitude float, Longitude float )

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 23, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

```
[
    {
        "CustomerID": 1,
        "Location": "POINT (-102.6201979 41.4972022)"
    },
    {
        "CustomerID": 2,
        "Location": "POINT (-115.8743507 48.7163356)"
    },
    {
        "CustomerID": 3,
        "Location": "POINT (-112.7271223 34.2689145)"
    },
    {
        "CustomerID": 4,
        "Location": "POINT (-98.580361 37.2811339)"
    },
```

In [21]:
%%pyspark 

# Peek at the raw file by reading it as plain text.
# limit(20) keeps the result a DataFrame; take(20) would instead collect a
# Python list of Row objects into a variable named df.
df = spark.read.text("Files/CustomerLocations.json").limit(20)
display(df)

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 24, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6ebd6e88-dfa1-4b23-9599-7de80744d495)

In [22]:
%%pyspark

# to read data files without a built-in schema, supply the schema explicitly
# You can infer the schema, and then save and modify it if you like

# import only the types used here rather than `import *`, so it is clear where
# each name comes from and no other session names are overwritten
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType([
    StructField("CustomerID", IntegerType(), True),
    StructField("Location", StringType(), True),
])

# multiLine: the file is a single JSON array spread over many lines
dfCustLocations = (
    spark.read
    .schema(schema)
    .option("multiLine", True)
    .json("Files/CustomerLocations.json")
)

display(dfCustLocations)

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 25, Finished, Available)

SynapseWidget(Synapse.DataFrame, d367dc16-1abd-4804-ba05-75fb8aaceac5)

In [23]:
-- this fails with TABLE_OR_VIEW_NOT_FOUND: dfCustLocations is a Python variable,
-- and SQL can only see it after it has been registered as a (temp) view
select * 
from dfCustLocations

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 26, Finished, Available)

Error: [TABLE_OR_VIEW_NOT_FOUND] The table or view `dfCustLocations` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 2 pos 5;
'Project [*]
+- 'UnresolvedRelation [dfCustLocations], [], false


In [24]:
%%pyspark

# register the DataFrame under a name that SQL cells can query
dfCustLocations.createOrReplaceTempView("CustLocations")

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 27, Finished, Available)

In [25]:
-- sample Location value: 'POINT (-123.8860114 47.4631419)'
-- splitting on the space gives three pieces: the tag, "(x" and "y)"
SELECT CustomerId,
       split(Location, ' ') AS locSplit
FROM CustLocations

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 28, Finished, Available)

<Spark SQL result set with 663 rows and 2 fields>

In [26]:


-- split(Location, ' ') produces an array such as
--   ["POINT", "(-120.1290272", "36.0041223)"]
-- so element 1 is the longitude once "(" is removed and element 2 is the
-- latitude once ")" is removed. try_cast yields null for values that do not parse.
CREATE OR REPLACE TEMP VIEW CustLocations2 AS
SELECT CustomerId,
       try_cast(replace(locSplit[1], '(', '') AS double) AS Long,
       try_cast(replace(locSplit[2], ')', '') AS double) AS Lat
FROM (
  SELECT CustomerId,
         split(Location, ' ') AS locSplit
  FROM CustLocations
) AS q;


StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 29, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [27]:
%%pyspark
%pip install shapely

StatementMeta(, , -1, Finished, Available)

Collecting shapely
  Downloading shapely-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: shapely
Successfully installed shapely-2.0.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



In [28]:
%%pyspark
from shapely import wkt

# parse a WKT point; x is its first coordinate (the longitude in this data set)
shape = wkt.loads('POINT (-123.8860114 47.4631419)')
shape.centroid.x

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 36, Finished, Available)

-123.8860114

In [29]:
%%pyspark
from shapely import wkt
from pyspark.sql.functions import *
from pyspark.sql.types import *

def lat(s):
    """Return the latitude (the centroid's y) of a WKT geometry string.

    Returns None when ``s`` is None, so that once this is registered as a UDF
    a row with a null location yields null instead of failing the query.
    """
    if s is None:
        return None
    shape = wkt.loads(s)
    return float(shape.centroid.y)


def lon(s):
    """Return the longitude (the centroid's x) of a WKT geometry string.

    Returns None when ``s`` is None, so that once this is registered as a UDF
    a row with a null location yields null instead of failing the query.
    """
    if s is None:
        return None
    shape = wkt.loads(s)
    return float(shape.centroid.x)


# make the Python functions callable from Spark SQL under the names lat and lon
spark.udf.register("lat", lat, FloatType())
spark.udf.register("lon", lon, FloatType())

# quick check that calls the plain Python function directly, not the registered UDF
lon('POINT (-123.8860114 47.4631419)')

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 37, Finished, Available)

-123.8860114

In [30]:
-- call the registered Python UDF from SQL
select lat('POINT (-123.8860114 47.4631419)') lat

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 38, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>

In [31]:
-- Rebuild CustLocations2, this time using the lon/lat UDFs to parse the WKT
-- text instead of splitting the string by hand
CREATE OR REPLACE TEMP VIEW CustLocations2 AS
SELECT CustomerId,
       lon(location) AS Long,
       lat(location) AS Lat
FROM CustLocations;

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 39, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [32]:
-- attempt 1: update the dimension through a CTE that joins in the new values
-- this fails: the UPDATE target here is the query q, and UPDATE only supports
-- Delta sources as its destination
with q as
(
    select c.*, l.Long NewLongitude, l.Lat NewLatitude
    from WideWorldImporters_gold.Dimension_Customer c
    left join CustLocations2 l 
    on c.WWICustomerID = l.CustomerID
)
update q set Latitude = NewLatitude, Longitude = NewLongitude

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 40, Finished, Available)

Error: UPDATE destination only supports Delta sources.
Some(Project [CustomerKey#8702, WWICustomerID#8703, Customer#8704, BillToCustomer#8705, Category#8706, BuyingGroup#8707, PrimaryContact#8708, PostalCode#8709, ValidFrom#8710, ValidTo#8711, LineageKey#8712, Latitude#8713, Longitude#8714, Long#8718 AS NewLongitude#8700, Lat#8719 AS NewLatitude#8701]
+- Join LeftOuter, (WWICustomerID#8703 = CustomerID#8717)
   :- Relation spark_catalog.wideworldimporters_gold.dimension_customer[CustomerKey#8702,WWICustomerID#8703,Customer#8704,BillToCustomer#8705,Category#8706,BuyingGroup#8707,PrimaryContact#8708,PostalCode#8709,ValidFrom#8710,ValidTo#8711,LineageKey#8712,Latitude#8713,Longitude#8714] parquet
   +- Project [cast(CustomerId#8653 as int) AS CustomerId#8717, cast(Long#8715 as float) AS Long#8718, cast(Lat#8716 as float) AS Lat#8719]
      +- Project [CustomerId#8653, lon(location#8654)#8720 AS Long#8715, lat(location#8654)#8721 AS Lat#8716]
         +- Relation [CustomerID#8653,Location#8654] json
)

In [33]:
-- attempt 2: update with correlated scalar subqueries
-- this fails too: correlated scalar subqueries must be aggregated so they
-- return at most one row
-- NOTE(review): CustLocations2 exposes Lat and Long, not Latitude/Longitude, so
-- the names inside the subqueries resolve to the outer table's own columns
update WideWorldImporters_gold.Dimension_Customer c
set Latitude = (select Latitude from CustLocations2 l where l.CustomerID = c.WWICustomerID ),
    Longitude = (select Longitude from CustLocations2 l where l.CustomerID = c.WWICustomerID )

   

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 41, Finished, Available)

Error: [UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.MUST_AGGREGATE_CORRELATED_SCALAR_SUBQUERY] Unsupported subquery expression: Correlated scalar subqueries must be aggregated to return at most one row.; line 2 pos 15;
UpdateCommand Delta[version=11, ... msit-onelake.dfs.fabric.microsoft.com/54be16ef-ba86-4fd6-b494-2013153c2245/Tables/Dimension_Customer], [CustomerKey#8725, WWICustomerID#8726, Customer#8727, BillToCustomer#8728, Category#8729, BuyingGroup#8730, PrimaryContact#8731, PostalCode#8732, ValidFrom#8733, ValidTo#8734, LineageKey#8735, scalar-subquery#8723 [Latitude#8736 && WWICustomerID#8726], scalar-subquery#8724 [Longitude#8737 && WWICustomerID#8726]]
   +- SubqueryAlias c
      +- SubqueryAlias spark_catalog.WideWorldImporters_gold.Dimension_Customer
         +- Relation spark_catalog.wideworldimporters_gold.dimension_customer[CustomerKey#8725,WWICustomerID#8726,Customer#8727,BillToCustomer#8728,Category#8729,BuyingGroup#8730,PrimaryContact#8731,PostalCode#8732,ValidFrom#8733,ValidTo#8734,LineageKey#8735,Latitude#8736,Longitude#8737] parquet


In [34]:
-- attempt 3, the one that works: use MERGE to copy the coordinates onto the
-- dimension rows whose customer ID matches
MERGE INTO WideWorldImporters_gold.Dimension_Customer AS dest
USING CustLocations2 AS src
  ON src.CustomerID = dest.WWICustomerID
WHEN MATCHED THEN
  UPDATE SET Latitude = src.Lat, Longitude = src.Long

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 42, Finished, Available)

<Spark SQL result set with 1 rows and 4 fields>

In [35]:
-- check the result: the rows now carry the Latitude and Longitude columns
select * from  WideWorldImporters_gold.Dimension_Customer limit 10


StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 43, Finished, Available)

<Spark SQL result set with 10 rows and 13 fields>

## Delta table history

In [36]:
-- list the versions recorded for this Delta table
describe history WideWorldImporters_gold.Dimension_Customer

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 44, Finished, Available)

<Spark SQL result set with 13 rows and 15 fields>

In [37]:
-- time travel: read the table as it was at version 1
SELECT * FROM WideWorldImporters_gold.Dimension_Customer VERSION AS OF 1

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 45, Finished, Available)

<Spark SQL result set with 403 rows and 11 fields>

In [38]:
-- roll the table back to its state at version 1
restore table WideWorldImporters_gold.Dimension_Customer to version as of 1

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 46, Finished, Available)

<Spark SQL result set with 1 rows and 6 fields>

# Notebook Orchestration

In [39]:
%%pyspark

# Run the load notebooks in three stages: dimensions first, then the facts.
# The stages are separate statements, so each call returns before the next starts.
# NOTE(review): presumably the notebooks listed in one call run concurrently — confirm
# against the mssparkutils documentation.
mssparkutils.notebook.runMultiple(["LoadCustomerDim","LoadDateDim","LoadEmployeeDim"])
mssparkutils.notebook.runMultiple(["LoadPurchaseFact","LoadSaleFact"])
mssparkutils.notebook.runMultiple(["LoadOrderFact"])

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 47, Finished, Available)

VBox(children=(HBox(children=(HTML(value='Status: Pending', description='0'), FloatProgress(value=0.0, descrip…

VBox(children=(HBox(children=(HTML(value='Status: Pending', description='0'), FloatProgress(value=0.0, descrip…

VBox(children=(HBox(children=(HTML(value='Status: Pending', description='0'), FloatProgress(value=0.0, descrip…

{'0': {'exitVal': '', 'exception': None}}

StatementMeta(, 7ea32629-e727-40c4-a936-72b485c0e672, 48, Finished, Available)