In [3]:
%%pyspark
import pandas as pd
from datetime import datetime

StatementMeta(sparkpool03, 18, 3, Finished, Available)

In [4]:
def _is_directory(entry) -> bool:
    """
    Tell whether a listing entry is a folder.

    The explicit ``isDir`` flag is used when the entry exposes one (either as
    an attribute or as a zero-argument method). Only when no such flag exists
    does this fall back to the "size == 0" heuristic, which cannot tell an
    empty file from a folder.
    """
    flag = getattr(entry, "isDir", None)
    if flag is None:
        return entry.size == 0
    if callable(flag):
        flag = flag()
    return bool(flag)


def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in specified path and
    subfolders within maximum recursion depth.

    Parameters:
        path: folder to list; passed to mssparkutils.fs.ls.
        max_depth: number of folder levels to descend. With the default of 1
            only `path` itself is listed.

    Yields:
        The entries returned by mssparkutils.fs.ls: first the files found
        directly in `path`, then (while max_depth > 1) the contents of each
        subfolder, recursively. Once the depth limit is reached the
        subfolders themselves are yielded instead of being expanded.
    """
    # Single listing call per folder; split it into files and folders once.
    entries = mssparkutils.fs.ls(path)

    folders = []
    for entry in entries:
        if _is_directory(entry):
            folders.append(entry)
        else:
            # Files (including zero-byte ones) are always returned as-is.
            yield entry

    if max_depth > 1:
        # Depth budget left: expand every subfolder with one level less.
        for folder in folders:
            yield from deep_ls(folder.path, max_depth - 1)
    else:
        # Depth limit reached: report the folders rather than their contents.
        yield from folders

def convertfiles2df(files):
    """
    Build a Pandas DataFrame from FileInfo objects so the listing can be
    displayed.

    One row is produced per object, holding its path, name and size
    attributes; the rows are returned sorted by path.
    """
    # Arrow-based transfers are switched off because the Pandas DataFrame is tiny
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    columns = ['path','name','size']

    # Collect one [path, name, size] record per file object
    records = []
    for info in files:
        records.append([getattr(info, column) for column in columns])

    listing = pd.DataFrame(records, columns = columns)
    return listing.sort_values('path')

StatementMeta(sparkpool03, 18, 4, Finished, Available)

In [5]:
# Root folder of the raw Product extracts in the data lake
root = 'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/Product'

# Materialise the recursive listing, descending at most 20 folder levels
files = [entry for entry in deep_ls(root, max_depth=20)]

StatementMeta(sparkpool03, 18, 5, Finished, Available)

In [6]:
# Turn the file listing into a Pandas DataFrame (columns: path, name, size)
df_list_of_files = convertfiles2df(files)

StatementMeta(sparkpool03, 18, 6, Finished, Available)

In [7]:
# Inspect the column names of the listing DataFrame
df_list_of_files.columns

StatementMeta(sparkpool03, 18, 7, Finished, Available)

Index(['path', 'name', 'size'], dtype='object')

In [8]:
# Reorder the listing by path in descending order, then render it
df_list_of_files = df_list_of_files.sort_values(by='path', ascending=False)
display(df_list_of_files)

StatementMeta(sparkpool03, 18, 8, Finished, Available)

SynapseWidget(Synapse.DataFrame, 55d8620d-5e78-4c32-8e6f-e9c8151b4456)

In [9]:
# Select the file to load: the greatest path in string order. Taking max()
# gives the same result as the first row of a descending sort, but does not
# depend on the sort cell having been run beforehand.
if df_list_of_files.empty:
    # Fail with a clear message instead of an out-of-bounds indexing error
    raise ValueError("No files were listed under the source folder; nothing to load")
source_path = df_list_of_files['path'].max()

StatementMeta(sparkpool03, 18, 9, Finished, Available)

In [8]:
# Show the path that was selected for loading
source_path

StatementMeta(sparkpool03, 15, 8, Finished, Available)

'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/Product/Year=2022/Month=04/Day=07/Time=10:10:18/Product.csv'

In [10]:
# Configure a CSV reader: the first row holds the column names and the
# column types are inferred from the data
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("inferSchema", "true")
csv_reader = csv_reader.option("header","true")

# Read the selected source file into a Spark DataFrame
df = csv_reader.load(source_path)

StatementMeta(sparkpool03, 18, 10, Finished, Available)

In [11]:
# Preview the loaded CSV data as a plain-text table in the cell output
df.show()

StatementMeta(sparkpool03, 18, 11, Finished, Available)

+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+--------------------+----------------+--------------------+----------------------+--------------------+--------------------+
|ProductID|                Name|ProductNumber|Color|StandardCost|ListPrice|Size| Weight|ProductCategoryID|ProductModelID|       SellStartDate|         SellEndDate|DiscontinuedDate|      ThumbNailPhoto|ThumbnailPhotoFileName|             rowguid|        ModifiedDate|
+---------+--------------------+-------------+-----+------------+---------+----+-------+-----------------+--------------+--------------------+--------------------+----------------+--------------------+----------------------+--------------------+--------------------+
|      680|HL Road Frame - B...|   FR-R92B-58|Black|     1059.31|   1431.5|  58|1016.04|               18|             6|2002-06-01 00:00:...|                null|            null|0x474946383961500..

In [12]:
# Render the first 10 rows with the notebook's display widget
display(df.limit(10))

StatementMeta(sparkpool03, 18, 12, Finished, Available)

SynapseWidget(Synapse.DataFrame, 36771e87-3c03-45b4-9cbb-d5dab7bf0a42)

In [13]:
# Count the rows read from the source file
df.count()

StatementMeta(sparkpool03, 18, 13, Finished, Available)

295

In [14]:
# Print the column names together with the inferred data types
df.printSchema()

StatementMeta(sparkpool03, 18, 14, Finished, Available)

root
 |-- ProductID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- ProductNumber: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- StandardCost: double (nullable = true)
 |-- ListPrice: double (nullable = true)
 |-- Size: string (nullable = true)
 |-- Weight: double (nullable = true)
 |-- ProductCategoryID: integer (nullable = true)
 |-- ProductModelID: integer (nullable = true)
 |-- SellStartDate: string (nullable = true)
 |-- SellEndDate: string (nullable = true)
 |-- DiscontinuedDate: string (nullable = true)
 |-- ThumbNailPhoto: string (nullable = true)
 |-- ThumbnailPhotoFileName: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)

In [15]:
# Null handling, step 1: substitute 0 for null values
# (a numeric fill value applies to the numeric columns)
df = df.fillna(0)

StatementMeta(sparkpool03, 18, 15, Finished, Available)

In [16]:
# Null handling, step 2: substitute the empty string for null values
# (a string fill value applies to the string columns)
df = df.fillna("")

StatementMeta(sparkpool03, 18, 16, Finished, Available)

In [17]:
# Re-check the first 10 rows after the null replacement
display(df.limit(10))

StatementMeta(sparkpool03, 18, 17, Finished, Available)

SynapseWidget(Synapse.DataFrame, 16bc3d9c-0415-46a2-9fa9-ead5b3a05c23)

In [18]:
# Register the DataFrame as the temp view 'merge' so it can be queried with Spark SQL

df.createOrReplaceTempView('merge')

StatementMeta(sparkpool03, 18, 18, Finished, Available)

In [19]:
%%sql
-- Upsert the rows of the temp view `merge` into aw_product, matched on ProductID:
-- rows with a matching ProductID are updated with all source columns,
-- rows without a match are inserted.
MERGE INTO aw_product
USING merge
ON aw_product.ProductID = merge.ProductID
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *

StatementMeta(sparkpool03, 18, 19, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [20]:
%%sql
-- Retrieve the version/change history of the aw_product Delta table,
-- e.g. to confirm that the MERGE above was recorded
DESCRIBE HISTORY aw_product

StatementMeta(sparkpool03, 18, 20, Finished, Available)

<Spark SQL result set with 2 rows and 14 fields>

In [21]:
%%sql
-- Spot-check a single product row in the target table
SELECT *
FROM aw_product
WHERE ProductID = 680

StatementMeta(sparkpool03, 18, 21, Finished, Available)

<Spark SQL result set with 1 rows and 17 fields>

In [22]:
# Load a previous version (versionAsOf = 1) of the Product Delta table into a dataframe
# NOTE(review): this rebinds `df`, replacing the DataFrame built from the CSV file
df = spark.read.format("delta").option("versionAsOf", 1).load("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/Product")
df.show()

StatementMeta(sparkpool03, 18, 22, Finished, Available)

+---------+--------------------+-------------+------------+------------+----------+----+-----------+-----------------+--------------+--------------------+--------------------+----------------+--------------------+----------------------+--------------------+--------------------+
|ProductID|                Name|ProductNumber|       Color|StandardCost| ListPrice|Size|     Weight|ProductCategoryID|ProductModelID|       SellStartDate|         SellEndDate|DiscontinuedDate|      ThumbNailPhoto|ThumbnailPhotoFileName|             rowguid|        ModifiedDate|
+---------+--------------------+-------------+------------+------------+----------+----+-----------+-----------------+--------------+--------------------+--------------------+----------------+--------------------+----------------------+--------------------+--------------------+
|      769|  Road-650 Black, 48|   BK-R50B-48|       Black|   486.70660| 782.99000|  48| 8677.18000|                6|            30|2005-07-01 00:00:...|2007-06-3

In [23]:
# Show the raw text of the 00000000000000000000.json file in the table's _delta_log folder
display(spark.read.text("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/Product/_delta_log/00000000000000000000.json"))

StatementMeta(sparkpool03, 18, 23, Finished, Available)

SynapseWidget(Synapse.DataFrame, 03034be2-0db1-43a1-af2f-5bc0d89d9dab)