In [1]:
%%pyspark
import pandas as pd
from datetime import datetime

StatementMeta(sparkpool03, 16, 1, Finished, Available)

In [2]:
def _is_directory(entry) -> bool:
    """Return True when a listing entry represents a folder."""
    flag = getattr(entry, "isDir", None)
    if flag is None:
        # No explicit flag on this entry type: keep the original heuristic
        # (folders are reported with size 0).
        return entry.size == 0
    # Accept both attribute-style and method-style flags.
    return bool(flag() if callable(flag) else flag)


def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in specified path and
    subfolders within maximum recursion depth.

    Args:
        path: Folder to list, passed as-is to ``mssparkutils.fs.ls``.
        max_depth: Number of folder levels to descend. With the default of 1
            only ``path`` itself is listed.

    Yields:
        The entries returned by ``mssparkutils.fs.ls``: first the files found
        directly in ``path``, then either the contents of its subfolders
        (while ``max_depth`` > 1) or the subfolder entries themselves (once
        the depth limit is reached).

    Folders are recognised by the entry's ``isDir`` flag when it is present,
    so zero-byte files are reported as files instead of being listed again as
    if they were folders. Entries without that flag fall back to the
    ``size == 0`` check.
    """
    entries = mssparkutils.fs.ls(path)

    # Single pass: emit files immediately, remember folders for later so the
    # output order stays "files first, then folders".
    folders = []
    for entry in entries:
        if _is_directory(entry):
            folders.append(entry)
        else:
            yield entry

    if max_depth > 1:
        # Depth budget left: descend into each subfolder.
        for folder in folders:
            yield from deep_ls(folder.path, max_depth - 1)
    else:
        # Depth limit reached: report the folders themselves.
        yield from folders

def convertfiles2df(files):
    """
    Build a Pandas DataFrame from an iterable of FileInfo-like objects.

    Each object contributes one row made of its ``path``, ``name`` and
    ``size`` attributes; the rows are returned ordered by ``path``.
    As a side effect the Spark session setting
    ``spark.sql.execution.arrow.enabled`` is set to ``"false"``.
    """
    # The resulting frame is tiny, so Arrow-based transfers are switched off
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    columns = ['path', 'name', 'size']

    # One row per entry, attributes picked in column order
    rows = []
    for info in files:
        rows.append([getattr(info, column) for column in columns])

    frame = pd.DataFrame(rows, columns=columns)
    return frame.sort_values('path')

StatementMeta(sparkpool03, 16, 2, Finished, Available)

In [3]:
# Folder (in the "raw" container) whose contents are scanned for input files.
root = 'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/ProductCategory'
# deep_ls is a generator; materialise it into a list so later cells can reuse the
# listing. The scan descends at most 20 folder levels below `root`.
files = list(deep_ls(root, max_depth=20))

StatementMeta(sparkpool03, 16, 3, Finished, Available)

In [4]:
# Turn the listing into a pandas DataFrame with the columns path, name and size.
df_list_of_files = convertfiles2df(files)

StatementMeta(sparkpool03, 16, 4, Finished, Available)

In [5]:
# Inspect the column names of the listing DataFrame.
df_list_of_files.columns

StatementMeta(sparkpool03, 16, 5, Finished, Available)

Index(['path', 'name', 'size'], dtype='object')

In [6]:
# Sort by path in descending (string) order so that the greatest path comes first.
# NOTE(review): the next cell treats the first row as the file to load; this presumably
# selects the newest extract because the folders are named Year=/Month=/Day=/Time= —
# confirm the folder names are always zero-padded so string order equals time order.
df_list_of_files = df_list_of_files.sort_values(['path'], ascending=[False])
display(df_list_of_files)

StatementMeta(sparkpool03, 16, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, 3c3b74cf-77a9-4b15-a8f0-4cc5a41b7e95)

In [7]:
# Use the first row of the descending sort as the file to load.
# Raises IndexError when the listing is empty.
source_path=df_list_of_files.iloc[0]['path']

StatementMeta(sparkpool03, 16, 7, Finished, Available)

In [8]:
# Echo the selected path for a visual check.
source_path

StatementMeta(sparkpool03, 16, 8, Finished, Available)

'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/ProductCategory/Year=2022/Month=04/Day=07/Time=10:23:41/ProductCategory.csv'

In [9]:
# Read the selected CSV file into a Spark DataFrame.
# The first line is used as column names ("header") and column types are
# inferred from the data ("inferSchema") instead of defaulting to string.
df = (spark 
        .read
        .format("csv")
        .option("inferSchema", "true")
        .option("header","true")
        .load(source_path)
)

StatementMeta(sparkpool03, 16, 9, Finished, Available)

In [10]:
# Plain-text preview of the loaded data (show() prints at most 20 rows by default
# and truncates long cell values).
df.show()

StatementMeta(sparkpool03, 16, 10, Finished, Available)

+-----------------+-----------------------+---------------+--------------------+--------------------+
|ProductCategoryID|ParentProductCategoryID|           Name|             rowguid|        ModifiedDate|
+-----------------+-----------------------+---------------+--------------------+--------------------+
|                1|                   null|          Bikes|cfbda25c-df71-47a...|2002-06-01 00:00:...|
|                2|                   null|     Components|c657828d-d808-4ab...|2002-06-01 00:00:...|
|                3|                   null|       Clothing|10a7c342-ca82-48d...|2002-06-01 00:00:...|
|                4|                   null|    Accessories|2be3be36-d9a2-4ee...|2002-06-01 00:00:...|
|                5|                      1| Mountain Bikes|2d364ade-264a-433...|2002-06-01 00:00:...|
|                6|                      1|     Road Bikes|000310c0-bcc8-42c...|2002-06-01 00:00:...|
|                7|                      1|  Touring Bikes|02c5061d-ecdc-427...|20

In [11]:
# Render the first 10 rows with the notebook's table widget.
display(df.limit(10))

StatementMeta(sparkpool03, 16, 11, Finished, Available)

SynapseWidget(Synapse.DataFrame, 892f72f5-7971-4403-aecc-b52d2d228cac)

In [12]:
# Number of rows read from the source file.
df.count()

StatementMeta(sparkpool03, 16, 12, Finished, Available)

41

In [13]:
# Check the column types that schema inference produced.
df.printSchema()

StatementMeta(sparkpool03, 16, 13, Finished, Available)

root
 |-- ProductCategoryID: integer (nullable = true)
 |-- ParentProductCategoryID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)

In [14]:
# Replace nulls in numeric columns with zero (a numeric fill value leaves
# string columns untouched).
# NOTE(review): this turns the null ParentProductCategoryID of the top-level
# categories into 0 before the data is merged — confirm that is intended.
df = df.na.fill(0)

StatementMeta(sparkpool03, 16, 14, Finished, Available)

In [15]:
# Replace nulls in string columns with an empty string (a string fill value
# leaves non-string columns untouched).
df = df.na.fill("")

StatementMeta(sparkpool03, 16, 15, Finished, Available)

In [16]:
# Show the first 10 rows again to check the result of the null replacement.
display(df.limit(10))

StatementMeta(sparkpool03, 16, 16, Finished, Available)

SynapseWidget(Synapse.DataFrame, 1b9dc2a8-99ff-445e-ac94-c5de8585c5f2)

In [17]:
# Register the DataFrame as the temp view `merge` so that it can be
# referenced from Spark SQL cells.

df.createOrReplaceTempView('merge')

StatementMeta(sparkpool03, 16, 17, Finished, Available)

In [18]:
%%sql
-- Upsert the rows of the `merge` temp view into aw_product_category,
-- matching on ProductCategoryID: rows that already exist are overwritten
-- with the source values, rows that do not exist yet are inserted.
MERGE INTO aw_product_category
USING merge
ON aw_product_category.ProductCategoryID = merge.ProductCategoryID
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *

StatementMeta(sparkpool03, 16, 18, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [19]:
%%sql
-- Retrieve the version/change history of the aw_product_category Delta table
-- (one row per recorded table version).
DESCRIBE HISTORY aw_product_category

StatementMeta(sparkpool03, 16, 19, Finished, Available)

<Spark SQL result set with 2 rows and 14 fields>

In [21]:
%%sql
-- Spot-check a single row of the target table (ProductCategoryID = 1).
SELECT *
FROM aw_product_category
where ProductCategoryID = 1

StatementMeta(sparkpool03, 16, 21, Finished, Available)

<Spark SQL result set with 1 rows and 5 fields>

In [22]:
# Load version 1 of the Delta table stored at the enriched ProductCategory path
# into a dataframe (Delta time travel via the "versionAsOf" option).
# NOTE(review): DESCRIBE HISTORY earlier in this notebook returned 2 rows, so
# version 1 may be the latest version rather than a previous one — confirm the
# intended version number.
# NOTE(review): this rebinds `df`, which until here held the CSV data that was
# registered as the `merge` temp view.
df = spark.read.format("delta").option("versionAsOf", 1).load("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/ProductCategory")
df.show()

StatementMeta(sparkpool03, 16, 22, Finished, Available)

+-----------------+-----------------------+-----------------+--------------------+--------------------+
|ProductCategoryID|ParentProductCategoryID|             Name|             rowguid|        ModifiedDate|
+-----------------+-----------------------+-----------------+--------------------+--------------------+
|               32|                      4|Bottles and Cages|9b7dff41-9fa3-477...|2002-06-01 00:00:...|
|               41|                      4|  Tires and Tubes|3c17c9ae-e906-48b...|2002-06-01 00:00:...|
|               36|                      4|  Hydration Packs|646a8906-fc87-426...|2002-06-01 00:00:...|
|                5|                      1|   Mountain Bikes|2d364ade-264a-433...|2002-06-01 00:00:...|
|                7|                      1|    Touring Bikes|02c5061d-ecdc-427...|2002-06-01 00:00:...|
|               31|                      4|      Bike Stands|43b445c8-b820-424...|2002-06-01 00:00:...|
|               13|                      2|      Derailleurs|183

In [23]:
# Display the raw text of the first file in the table's _delta_log folder;
# spark.read.text returns one row per line of the file.
display(spark.read.text("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/ProductCategory/_delta_log/00000000000000000000.json"))

StatementMeta(sparkpool03, 16, 23, Finished, Available)

SynapseWidget(Synapse.DataFrame, 44a70a04-d5bc-4fed-9a29-a10569cfe95e)