In [1]:
%%pyspark
import pandas as pd
from datetime import datetime

StatementMeta(sparkpool03, 17, 1, Finished, Available)

In [2]:
def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in specified path and
    subfolders within maximum recursion depth.
    """

    # List all files in path
    li = mssparkutils.fs.ls(path)

    # Return all files
    for x in li:
        if x.size != 0:
            yield x

    # If the max_depth has not been reached, start
    # listing files and folders in subdirectories
    if max_depth > 1:
        for x in li:
            if x.size != 0:
                continue
            for y in deep_ls(x.path, max_depth - 1):
                yield y

    # If max_depth has been reached,
    # return the folders
    else:
        for x in li:
            if x.size == 0:
                yield x

def convertfiles2df(files):
    """
    Converts FileInfo object into Pandas DataFrame to enable display
    """
    # Disable Arrow-based transfers since the Pandas DataFrame is tiny
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    schema = ['path','name','size']
    df = pd.DataFrame([[getattr(i,j) for j in schema] for i in files], columns = schema).sort_values('path')
    return(df)

StatementMeta(sparkpool03, 17, 2, Finished, Available)

In [3]:
root = 'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/ProductModel'
files = list(deep_ls(root, max_depth=20))

StatementMeta(sparkpool03, 17, 3, Finished, Available)

In [4]:
df_list_of_files = convertfiles2df(files)

StatementMeta(sparkpool03, 17, 4, Finished, Available)

In [5]:
df_list_of_files.columns

StatementMeta(sparkpool03, 17, 5, Finished, Available)

Index(['path', 'name', 'size'], dtype='object')

In [6]:
df_list_of_files = df_list_of_files.sort_values(['path'], ascending=[False])
display(df_list_of_files)

StatementMeta(sparkpool03, 17, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, ae138163-856f-48dc-8d67-45eb31c53223)

In [7]:
source_path=df_list_of_files.iloc[0]['path']

StatementMeta(sparkpool03, 17, 7, Finished, Available)

In [8]:
source_path

StatementMeta(sparkpool03, 17, 8, Finished, Available)

'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/ProductModel/Year=2022/Month=04/Day=07/Time=10:29:19/ProductModel.csv'

In [9]:
df = (spark 
        .read
        .format("csv")
        .option("inferSchema", "true")
        .option("header","true")
        .load(source_path)
)

StatementMeta(sparkpool03, 17, 9, Finished, Available)

In [10]:
df.show()

StatementMeta(sparkpool03, 17, 10, Finished, Available)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|      ProductModelID|                Name|  CatalogDescription|             rowguid|        ModifiedDate|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|                   1|        Classic Vest|                null|29321d47-1e4c-4aa...|2007-06-01 00:00:...|
|                   2|         Cycling Cap|                null|474fb654-3c96-4cb...|2005-06-01 00:00:...|
|                   3|  Full-Finger Gloves|                null|a75483fe-3c47-4aa...|2006-06-01 00:00:...|
|                   4|  Half-Finger Gloves|                null|14b56f2a-d4aa-40a...|2006-06-01 00:00:...|
|                   5|   HL Mountain Frame|                null|fdd5407b-c2db-49d...|2005-06-01 00:00:...|
|                   6|       HL Road Frame|                null|4d332ecc-48b3-4e0...|2002-05-02 00:00:...|
|                   7|    HL Touring 

In [11]:
display(df.limit(10))

StatementMeta(sparkpool03, 16, 11, Finished, Available)

SynapseWidget(Synapse.DataFrame, 892f72f5-7971-4403-aecc-b52d2d228cac)

In [11]:
df.count()

StatementMeta(sparkpool03, 17, 11, Finished, Available)

164

In [12]:
df.printSchema()

StatementMeta(sparkpool03, 17, 12, Finished, Available)

root
 |-- ProductModelID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- CatalogDescription: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)

In [13]:
# Replace null values with zero
df = df.na.fill(0)

StatementMeta(sparkpool03, 17, 13, Finished, Available)

In [14]:
# Replace null strings with empty
df = df.na.fill("")

StatementMeta(sparkpool03, 17, 14, Finished, Available)

In [15]:
display(df.limit(10))

StatementMeta(sparkpool03, 17, 15, Finished, Available)

SynapseWidget(Synapse.DataFrame, df6ea368-35c6-460e-8f1b-348eedba6dba)

In [17]:
# Create temp views to easily query with Spark SQL

df.createOrReplaceTempView('merge')

StatementMeta(sparkpool03, 17, 17, Finished, Available)

In [18]:
%%sql
MERGE INTO aw_product_model
USING merge
ON aw_product_model.ProductModelID = merge.ProductModelID
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *

StatementMeta(sparkpool03, 17, 18, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [19]:
%%sql
-- Retrieve the version/change history of the Delta table
DESCRIBE HISTORY aw_product_model

StatementMeta(sparkpool03, 17, 19, Finished, Available)

<Spark SQL result set with 2 rows and 14 fields>

In [20]:
%%sql
SELECT *
FROM aw_product_model
where ProductModelID = 1

StatementMeta(sparkpool03, 17, 20, Finished, Available)

<Spark SQL result set with 1 rows and 5 fields>

In [21]:
# Load a previous version of the DELTA_Employees table into a dataframe
df = spark.read.format("delta").option("versionAsOf", 1).load("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/ProductModel")
df.show()

StatementMeta(sparkpool03, 17, 21, Finished, Available)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|      ProductModelID|                Name|  CatalogDescription|             rowguid|        ModifiedDate|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 ...|                    |                    |                    |                    |
|                 ...|                    |                    |                    |                    |
|                 ...|                    |                    |                    |                    |
|                 ...|                    |                    |                    |                    |
|                  34|        Touring-1000|<?xml-stylesheet ...|                    |                    |
|                  19|        Mountain-100|<?xml-stylesheet ...|                    |                    |
|                 ...|               

In [22]:
display(spark.read.text("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/ProductModel/_delta_log/00000000000000000000.json"))

StatementMeta(sparkpool03, 17, 22, Finished, Available)

SynapseWidget(Synapse.DataFrame, 511ebad2-4ac7-4d38-9585-7079a65eec94)