In [51]:
%%pyspark
import pandas as pd
from datetime import datetime

StatementMeta(sparkpool03, 14, 26, Finished, Available)

In [52]:
def _is_directory(entry):
    """
    Tell whether a listing entry is a folder.

    Uses the entry's ``isDir`` flag when it exists (either a plain attribute
    or a zero-argument method). Entries without that flag fall back to the
    older heuristic of treating a zero-byte size as a folder.
    """
    flag = getattr(entry, "isDir", None)
    if flag is None:
        return entry.size == 0
    return bool(flag() if callable(flag) else flag)


def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in specified path and
    subfolders within maximum recursion depth.

    Parameters:
        path: Folder to list; passed as-is to ``mssparkutils.fs.ls``.
        max_depth: Number of folder levels to descend. With a value of 1
            (or less) only ``path`` itself is listed.

    Yields:
        The entries returned by ``mssparkutils.fs.ls``: first the files found
        directly in ``path``, then either the contents of each subfolder
        (while ``max_depth`` > 1) or the subfolder entries themselves (once
        the depth limit is reached).

    Zero-byte files (for example ``_SUCCESS`` markers) are reported as files.
    Classifying entries purely by ``size == 0`` would treat them as folders
    and recurse into them.
    """
    # This is a generator, so the listing call only runs once iteration starts
    entries = mssparkutils.fs.ls(path)

    # Emit files immediately; hold folders back until all files are out
    folders = []
    for entry in entries:
        if _is_directory(entry):
            folders.append(entry)
        else:
            yield entry

    if max_depth > 1:
        # Depth budget left: descend into every subfolder
        for folder in folders:
            yield from deep_ls(folder.path, max_depth - 1)
    else:
        # Depth limit reached: report the folders themselves
        yield from folders

def convertfiles2df(files):
    """
    Build a Pandas DataFrame from FileInfo objects so the listing can be displayed.

    Each object contributes one row holding its ``path``, ``name`` and
    ``size`` attributes; the returned frame is ordered by ``path``.
    """
    # The resulting Pandas DataFrame is tiny, so switch Arrow-based transfers off
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    columns = ['path','name','size']

    # One row per file, attributes read in column order
    rows = []
    for info in files:
        rows.append([getattr(info, field) for field in columns])

    listing = pd.DataFrame(rows, columns=columns)
    return listing.sort_values('path')

StatementMeta(sparkpool03, 14, 27, Finished, Available)

In [53]:
# Root folder of the Customer extracts in the "raw" container (abfss:// data lake path)
root = 'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/Customer'
# deep_ls is a generator; list() materialises the walk (up to 20 folder levels deep)
files = list(deep_ls(root, max_depth=20))

StatementMeta(sparkpool03, 14, 28, Finished, Available)

In [54]:
# Turn the listing into a Pandas DataFrame (path, name, size) so it can be sorted and displayed
df_list_of_files = convertfiles2df(files)

StatementMeta(sparkpool03, 14, 29, Finished, Available)

In [55]:
# Sanity check: show the column labels of the listing DataFrame
df_list_of_files.columns

StatementMeta(sparkpool03, 14, 30, Finished, Available)

Index(['path', 'name', 'size'], dtype='object')

In [56]:
# Re-order the listing by path, descending, and render it.
# NOTE(review): with Year=/Month=/Day=/Time= folders this puts the newest extract
# first, presumably because the partition values are zero-padded; confirm.
df_list_of_files = df_list_of_files.sort_values(['path'], ascending=[False])
display(df_list_of_files)

StatementMeta(sparkpool03, 14, 31, Finished, Available)

SynapseWidget(Synapse.DataFrame, 729befe8-fec7-440d-b6cf-1bbaa3104b53)

In [57]:
# Select the most recent extract. The folders are partitioned as
# Year=/Month=/Day=/Time=, so the greatest path is the newest file
# (assumes the partition values are zero-padded; TODO confirm).
# Taking the maximum directly does not depend on the row order left behind
# by an earlier cell, and an empty listing fails with a clear message.
if df_list_of_files.empty:
    raise FileNotFoundError('No files were found in the listing; cannot select a source file')
source_path = df_list_of_files['path'].max()

StatementMeta(sparkpool03, 14, 32, Finished, Available)

In [58]:
# Echo the selected file path to verify which extract will be loaded
source_path

StatementMeta(sparkpool03, 14, 33, Finished, Available)

'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/Customer/Year=2022/Month=04/Day=07/Time=09:48:11/Customer.csv'

In [59]:
# Load the selected CSV extract into a Spark DataFrame.
# header=true: the first line supplies the column names.
# inferSchema=true: column types are inferred from the data instead of all being strings.
df = (spark 
        .read
        .format("csv")
        .option("inferSchema", "true")
        .option("header","true")
        .load(source_path)
)

StatementMeta(sparkpool03, 14, 34, Finished, Available)

In [60]:
# Print a plain-text preview of the loaded rows (long values are truncated)
df.show()

StatementMeta(sparkpool03, 14, 35, Finished, Available)

+----------+---------+-----+-----------+----------+----------+------+--------------------+--------------------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+
|CustomerID|NameStyle|Title|  FirstName|MiddleName|  LastName|Suffix|         CompanyName|         SalesPerson|        EmailAddress|              Phone|        PasswordHash|PasswordSalt|             rowguid|        ModifiedDate|
+----------+---------+-----+-----------+----------+----------+------+--------------------+--------------------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+
|         1|    false|  Mr.|    Orlando|        N.|       Gee|  null|        A Bike Store|adventure-works\p...|orlando0@adventur...|       245-555-0173|L/Rlwxzp4w7RWmEgX...|    1KjXYs4=|3f5ae95e-b87d-4ae...|2005-08-01 00:00:...|
|         2|    false|  Mr.|      Keith|      null|    Harris|  null|  Progressive S

In [61]:
# Render the first 10 rows in the notebook's table widget
display(df.limit(10))

StatementMeta(sparkpool03, 14, 36, Finished, Available)

SynapseWidget(Synapse.DataFrame, 47c1da0b-ee60-4233-9ebc-cb3845245d23)

In [62]:
# Number of rows read from the extract
df.count()

StatementMeta(sparkpool03, 14, 37, Finished, Available)

847

In [68]:
# Show column names, types and nullability of the DataFrame
df.printSchema()

StatementMeta(sparkpool03, 14, 43, Finished, Available)

root
 |-- CustomerID: integer (nullable = true)
 |-- NameStyle: boolean (nullable = true)
 |-- Title: string (nullable = false)
 |-- FirstName: string (nullable = false)
 |-- MiddleName: string (nullable = false)
 |-- LastName: string (nullable = false)
 |-- Suffix: string (nullable = false)
 |-- CompanyName: string (nullable = false)
 |-- SalesPerson: string (nullable = false)
 |-- EmailAddress: string (nullable = false)
 |-- Phone: string (nullable = false)
 |-- PasswordHash: string (nullable = false)
 |-- PasswordSalt: string (nullable = false)
 |-- rowguid: string (nullable = false)
 |-- ModifiedDate: string (nullable = false)

In [69]:
# Replace nulls in numeric columns with 0 (a numeric fill value leaves
# string and boolean columns untouched).
# NOTE(review): this also turns a null CustomerID into 0, and CustomerID is the
# key used by the MERGE; confirm that is intended.
df = df.na.fill(0)

StatementMeta(sparkpool03, 14, 44, Finished, Available)

In [70]:
# Replace nulls in string columns with an empty string
# (a string fill value only applies to string-typed columns)
df = df.na.fill("")

StatementMeta(sparkpool03, 14, 45, Finished, Available)

In [71]:
# Render the first 10 rows again to check the result of the null replacement
display(df.limit(10))

StatementMeta(sparkpool03, 14, 46, Finished, Available)

SynapseWidget(Synapse.DataFrame, 0cc79cc2-7f68-4499-a4b0-cb214d427c8b)

In [72]:
# Register the DataFrame as the temp view 'merge' so %%sql cells can reference it

df.createOrReplaceTempView('merge')

StatementMeta(sparkpool03, 14, 47, Finished, Available)

In [73]:
%%sql
-- Upsert the rows of the 'merge' temp view into aw_customer, matching on CustomerID:
-- matched rows have all columns overwritten, unmatched rows are inserted.
MERGE INTO aw_customer
USING merge
ON aw_customer.CustomerID = merge.CustomerID
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *

StatementMeta(sparkpool03, 14, 48, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [74]:
%%sql
-- Retrieve the version/change history of the aw_customer Delta table
-- (one row per table version)
DESCRIBE HISTORY aw_customer

StatementMeta(sparkpool03, 14, 49, Finished, Available)

<Spark SQL result set with 2 rows and 14 fields>

In [75]:
%%sql
-- Spot check: look at a single customer in the merged table
SELECT *
FROM aw_customer
where CustomerID = 10

StatementMeta(sparkpool03, 14, 50, Finished, Available)

<Spark SQL result set with 1 rows and 15 fields>

In [77]:
# Load version 1 of the Customer Delta table (time travel via versionAsOf) into a dataframe.
# NOTE: this rebinds `df`, which until now held the CSV extract.
df = spark.read.format("delta").option("versionAsOf", 1).load("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/Customer")
df.show()

StatementMeta(sparkpool03, 14, 52, Finished, Available)

+----------+---------+-----+---------+----------+---------+------+--------------------+--------------------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+
|CustomerID|NameStyle|Title|FirstName|MiddleName| LastName|Suffix|         CompanyName|         SalesPerson|        EmailAddress|              Phone|        PasswordHash|PasswordSalt|             rowguid|        ModifiedDate|
+----------+---------+-----+---------+----------+---------+------+--------------------+--------------------+--------------------+-------------------+--------------------+------------+--------------------+--------------------+
|       185|    false|  Mr.|  Sandeep|          | Kaliyath|      |  Weekend Bike Tours|adventure-works\shu0|sandeep1@adventur...|       495-555-0113|Rrgza3uGhmNF9StSE...|    ekE0M9I=|5014e91d-7b0e-495...|2007-07-01 00:00:...|
|       353|    false|  Ms.|     Joan|        M.| Campbell|      |Two-Wheeled Trans...|adventure

In [78]:
# Show the raw contents of the table's first transaction-log file (_delta_log/...0.json), one text line per row
display(spark.read.text("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/Customer/_delta_log/00000000000000000000.json"))

StatementMeta(sparkpool03, 14, 53, Finished, Available)

SynapseWidget(Synapse.DataFrame, 4d33a2f1-8d1b-43a0-b0d7-740136fca20d)