In [26]:
%%pyspark
import pandas as pd
from datetime import datetime

StatementMeta(sparkpool03, 14, 1, Finished, Available)

In [27]:
def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in specified path and
    subfolders within maximum recursion depth.
    """

    # List all files in path
    li = mssparkutils.fs.ls(path)

    # Return all files
    for x in li:
        if x.size != 0:
            yield x

    # If the max_depth has not been reached, start
    # listing files and folders in subdirectories
    if max_depth > 1:
        for x in li:
            if x.size != 0:
                continue
            for y in deep_ls(x.path, max_depth - 1):
                yield y

    # If max_depth has been reached,
    # return the folders
    else:
        for x in li:
            if x.size == 0:
                yield x

def convertfiles2df(files):
    """
    Converts FileInfo object into Pandas DataFrame to enable display
    """
    # Disable Arrow-based transfers since the Pandas DataFrame is tiny
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    schema = ['path','name','size']
    df = pd.DataFrame([[getattr(i,j) for j in schema] for i in files], columns = schema).sort_values('path')
    return(df)

StatementMeta(sparkpool03, 14, 2, Finished, Available)

In [28]:
root = 'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/SalesOrderDetail'
files = list(deep_ls(root, max_depth=20))

StatementMeta(sparkpool03, 14, 3, Finished, Available)

In [29]:
df_list_of_files = convertfiles2df(files)

StatementMeta(sparkpool03, 14, 4, Finished, Available)

In [30]:
df_list_of_files.columns

StatementMeta(sparkpool03, 14, 5, Finished, Available)

Index(['path', 'name', 'size'], dtype='object')

In [31]:
df_list_of_files = df_list_of_files.sort_values(['path'], ascending=[False])
display(df_list_of_files)

StatementMeta(sparkpool03, 14, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, f4f0c1ec-ac64-4236-992d-8b30b41218d5)

In [32]:
source_path=df_list_of_files.iloc[0]['path']

StatementMeta(sparkpool03, 14, 7, Finished, Available)

In [33]:
source_path

StatementMeta(sparkpool03, 14, 8, Finished, Available)

'abfss://raw@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/SalesOrderDetail/Year=2022/Month=04/Day=07/Time=09:40:06/SalesOrderDetail.csv'

In [34]:
df = (spark 
        .read
        .format("csv")
        .option("inferSchema", "true")
        .option("header","true")
        .load(source_path)
)

StatementMeta(sparkpool03, 14, 9, Finished, Available)

In [35]:
df.show()

StatementMeta(sparkpool03, 14, 10, Finished, Available)

+------------+------------------+--------+---------+---------+-----------------+---------+--------------------+--------------------+
|SalesOrderID|SalesOrderDetailID|OrderQty|ProductID|UnitPrice|UnitPriceDiscount|LineTotal|             rowguid|        ModifiedDate|
+------------+------------------+--------+---------+---------+-----------------+---------+--------------------+--------------------+
|       71774|            110562|       1|      836|  356.898|              0.0|  356.898|e3a1994c-7a68-4ce...|2008-06-01 00:00:...|
|       71774|            110563|       1|      822|  356.898|              0.0|  356.898|5c77f557-fdb6-43b...|2008-06-01 00:00:...|
|       71776|            110567|       1|      907|     63.9|              0.0|     63.9|6dbfe398-d15d-425...|2008-06-01 00:00:...|
|       71780|            110616|       4|      905|  218.454|              0.0|  873.816|377246c9-4483-48e...|2008-06-01 00:00:...|
|       71780|            110617|       2|      983|  461.694|       

In [36]:
display(df.limit(10))

StatementMeta(sparkpool03, 14, 11, Finished, Available)

SynapseWidget(Synapse.DataFrame, 33af2999-0ff6-4585-af7d-162c25ba8771)

In [37]:
df.count()

StatementMeta(sparkpool03, 14, 12, Finished, Available)

542

In [38]:
df.printSchema()

StatementMeta(sparkpool03, 14, 13, Finished, Available)

root
 |-- SalesOrderID: integer (nullable = true)
 |-- SalesOrderDetailID: integer (nullable = true)
 |-- OrderQty: integer (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- UnitPriceDiscount: double (nullable = true)
 |-- LineTotal: double (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: string (nullable = true)

In [39]:
# Replace null values with zero
df = df.na.fill(0)

StatementMeta(sparkpool03, 14, 14, Finished, Available)

In [40]:
# Replace null strings with empty
df = df.na.fill("")

StatementMeta(sparkpool03, 14, 15, Finished, Available)

In [41]:
display(df.limit(10))

StatementMeta(sparkpool03, 14, 16, Finished, Available)

SynapseWidget(Synapse.DataFrame, 56f188cf-84c8-4f6b-93ae-db14f9eeed99)

In [43]:
# Create temp views to easily query with Spark SQL

df.createOrReplaceTempView('merge')

StatementMeta(sparkpool03, 14, 18, Finished, Available)

In [44]:
%%sql
MERGE INTO aw_sales_order_detail
USING merge
ON aw_sales_order_detail.SalesOrderID = merge.SalesOrderID AND
aw_sales_order_detail.SalesOrderDetailID = merge.SalesOrderDetailID
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *

StatementMeta(sparkpool03, 14, 19, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

In [45]:
%%sql
-- Retrieve the version/change history of the Delta table
DESCRIBE HISTORY aw_sales_order_deyail

StatementMeta(sparkpool03, 14, 20, Finished, Available)

Error: Table or view 'aw_sales_order_deyail' not found in database 'default'

In [46]:
%%sql
SELECT *
FROM aw_sales_order_detail
where SalesOrderID = 71774

StatementMeta(sparkpool03, 14, 21, Finished, Available)

<Spark SQL result set with 2 rows and 9 fields>

In [48]:
# Load a previous version of the DELTA_Employees table into a dataframe
df = spark.read.format("delta").option("versionAsOf", 1).load("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/SalesOrderDetail")
df.show()

StatementMeta(sparkpool03, 14, 23, Finished, Available)

+------------+------------------+--------+---------+----------+-----------------+------------+--------------------+--------------------+
|SalesOrderID|SalesOrderDetailID|OrderQty|ProductID| UnitPrice|UnitPriceDiscount|   LineTotal|             rowguid|        ModifiedDate|
+------------+------------------+--------+---------+----------+-----------------+------------+--------------------+--------------------+
|       71780|            110643|       7|      869|  41.99400|          0.00000|  293.958000|169c75f6-a364-46e...|2008-06-01 00:00:...|
|       71782|            110698|       2|      945|  54.89400|          0.00000|  109.788000|2d3aef50-9ace-4b5...|2008-06-01 00:00:...|
|       71784|            110782|       2|      916|  31.58400|          0.00000|   63.168000|deefdf0c-fa8e-40c...|2008-06-01 00:00:...|
|       71797|            111078|      11|      870|   2.89420|          0.02000|   31.199476|e35d5ac3-be8e-4f3...|2008-06-01 00:00:...|
|       71902|            112995|       7

In [50]:
display(spark.read.text("abfss://enriched@csresearchdpolaplakest.dfs.core.windows.net/erpcore/AdventureWorks/SalesOrderDetail/_delta_log/00000000000000000000.json"))

StatementMeta(sparkpool03, 14, 25, Finished, Available)

SynapseWidget(Synapse.DataFrame, 41c4cb73-095d-4046-afa5-d32381735a8b)