# Process the Customers vs Orders Synapse Link Data

In [67]:
# Load the SynapseLink Customers and Orders data into a Dataframes.
# Then, filter the Orders for just the 'order' doctype.

df_customers = spark.read\
    .format("cosmos.olap")\
    .option("spark.synapse.linkedService", "demoCosmosDB")\
    .option("spark.cosmos.container", "customers")\
    .load()

df_orders = spark.read\
    .format("cosmos.olap")\
    .option("spark.synapse.linkedService", "demoCosmosDB")\
    .option("spark.cosmos.container", "orders")\
    .load()

df_order_docs = df_orders.filter(df_orders["doctype"].isin(["order"]))

print('df_customers, shape: {} x {}'.format(
        df_customers.count(), len(df_customers.columns)))
df_customers.printSchema()

print('df_orders, shape: {} x {}'.format(
        df_orders.count(), len(df_orders.columns)))
df_orders.printSchema()

print('df_order_docs, shape: {} x {}'.format(
        df_order_docs.count(), len(df_order_docs.columns)))



StatementMeta(poolspark3s, 43, 66, Finished, Available)

df_customers, shape: 100000 x 13
root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- pk: string (nullable = true)
 |-- doctype: string (nullable = true)
 |-- customerId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- doc_epoch: long (nullable = true)
 |-- doc_time: string (nullable = true)
 |-- id: string (nullable = true)
 |-- _etag: string (nullable = true)

df_orders, shape: 1049182 x 21
root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- pk: long (nullable = true)
 |-- doctype: string (nullable = true)
 |-- orderId: long (nullable = true)
 |-- lineNumber: long (nullable = true)
 |-- customerId: string (nullable = true)
 |-- sku: long (nu

In [68]:
# Display the first few rows of the df_customers Dataframe

display(df_customers.limit(3))


StatementMeta(poolspark3s, 43, 67, Finished, Available)

SynapseWidget(Synapse.DataFrame, ac537255-a277-457b-9c12-fd990df55078)

In [69]:
# Display the first few rows of the df_order_docs Dataframe

display(df_order_docs.limit(3))

StatementMeta(poolspark3s, 43, 68, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6dd9a236-649b-4e52-901a-c4be58c06e15)

In [70]:
# Create Narrower/Minimal Dataframes for the Join operation 

from pyspark.sql.functions import col

df_customers_minimal = df_customers.select(
    col('customerId'),
    col('name'))

print('df_customers_minimal, shape: {} x {}'.format(
        df_customers_minimal.count(), len(df_customers_minimal.columns)))
df_customers_minimal.printSchema()

df_orders_minimal = df_order_docs.select(
    col('orderId'),
    col('customerId'),
    col('item_count'),
    col('order_total'))

print('df_orders_minimal, shape: {} x {}'.format(
        df_orders_minimal.count(), len(df_orders_minimal.columns)))
df_orders_minimal.printSchema()

StatementMeta(poolspark3s, 43, 69, Finished, Available)

df_customers_minimal, shape: 100000 x 2
root
 |-- customerId: string (nullable = true)
 |-- name: string (nullable = true)

df_orders_minimal, shape: 300000 x 4
root
 |-- orderId: long (nullable = true)
 |-- customerId: string (nullable = true)
 |-- item_count: long (nullable = true)
 |-- order_total: double (nullable = true)

In [71]:
# Join the (narrow) Customers to their (narrow) Order documents

df_joined = df_orders_minimal.join(df_customers_minimal, ['customerId']) \
    .sort("customerId", ascending=False)


print('df_joined, shape: {} x {}'.format(
        df_joined.count(), len(df_joined.columns)))
df_joined.printSchema()


StatementMeta(poolspark3s, 43, 70, Finished, Available)

df_joined, shape: 300000 x 5
root
 |-- customerId: string (nullable = true)
 |-- orderId: long (nullable = true)
 |-- item_count: long (nullable = true)
 |-- order_total: double (nullable = true)
 |-- name: string (nullable = true)

In [72]:
# Display the first few rows of the df_joined Dataframe

display(df_joined.limit(20))

StatementMeta(poolspark3s, 43, 71, Finished, Available)

SynapseWidget(Synapse.DataFrame, f54caf6d-cc3d-40bd-9f4c-6f8da926da01)

In [73]:
# Group the df_joined Dataframe by customerId, sum on order total and total_orders

df_grouped = df_joined.groupby("customerId") \
    .sum("order_total").alias('total_orders') \
    .sort("customerId", ascending=False)

display(df_grouped.printSchema())
print((df_grouped.count(), len(df_grouped.columns)))



StatementMeta(poolspark3s, 43, 72, Finished, Available)

root
 |-- customerId: string (nullable = true)
 |-- sum(order_total): double (nullable = true)

(95119, 2)

In [74]:
import pyspark.sql.functions as F 

df_agg = df_joined.groupBy("customerId") \
    .agg(
        F.count("customerId").alias('order_count'), \
        F.sum("order_total").alias("total_dollar_amount"), \
        F.sum("item_count").alias("total_item_count")) \
        .sort("customerId", ascending=False)

display(df_agg.printSchema())
print((df_agg.count(), len(df_agg.columns)))

StatementMeta(poolspark3s, 43, 73, Finished, Available)

root
 |-- customerId: string (nullable = true)
 |-- order_count: long (nullable = false)
 |-- total_dollar_amount: double (nullable = true)
 |-- total_item_count: long (nullable = true)

(95119, 4)

In [75]:
display(df_agg.limit(30))

StatementMeta(poolspark3s, 43, 74, Finished, Available)

SynapseWidget(Synapse.DataFrame, 78f60f87-0100-42a8-9b44-c0a9d55ae228)

In [79]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

import pyspark.sql.functions as F 

# See https://github.com/Azure-Samples/Synapse/blob/main/Notebooks/PySpark/02%20Read%20and%20write%20data%20from%20Azure%20Blob%20Storage%20WASB.ipynb

# Azure storage access info
blob_account_name   = 'cjoakimstorage'
blob_container_name = 'synapse'
blob_relative_path  = 'ecomm'
linked_service_name = 'cjoakimstorageAzureBlobStorage'

blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(
    linked_service_name)
print('blob_sas_token: {}'.format(blob_sas_token))

# Allow Spark to access from Blob remotely
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
    blob_container_name, blob_account_name, blob_relative_path)
spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (
    blob_container_name, blob_account_name), blob_sas_token)
print('Remote blob path: ' + wasbs_path)

blob_path = '{}{}'.format(wasbs_path,'sales_by_customer')

df_agg.coalesce(1).write.csv(blob_path, mode='overwrite', header='true')


StatementMeta(poolspark3s, 43, 78, Finished, Available)

blob_sas_token: ?sv=2020-02-10&ss=bf&srt=sco&se=2021-10-20T20%3A10%3A29Z&sp=rwdl&sig=sdEc8zZLuIxjJ1Yu%2BPjRxbQl3Bl5xXJiKBoeXuvc5hU%3D
Remote blob path: wasbs://synapse@cjoakimstorage.blob.core.windows.net/ecomm