# Process the Customers vs Orders Synapse Link Data

In [45]:
# Load the SynapseLink Customers and Orders SynapseLink Data into a Dataframes.

from pyspark.sql.functions import col

df_customers = spark.read\
    .format("cosmos.olap")\
    .option("spark.synapse.linkedService", "demoCosmosDB")\
    .option("spark.cosmos.container", "customers")\
    .load().filter(col("_ts") > min_timestamp)

# initialize variables
use_push_down_predicate = True
df_orders, df_order_docs = None, None
min_timestamp = 1635168000

if use_push_down_predicate == True:
    # this is more efficient, as the dataframe is filtered on load
    df_order_docs = spark.read\
        .format("cosmos.olap")\
        .option("spark.synapse.linkedService", "demoCosmosDB")\
        .option("spark.cosmos.container", "orders")\
        .load().filter(col("doctype") == "order").filter(col("_ts") > min_timestamp) 
        # push-down predicate filters on doctype and _ts 
else:
    # this is less efficient, as the dataframe is filtered after load
    df_orders = spark.read\
        .format("cosmos.olap")\
        .option("spark.synapse.linkedService", "demoCosmosDB")\
        .option("spark.cosmos.container", "orders")\
        .load()

    print('df_orders, shape: {} x {}'.format(
        df_orders.count(), len(df_orders.columns)))
    df_orders.printSchema()

    df_order_docs = df_orders.filter(df_orders["doctype"].isin(["order"]))

print('df_customers, shape: {} x {}'.format(
        df_customers.count(), len(df_customers.columns)))
df_customers.printSchema()

print('df_order_docs, shape: {} x {}'.format(
        df_order_docs.count(), len(df_order_docs.columns)))

# df_orders, shape: 4199812 x 26
# df_order_docs, shape: 1200000 x 26
# df_order_docs, shape: 300000 x 26

StatementMeta(poolspark3s, 54, 45, Finished, Available)

df_customers, shape: 100000 x 14
root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- pk: string (nullable = true)
 |-- doctype: string (nullable = true)
 |-- customerId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- doc_epoch: long (nullable = true)
 |-- doc_time: string (nullable = true)
 |-- id: string (nullable = true)
 |-- _etag: string (nullable = true)
 |-- customer_id: string (nullable = true)

df_order_docs, shape: 300000 x 26

In [46]:
# Display the first few rows of the df_customers Dataframe

display(df_customers.limit(3))


StatementMeta(poolspark3s, 54, 46, Finished, Available)

SynapseWidget(Synapse.DataFrame, 3a736dec-2fbb-406a-9e34-b3ddda257a57)

In [47]:
# Display the first few rows of the df_order_docs Dataframe

display(df_order_docs.limit(3))

StatementMeta(poolspark3s, 54, 47, Finished, Available)

SynapseWidget(Synapse.DataFrame, d064a02b-1f8b-4c31-8eee-1c793981f014)

In [48]:
# Create Narrower/Minimal Dataframes for the Join operation 

from pyspark.sql.functions import col

df_customers_minimal = df_customers.select(
    col('id'),
    col('customer_id'),
    col('name'))

print('df_customers_minimal, shape: {} x {}'.format(
        df_customers_minimal.count(), len(df_customers_minimal.columns)))
df_customers_minimal.printSchema()

df_orders_minimal = df_order_docs.select(
    col('order_id'),
    col('customer_id'),
    col('item_count'),
    col('order_total'))

print('df_orders_minimal, shape: {} x {}'.format(
        df_orders_minimal.count(), len(df_orders_minimal.columns)))
df_orders_minimal.printSchema()

StatementMeta(poolspark3s, 54, 48, Finished, Available)

df_customers_minimal, shape: 100000 x 3
root
 |-- id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)

df_orders_minimal, shape: 300000 x 4
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- item_count: long (nullable = true)
 |-- order_total: double (nullable = true)

In [49]:
# Join the (narrow) Customers to their (narrow) Order documents

df_joined = df_orders_minimal.join(df_customers_minimal, ['customerId']) \
    .sort("customer_id", ascending=False)


print('df_joined, shape: {} x {}'.format(
        df_joined.count(), len(df_joined.columns)))
df_joined.printSchema()


StatementMeta(poolspark3s, 54, 49, Finished, Available)

AnalysisException: USING column `customerId` cannot be resolved on the left side of the join. The left-side columns: [order_id, customer_id, item_count, order_total]

In [None]:
# Display the first few rows of the df_joined Dataframe

display(df_joined.limit(20))

StatementMeta(, , , Cancelled, )

In [None]:
# Group the df_joined Dataframe by customerId, sum on order total and total_orders

df_grouped = df_joined.groupby("customer_id") \
    .sum("order_total").alias('total_orders') \
    .sort("customer_id", ascending=False)

display(df_grouped.printSchema())
print((df_grouped.count(), len(df_grouped.columns)))



StatementMeta(, , , Cancelled, )

In [None]:
import pyspark.sql.functions as F 
#from pyspark.sql.functions import col

df_agg = df_joined.groupBy("customerId") \
    .agg(
        F.first('id').alias('id'), \
        F.count("customer_id").alias('order_count'), \
        F.sum("order_total").alias("total_dollar_amount"), \
        F.sum("item_count").alias("total_item_count")) \
        .sort("customer_id", ascending=False)

display(df_agg.printSchema())
print((df_agg.count(), len(df_agg.columns)))

StatementMeta(, , , Cancelled, )

In [None]:
display(df_agg.limit(30))

StatementMeta(, , , Cancelled, )

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

import pyspark.sql.functions as F 

# See https://github.com/Azure-Samples/Synapse/blob/main/Notebooks/PySpark/02%20Read%20and%20write%20data%20from%20Azure%20Blob%20Storage%20WASB.ipynb

# Azure storage access info
blob_account_name   = 'cjoakimstorage'
blob_container_name = 'synapse'
blob_relative_path  = 'ecomm/'
linked_service_name = 'cjoakimstorageAzureBlobStorage'

blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(
    linked_service_name)
print('blob_sas_token: {}'.format(blob_sas_token))

# Allow Spark to access from Blob remotely
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
    blob_container_name, blob_account_name, blob_relative_path)
spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (
    blob_container_name, blob_account_name), blob_sas_token)
print('Remote wasbs_path: ' + wasbs_path)

csv_path  = '{}{}'.format(wasbs_path,'sales_by_customer_csv')
json_path = '{}{}'.format(wasbs_path,'sales_by_customer_json')

df_agg.coalesce(1).write.csv(csv_path, mode='overwrite', header='true')
#df_agg.coalesce(1).write.json(json_path, mode='overwrite')


StatementMeta(, , , Cancelled, )

In [None]:

# Write to CosmosDB - linked service 'demoCosmosDB'
# See https://docs.microsoft.com/en-us/azure/synapse-analytics/synapse-link/how-to-query-analytical-store-spark#write-spark-dataframe-to-azure-cosmos-db-container

df_agg.write.format("cosmos.oltp")\
    .option("spark.synapse.linkedService", "demoCosmosDB")\
    .option("spark.cosmos.container", "sales_by_customer")\
    .option("spak.cosmos.write.upsertenabled", "true")\
    .mode('append')\
    .save()


StatementMeta(, , , Cancelled, )