In [12]:
# Load the Synapse Link sales data into a DataFrame.
# Select just the "sale" document types from the sales container
# whose _ts (timestamp) value lies between begin_timestamp and end_timestamp.

from pyspark.sql.functions import col

# Bounds of the _ts window; both are applied as exclusive comparisons below.
# NOTE(review): the values look like epoch seconds, and end_timestamp is a
# fixed literal, so documents with a later _ts are excluded - confirm intent.
begin_timestamp = 0
end_timestamp = 1699999999

# Available source formats:
#   "cosmos.oltp" = CosmosDB live database
#   "cosmos.olap" = Synapse Link Analytic Datastore
# Read the "sales" container and keep only documents with doctype "sale"
# (not "line_item") whose _ts falls strictly inside the window.
df_sales = (
    spark.read.format("cosmos.olap")
    .option("spark.synapse.linkedService", "CosmosSqlDemoDB")
    .option("spark.cosmos.container", "sales")
    .load()
    .where(
        (col("doctype") == "sale")
        & (col("_ts") > begin_timestamp)
        & (col("_ts") < end_timestamp)
    )
)

# Preview a few rows only; never render the whole DataFrame.
display(df_sales.limit(3))

StatementMeta(pool3s, 0, 13, Finished, Available)

SynapseWidget(Synapse.DataFrame, ea4dc6c3-e81f-475a-bd5b-be049fb33c53)

In [13]:
# Report the DataFrame's dimensions (rows x columns), then its schema.
# count() triggers a Spark job; len(columns) is metadata only.
sales_row_count = df_sales.count()
sales_column_count = len(df_sales.columns)
print('df_sales, shape: {} x {}'.format(sales_row_count, sales_column_count))

df_sales.printSchema()


StatementMeta(pool3s, 0, 14, Finished, Available)

df_sales, shape: 31397 x 20
root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- pk: long (nullable = true)
 |-- id: string (nullable = true)
 |-- sale_id: long (nullable = true)
 |-- doctype: string (nullable = true)
 |-- date: string (nullable = true)
 |-- line_num: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- store_id: long (nullable = true)
 |-- upc: string (nullable = true)
 |-- price: double (nullable = true)
 |-- qty: long (nullable = true)
 |-- cost: double (nullable = true)
 |-- doc_epoch: long (nullable = true)
 |-- doc_time: string (nullable = true)
 |-- _etag: string (nullable = true)
 |-- dow: string (nullable = true)
 |-- item_count: long (nullable = true)
 |-- total_cost: double (nullable = true)

In [14]:
# Aggregate Sales by Customer 

import pyspark.sql.functions as F 

# Roll the sale documents up to one row per customer.
#
# Output columns: customer_id, id, pk, order_count, total_dollar_amount,
# total_item_count.
#
# The id of each aggregate row is derived from customer_id (cast to string)
# instead of being taken from an arbitrary sale document. F.first('id') is
# non-deterministic after the groupBy shuffle, so a re-run could pick a
# different sale id for the same customer; an upsert keyed on that id would
# then add another aggregate document rather than replace the previous one.
# Within a group every customer_id is identical, so first() is stable here.
df_customer_aggregated = (
    df_sales.groupBy("customer_id")
    .agg(
        F.first("customer_id").cast("string").alias("id"),
        F.first("customer_id").alias("pk"),
        F.count("customer_id").alias("order_count"),
        F.sum("total_cost").alias("total_dollar_amount"),
        F.sum("item_count").alias("total_item_count"),
    )
    .sort("customer_id", ascending=True)
)

# Preview the first few customers in ascending customer_id order.
display(df_customer_aggregated.limit(10))


StatementMeta(pool3s, 0, 15, Finished, Available)

SynapseWidget(Synapse.DataFrame, 3bfe6ef4-2b60-4af0-892b-7b01d1cee251)

In [15]:
# Persist the per-customer aggregates to the CosmosDB "sales_aggregates"
# container through the transactional ("cosmos.oltp") endpoint.
# Upserts are enabled, so a row whose id/pk matches an existing document
# replaces it instead of failing.
# NOTE(review): id and pk are intended to both be the customer ID - verify
# against the aggregation cell that builds df_customer_aggregated.
(
    df_customer_aggregated.write.format("cosmos.oltp")
    .option("spark.synapse.linkedService", "CosmosSqlDemoDB")
    .option("spark.cosmos.container", "sales_aggregates")
    .option("spark.cosmos.write.upsertenabled", "true")
    .mode('append')
    .save()
)


StatementMeta(pool3s, 0, 16, Finished, Available)