In [20]:
# Load the Synapse Link "sales" container (Cosmos DB analytical store) into a DataFrame.

from pyspark.sql.functions import col

# Lower bound (exclusive) applied to the _ts column below.
# 1635168000 read as epoch seconds is 2021-10-25 13:20:00 UTC.
# NOTE(review): assumes _ts is the Cosmos DB last-modified timestamp in epoch seconds - confirm.
min_timestamp = 1635168000

# Keep only documents whose doctype is "sale" (not "line_item") and whose
# _ts is strictly greater than min_timestamp.
df_sales = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", "CosmosDemoSQL")
    .option("spark.cosmos.container", "sales")
    .load()
    .filter((col("doctype") == "sale") & (col("_ts") > min_timestamp))
)

StatementMeta(poolspark3s, 3, 20, Finished, Available)

In [21]:
# Report the DataFrame's shape (rows x columns), then its observed schema.
# Note: count() triggers a Spark job over the filtered sales data.

sales_row_count = df_sales.count()
sales_column_count = len(df_sales.columns)
print(f'df_sales, shape: {sales_row_count} x {sales_column_count}')

df_sales.printSchema()

StatementMeta(poolspark3s, 3, 21, Finished, Available)

df_sales, shape: 28758 x 20
root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- pk: long (nullable = true)
 |-- id: string (nullable = true)
 |-- sale_id: long (nullable = true)
 |-- doctype: string (nullable = true)
 |-- date: string (nullable = true)
 |-- line_num: long (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- store_id: long (nullable = true)
 |-- upc: string (nullable = true)
 |-- price: double (nullable = true)
 |-- qty: long (nullable = true)
 |-- cost: double (nullable = true)
 |-- doc_epoch: long (nullable = true)
 |-- doc_time: string (nullable = true)
 |-- _etag: string (nullable = true)
 |-- dow: string (nullable = true)
 |-- item_count: long (nullable = true)
 |-- total_cost: double (nullable = true)

In [22]:
# Preview 4 rows of the sales DataFrame (limit() is applied without an
# explicit sort, so which 4 rows are shown is not guaranteed).

display(df_sales.limit(4))


StatementMeta(poolspark3s, 3, 22, Finished, Available)

SynapseWidget(Synapse.DataFrame, 600f8e7e-703e-4d74-a11b-73ae71d8c96e)

In [25]:
# Aggregate sales per customer: one output row (one summary document) per customer_id.
#
# Output columns:
#   customer_id          - the grouping key
#   id                   - document id, derived from customer_id (see note below)
#   pk                   - copy of customer_id
#   order_count          - number of sale rows for the customer
#   total_dollar_amount  - sum of total_cost over the customer's sales
#   total_item_count     - sum of item_count over the customer's sales

import pyspark.sql.functions as F

# NOTE: 'id' was previously F.first('id'), i.e. the id of an arbitrary sale of
# the customer. F.first() is non-deterministic after a shuffle, so the id of a
# customer's aggregate could differ between runs; the upsert in the write step
# would then add another document for the same customer instead of replacing
# the existing one. Deriving the id from customer_id keeps it stable across
# runs. F.first('customer_id') is deterministic here because every row in a
# group shares the same customer_id.
# NOTE(review): customer_id is nullable in the observed schema; a null
# customer_id group would produce a null id - confirm no such rows exist.
df_customer_aggregated = (
    df_sales.groupBy("customer_id")
    .agg(
        F.first("customer_id").cast("string").alias("id"),
        F.first("customer_id").alias("pk"),
        F.count("customer_id").alias("order_count"),
        F.sum("total_cost").alias("total_dollar_amount"),
        F.sum("item_count").alias("total_item_count"),
    )
    .sort("customer_id", ascending=True)
)

# Preview 4 rows of the aggregate (lowest customer_id values first, due to the sort).
display(df_customer_aggregated.limit(4))


StatementMeta(poolspark3s, 3, 25, Finished, Available)

SynapseWidget(Synapse.DataFrame, 3fd35383-cd93-4d99-864f-e37fdab7a3b3)

In [26]:
# Persist the per-customer aggregates to the Cosmos DB transactional store
# ("cosmos.oltp") through the 'CosmosDemoSQL' linked service, targeting the
# "sales_aggregates" container with upsert enabled.
# See https://docs.microsoft.com/en-us/azure/synapse-analytics/synapse-link/how-to-query-analytical-store-spark#write-spark-dataframe-to-azure-cosmos-db-container

(
    df_customer_aggregated.write
    .format("cosmos.oltp")
    .mode('append')
    .option("spark.synapse.linkedService", "CosmosDemoSQL")
    .option("spark.cosmos.container", "sales_aggregates")
    .option("spark.cosmos.write.upsertenabled", "true")
    .save()
)

StatementMeta(poolspark3s, 3, 26, Finished, Available)