# Cosmos DB Synapse Sales Processing Spark Notebook

## This Spark/PySpark Notebook demonstrates how to:

- **Read the Synapse Link Analytic Datastore with Spark/PySpark in Azure Synapse**
- **Use a source Cosmos DB account that exposes the Mongo API**
- **Aggregate the sales data by customer_id**
- Display the "shape" of the dataframes and their observed schema
- Filter the sales data (by doctype and timestamp)
- Write the aggregated "materialized view" of sales-by-customer to the Cosmos DB views container


In [68]:
# Define common PySpark functions used in this Notebook.
# This Notebook defaults to PySpark.
# The listing of installed packages/libraries below is commented out (disabled).

# import pkg_resources
# pkg_list = list()

# for d in sorted(pkg_resources.working_set):
#     pkg_list.append(str(d))
# for p in sorted(pkg_list):
#     pass
#     #print(p)

def print_df_shape(df, msg):
    """Print a one-line summary of a dataframe's row and column counts.

    Args:
        df: Object exposing a ``count()`` method and a ``columns`` sequence
            (a Spark DataFrame in this notebook).
        msg: Label inserted into the printed line to identify the dataframe.

    Returns:
        None. The summary is written to standard output.

    Note:
        ``df.count()`` is invoked on every call.
    """
    template = "shape of df {} - row count: {}, column count: {}"
    row_total = df.count()
    column_total = len(df.columns)
    print(template.format(msg, row_total, column_total))


StatementMeta(sparkpool3m, 10, 2, Finished, Available)

In [69]:
# Load the "sales" container from the Synapse Link Analytic Datastore into a
# Spark Dataframe, keeping only documents whose "_ts" value falls inside the
# timestamp window defined below (both bounds inclusive).

from pyspark.sql.functions import col
import pyspark.sql.functions as F

# Timestamp window.  These are fixed here, but could instead be computed,
# for example for a daily or monthly report.
min_doc_timestamp = 1640995200  # 2022-01-01T00:00:00.000Z
max_doc_timestamp = 1999999999  # distant future

# Connector format names:
#   "cosmos.olap" is Synapse Link
#   "cosmos.oltp" is Cosmos DB
sales_reader = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", "gbbcjmongo_retail")
    .option("spark.cosmos.container", "sales")
)

# Single predicate equivalent to chaining the two bound filters.
within_window = (col("_ts") >= min_doc_timestamp) & (col("_ts") <= max_doc_timestamp)
sales_df = sales_reader.load().filter(within_window)

display(sales_df.limit(10))


StatementMeta(sparkpool3m, 10, 3, Finished, Available)

SynapseWidget(Synapse.DataFrame, 0be01b3e-7d7b-4fe1-9cf9-394b697bceeb)

In [70]:
# Display the shape and the inferred schema of the input dataframe.
# Note: print_df_shape() calls sales_df.count(), so this cell runs a count
# over the filtered dataframe in addition to printing the schema tree.

print_df_shape(sales_df, "sales_df")
sales_df.printSchema()


StatementMeta(sparkpool3m, 10, 4, Finished, Available)

shape of df sales_df - row count: 108062, column count: 21
root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- id: string (nullable = true)
 |-- _etag: string (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- objectId: string (nullable = true)
 |-- pk: struct (nullable = true)
 |    |-- string: string (nullable = true)
 |-- sale_id: struct (nullable = true)
 |    |-- int32: integer (nullable = true)
 |-- doctype: struct (nullable = true)
 |    |-- string: string (nullable = true)
 |-- date: struct (nullable = true)
 |    |-- string: string (nullable = true)
 |-- line_num: struct (nullable = true)
 |    |-- int32: integer (nullable = true)
 |-- customer_id: struct (nullable = true)
 |    |-- int32: integer (nullable = true)
 |-- store_id: struct (nullable = true)
 |    |-- int32: integer (nullable = true)
 |-- upc: struct (nullable = true)
 |    |-- string: string (nullable = true)
 |-- price: struct (nullable = true)
 |    |-- float64: double (nu

In [71]:
# Extract just a few attributes of the Dataframe for processing.
# The input documents look like this:

# {
#   "_id" : ObjectId("64458d9a59d30378b040b2a7"),
#   "pk" : "1",
#   "id" : "d5aec4a6-3735-4f73-9432-b2f8692784d7",
#   "sale_id" : 1,                   <-- extracted
#   "doctype" : "sale",              <-- extracted
#   "date" : "2021-01-01",
#   "dow" : "fri",
#   "customer_id" : 6168,            <-- extracted
#   "store_id" : 60,
#   "item_count" : 2,                <-- extracted
#   "total_cost" : 1943.65,          <-- extracted
#   "epoch" : 1682279834.2830074
# }

from pyspark.sql.functions import col
import pyspark.sql.functions as F

# Friendly names of the attributes to keep; each is also the name of the
# struct column in sales_df that wraps the value.
new_column_names = ['sale_id', 'customer_id', 'doctype', 'item_count', 'total_cost']

# "Full-Fidelity" schema logic to unpack the Synapse Link values: expanding
# "<name>.*" pulls the nested value(s) out of each struct column.
# Alternative is "Well-Defined" schema.
struct_expansions = [col(name + '.*') for name in new_column_names]
sales_df_unpacked = sales_df.select(*struct_expansions)

# Rename the unpacked columns positionally to the friendly names, then keep
# only the documents whose doctype is "sale".
df_sales = sales_df_unpacked.toDF(*new_column_names).filter(col("doctype") == "sale")

print_df_shape(df_sales, "df_sales")
display(df_sales.limit(10))
df_sales.printSchema()


StatementMeta(sparkpool3m, 10, 5, Finished, Available)

shape of df df_sales - row count: 30840, column count: 5


SynapseWidget(Synapse.DataFrame, 487d2a25-2be0-4a47-bd35-223ad0a7ca9b)

root
 |-- sale_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- doctype: string (nullable = true)
 |-- item_count: integer (nullable = true)
 |-- total_cost: double (nullable = true)



In [72]:
# Aggregate the Sales by Customer

# But first, consider an alternative implementation using JUST CosmosDB, and not Spark:
# 1. Read customers container to get the unique set of customer ids (cross partition)
# 2. Loop through the customer id list:
#    - Read all of the sale documents for each customer in the timeframe (cross partition)
#    - sum the sales item_count and total_cost for each customer (memory intensive)

import pyspark.sql.functions as F

# One output row per customer_id.  "id" and "pk" both repeat the grouping key
# so the result carries those two attributes alongside the totals.
per_customer_metrics = [
    F.first('customer_id').alias('id'),
    F.first('customer_id').alias('pk'),
    F.count("customer_id").alias('order_count'),
    F.sum("total_cost").alias("total_dollar_amount"),
    F.sum("item_count").alias("total_item_count"),
]

df_customer_aggregated = (
    df_sales
    .groupBy("customer_id")
    .agg(*per_customer_metrics)
    .sort("customer_id", ascending=True)
)

print_df_shape(df_customer_aggregated, "df_customer_aggregated")
display(df_customer_aggregated.limit(10))
df_customer_aggregated.printSchema()


StatementMeta(sparkpool3m, 10, 6, Finished, Available)

shape of df df_customer_aggregated - row count: 9532, column count: 6


SynapseWidget(Synapse.DataFrame, e76ae269-5fff-44c6-b6ae-93a2cf8ada13)

root
 |-- customer_id: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- pk: integer (nullable = true)
 |-- order_count: long (nullable = false)
 |-- total_dollar_amount: double (nullable = true)
 |-- total_item_count: long (nullable = true)



In [73]:
# Register the aggregated dataframe as a temporary view named "agg" so it can
# be referenced by name from Spark SQL; an existing view with that name is replaced.
df_customer_aggregated.createOrReplaceTempView('agg')
print('df_customer_aggregated saved to tmp view: agg')


StatementMeta(sparkpool3m, 10, 7, Finished, Available)

df_customer_aggregated saved to tmp view: agg


In [74]:
%%sql

-- Use the %%sql "magic" to switch this cell into SQL mode and execute a query.
-- The query previews three rows of the "agg" view.

select * from agg limit 3


StatementMeta(sparkpool3m, 10, 8, Finished, Available)

<Spark SQL result set with 3 rows and 6 fields>

In [75]:
%%pyspark

# Use a SQL SELECT to cast the id and pk attributes from ints to Strings.

df_for_cosmos = spark.sql("SELECT String(id), String(pk), order_count, total_dollar_amount, total_item_count FROM agg")

print_df_shape(df_for_cosmos, "df_for_cosmos")
display(df_for_cosmos.limit(10))
df_for_cosmos.printSchema()


StatementMeta(sparkpool3m, 10, 9, Finished, Available)

shape of df df_for_cosmos - row count: 9532, column count: 5


SynapseWidget(Synapse.DataFrame, dd2c6358-8212-4e6a-ac27-1ae0532bbf84)

root
 |-- id: string (nullable = true)
 |-- pk: string (nullable = true)
 |-- order_count: long (nullable = false)
 |-- total_dollar_amount: double (nullable = true)
 |-- total_item_count: long (nullable = true)



In [76]:

# Write the Dataframe to Cosmos DB NoSQL API.
# The save mode is 'append'; for the available modes see
# https://spark.apache.org/docs/3.2.4/api/python/reference/api/pyspark.sql.DataFrameWriter.mode.html

cosmos_views_writer = (
    df_for_cosmos.write
    .format("cosmos.oltp")
    .option("spark.synapse.linkedService", "gbbcjcdbnosql_retail_db")
    .option("spark.cosmos.container", "views")
    .mode('append')
)
cosmos_views_writer.save()


StatementMeta(sparkpool3m, 10, 10, Finished, Available)

In [81]:
%%spark

// Write the "agg" view (sales aggregated by customer) to Azure PostgreSQL over JDBC.
//
// Obtain configuration values from spark.conf
// See https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-create-spark-configuration

// SECURITY: the Scala REPL echoes every top-level `val` together with its value
// after the cell runs.  The connection string and the password are secrets, so
// they are declared `lazy`: the REPL then reports them as `<lazy>` instead of
// printing their contents into the (saved) cell output.  The names remain in
// scope and are evaluated on first use by the JDBC write below.
// NOTE(review): a password that was echoed into saved output by an earlier run
// should be treated as compromised and rotated.
lazy val connStr  = spark.conf.get("spark.azurepg.jdbc.connstring")
lazy val password = spark.conf.get("spark.azurepg.jdbc.pass")

// Non-secret settings; these are safe to echo and are printed for diagnostics.
val driver   = spark.conf.get("spark.azurepg.jdbc.driver")
val server   = spark.conf.get("spark.azurepg.jdbc.server")
val table    = spark.conf.get("spark.azurepg.jdbc.table")
val user     = spark.conf.get("spark.azurepg.jdbc.user")

println("driver:   " + driver)
println("server:   " + server)
println("table:    " + table)
println("user:     " + user)
// Never print connStr or password.

// The temp view "agg" must already be registered in this Spark session.
val df_customer_aggregated = spark.read.table("agg")  // Spark SQL table

println("df_customer_aggregated row count: " + df_customer_aggregated.count())

// Using JDBC, write the Dataframe to Azure PostgreSQL.
// Save mode "overwrite" replaces the existing contents of the target table.
df_customer_aggregated.write
  .format("jdbc")
  .option("url", connStr)
  .option("driver", driver)
  .option("dbtable", table)
  .option("user", user)
  .option("password", password)
  .mode("overwrite")
  .save()

println("done\n\n\n\n\n\n\n\n")


StatementMeta(sparkpool3m, 10, 16, Finished, Available)

driver:   org.postgresql.Driver
server:   gbbcjflexpg.postgres.database.azure.com
table:    public.customer_sales
user:     chjoakim@gbbcjflexpg.postgres.database.azure.com
df_customer_aggregated row count: 9532
done








connStr: String = jdbc:postgresql://gbbcjflexpg.postgres.database.azure.com:5432/postgres?user=chjoakim&password=<redacted>&sslmode=require
driver: String = org.postgresql.Driver
server: String = gbbcjflexpg.postgres.database.azure.com
table: String = public.customer_sales
user: String = chjoakim@gbbcjflexpg.postgres.database.azure.com
password: String = <redacted>
df_customer_aggregated: org.apache.spark.sql.DataFrame = [customer_id: int, id: int ... 4 more fields]
