# Cosmos DB Synapse Sales Processing Spark Notebook

## This Spark/PySpark Notebook demonstrates how to:

- **Read the Synapse Link analytical store with Spark/PySpark in Azure Synapse**
- **Work with a source Cosmos DB account that uses the API for MongoDB**
- **Aggregate the sales data by customer_id**
- Display the "shape" of the dataframes and their observed schemas
- Filter the sales data (by doctype and timestamp)
- Write the aggregated "materialized view" of sales-by-customer to the Cosmos DB views container


In [None]:
# Define common PySpark functions used in this Notebook

def print_df_shape(df, msg):
    """Print the row count and column count of a DataFrame.

    Args:
        df: object exposing ``count()`` and a ``columns`` sequence
            (a Spark DataFrame in this notebook).
        msg: label included in the printed line to identify the DataFrame.

    Returns:
        None; the summary is written to stdout.
    """
    # count() is evaluated first, then the number of columns.
    row_count = df.count()
    column_count = len(df.columns)
    template = "shape of df {} - row count: {}, column count: {}"
    print(template.format(msg, row_count, column_count))


In [None]:
# Read the Synapse Link Analytic Datastore into a Spark Dataframe

from pyspark.sql.functions import col
import pyspark.sql.functions as F 

# Timestamp window for the report; these can be computed values,
# such as for a daily or monthly report.
min_doc_timestamp = 1640995200  # 2022-01-01T00:00:00.000Z
max_doc_timestamp = 1999999999  # distant future

# Note: "cosmos.olap" is Synapse Link, "cosmos.oltp" is Cosmos DB

# Keep only the documents whose _ts lies inside the inclusive window.
ts_in_window = (col("_ts") >= min_doc_timestamp) & (col("_ts") <= max_doc_timestamp)

sales_df = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", "gbbcjmongo_retail")
    .option("spark.cosmos.container", "sales")
    .load()
    .filter(ts_in_window)
)

# Preview a handful of rows only; never render the whole DataFrame.
display(sales_df.limit(10))


In [None]:
# Display the shape and the schema of the input dataframe.
# print_df_shape() calls sales_df.count() to obtain the row count before
# printSchema() lists the observed columns and their types.

print_df_shape(sales_df, "sales_df")
sales_df.printSchema()


In [None]:
from pyspark.sql.functions import col
import pyspark.sql.functions as F 

# The input documents look like this; we'll select just a few
# of these attributes for processing.
# NOTE(review): the sample below is a "line_item" document and shows no
# item_count or total_cost; presumably those attributes exist on the "sale"
# documents that this cell keeps - confirm against the container.

# {
# 	"_id" : ObjectId("64458d9959d30378b040b2a5"),
# 	"pk" : "1",
# 	"id" : "051424eb-9de0-4ff4-810a-522cdd9ece07",
# 	"sale_id" : 1,
# 	"doctype" : "line_item",
# 	"date" : "2021-01-01",
# 	"line_num" : 1,
# 	"customer_id" : 6168,
# 	"store_id" : 60,
# 	"upc" : "0760981980837",
# 	"price" : 84.65,
# 	"qty" : 1,
# 	"cost" : 84.65,
# 	"epoch" : 1682279833.3812423
# }

# Friendly names of the attributes we keep, in output column order.
new_column_names = ['sale_id', 'customer_id', 'doctype', 'item_count', 'total_cost']

# Every attribute is selected as "<name>.*", i.e. expanded as a struct.
# toDF() below needs exactly one column per friendly name, so this assumes
# each struct expands to a single field - TODO confirm with printSchema().
sales_df_unpacked = sales_df.select(
    *[col(name + '.*') for name in new_column_names])

# Apply the friendly names, then keep only the "sale" documents.
is_sale = col("doctype") == "sale"
df_sales = sales_df_unpacked.toDF(*new_column_names).filter(is_sale)

print_df_shape(df_sales, "df_sales")
display(df_sales.limit(10))
df_sales.printSchema()


In [None]:
# Aggregate Sales by Customer 

# But first, consider an alternative implementation using JUST CosmosDB, and not Spark:
# 1. Read customers container to get the unique set of customer ids (cross partition)
# 2. Loop through the customer id list:
#    - Read all of the sale documents for each customer in the timeframe (cross partition)
#    - sum the sales item_count and total_cost for each customer (memory intensive)

import pyspark.sql.functions as F 

# Per-customer aggregates. 'id' and 'pk' both carry the customer_id of the
# group; the remaining columns are the order count and the summed totals.
customer_totals = [
    F.first('customer_id').alias('id'),
    F.first('customer_id').alias('pk'),
    F.count("customer_id").alias('order_count'),
    F.sum("total_cost").alias("total_dollar_amount"),
    F.sum("item_count").alias("total_item_count"),
]

# One output row per customer_id, ordered by customer_id ascending.
df_customer_aggregated = (
    df_sales
    .groupBy("customer_id")
    .agg(*customer_totals)
    .orderBy("customer_id", ascending=True)
)

print_df_shape(df_customer_aggregated, "df_customer_aggregated")
display(df_customer_aggregated.limit(10))
df_customer_aggregated.printSchema()


In [None]:
# Display Installed Packages
#
# Builds pkg_list, a sorted list of "name version" strings, one per installed
# distribution. Printing is off by default so the notebook output stays small;
# set show_installed_packages to True to list them.

show_installed_packages = False

# pkg_resources is deprecated by setuptools, so prefer the standard-library
# importlib.metadata and only fall back on runtimes that do not ship it.
try:
    from importlib import metadata as importlib_metadata  # Python 3.8+
except ImportError:
    importlib_metadata = None

if importlib_metadata is not None:
    pkg_list = sorted(
        "{} {}".format(dist.metadata["Name"], dist.version)
        for dist in importlib_metadata.distributions())
else:
    import pkg_resources
    pkg_list = sorted(str(dist) for dist in pkg_resources.working_set)

if show_installed_packages:
    for p in pkg_list:
        print(p)


In [None]:
# Register the aggregated DataFrame as a temporary view named 'agg' so the
# following cells can query it with Spark SQL.
df_customer_aggregated.createOrReplaceTempView('agg')
print('df_customer_aggregated saved to tmp view: agg')


In [None]:
%%sql
-- Note the use of the %%sql "magic" command to run Spark SQL in this cell.
-- The magic must be the first line of the cell, and SQL comments start with "--".
select * from agg limit 3


In [None]:
# Use spark.sql and SELECT to convert id and pk to String values

%%pyspark
df_for_cosmos = spark.sql("SELECT String(id), String(pk), order_count, total_dollar_amount, total_item_count FROM agg")

print_df_shape(df_for_cosmos, "df_for_cosmos")
display(df_for_cosmos.limit(10))
df_for_cosmos.printSchema()



In [None]:

# Append the per-customer rows to the "views" container through the
# "cosmos.oltp" (Cosmos DB) connector of the linked service.
cosmos_writer = (
    df_for_cosmos.write
    .format("cosmos.oltp")
    .option("spark.synapse.linkedService", "gbbcjcdbnosql_retail_db")
    .option("spark.cosmos.container", "views")
    .mode('append')
)
cosmos_writer.save()


