In [48]:
# Load the SynapseLink Customers and Sales SynapseLink Data into a Dataframes.
# Select just the "sale" document types from the sales container, which have a
# minimum _ts (timestamp) value

# The documents in CosmosDB OLTP data look like this:
# {
#   "pk": 28923,
#   "id": "8500eb9c-390c-462b-87ce-5fb5b8f6359a",
#   "sale_id": 28923,
#   "doctype": "sale",
#   "date": "2022-01-25",
#   "dow": "tue",
#   "customer_id": 1928,
#   "store_id": 2,
#   "item_count": 3,
#   "total_cost": 4566.03
# }

from pyspark.sql.functions import col
import pyspark.sql.functions as F 

# initialize variables
min_doc_timestamp = 1635168000

# read the raw OLAP data, filtering by document _ts (timestamp)
df_sales_raw = spark.read\
    .format("cosmos.olap")\
    .option("spark.synapse.linkedService", "CosmosMongoDemoDB")\
    .option("spark.cosmos.container", "sales")\
    .load().filter(col("_ts") > min_doc_timestamp)

display(df_sales_raw.limit(3))
df_sales_raw.printSchema()


StatementMeta(poolspark3s, 16, 21, Finished, Available)

SynapseWidget(Synapse.DataFrame, 308bb2a4-cda4-4667-9606-037a248b477f)


root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- id: string (nullable = true)
 |-- _etag: string (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- objectId: string (nullable = true)
 |-- pk: struct (nullable = true)
 |    |-- string: string (nullable = true)
 |-- sale_id: struct (nullable = true)
 |    |-- int32: integer (nullable = true)
 |-- doctype: struct (nullable = true)
 |    |-- string: string (nullable = true)
 |-- date: struct (nullable = true)
 |    |-- string: string (nullable = true)
 |-- line_num: struct (nullable = true)
 |    |-- int32: integer (nullable = true)
 |-- customer_id: struct (nullable = true)
 |    |-- int32: integer (nullable = true)
 |-- store_id: struct (nullable = true)
 |    |-- int32: integer (nullable = true)
 |-- upc: struct (nullable = true)
 |    |-- string: string (nullable = true)
 |-- price: struct (nullable = true)
 |    |-- float64: double (nullable = true)
 |-- qty: struct (nullable = true)
 |    |-

In [49]:
# Select just the pertinet columns.
# Unpack the struct columns with attrname.* syntax.
# Filter by doctype == 'sale'

df_sales_unpacked = df_sales_raw.select(
    col('doctype.*'),
    col('date.*'),
    col('customer_id.*'),
    col('item_count.*'),
    col('total_cost.*'))

display(df_sales_unpacked.limit(3))
df_sales_unpacked.printSchema()

# Rename the columns of the unpacked DataFrame
new_column_names = ['doctype', 'date', 'customer_id', 'item_count', 'total_cost']
df_sales = df_sales_unpacked.toDF(*new_column_names).filter(col("doctype") == "sale")

display(df_sales.limit(3))
df_sales.printSchema()

print('df_sales, shape: {} x {}'.format(
        df_sales.count(), len(df_sales.columns)))


StatementMeta(poolspark3s, 16, 22, Finished, Available)

SynapseWidget(Synapse.DataFrame, a4c8b960-02ed-4f3e-8bf9-1912f9a607e8)


root
 |-- string: string (nullable = true)
 |-- string: string (nullable = true)
 |-- int32: integer (nullable = true)
 |-- int32: integer (nullable = true)
 |-- float64: double (nullable = true)


root
 |-- doctype: string (nullable = true)
 |-- date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- item_count: integer (nullable = true)
 |-- total_cost: double (nullable = true)

df_sales, shape: 28923 x 5

SynapseWidget(Synapse.DataFrame, 62b3d000-0bb7-44eb-aa0b-bbf7c51382a4)

In [50]:
# Aggregate Sales by Customer 

import pyspark.sql.functions as F 

df_customer_aggregated = df_sales.groupBy("customer_id") \
    .agg(
        F.first('customer_id').alias('_id'), \
        F.first('customer_id').alias('pk'), \
        F.count("customer_id").alias('order_count'), \
        F.sum("total_cost").alias("total_dollar_amount"), \
        F.sum("item_count").alias("total_item_count")) \
        .sort("customer_id", ascending=True)

display(df_customer_aggregated.limit(4))
df_customer_aggregated.printSchema()


StatementMeta(poolspark3s, 16, 23, Finished, Available)

SynapseWidget(Synapse.DataFrame, 91f731e0-2ad0-4887-99a6-fecc744253c5)


root
 |-- customer_id: integer (nullable = true)
 |-- _id: integer (nullable = true)
 |-- pk: integer (nullable = true)
 |-- order_count: long (nullable = false)
 |-- total_dollar_amount: double (nullable = true)
 |-- total_item_count: long (nullable = true)

In [51]:
# Write the df_customer_aggregated to Azure Blob Storage as CSV 
# See https://github.com/Azure-Samples/Synapse/tree/main/Notebooks/PySpark

from pyspark.sql import SparkSession
from pyspark.sql.types import *

import pyspark.sql.functions as F 

# Azure storage account info
blob_account_name   = 'cjoakimstorage'
blob_container_name = 'synapse'
blob_relative_path  = 'retail/sales/mongo/'
linked_service_name = 'cjoakimstorageAzureBlobStorage'

blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)
#print('blob_sas_token: {}'.format(blob_sas_token))

# Allow Spark to access from Blob remotely
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
    blob_container_name, blob_account_name, blob_relative_path)

spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (
    blob_container_name, blob_account_name), blob_sas_token)

csv_path = '{}{}'.format(wasbs_path,'sales_by_customer_csv')

print('wasbs_path: ' + wasbs_path)
print('csv_path:   ' + csv_path)

# Write to blob storage, coalesce it into one CSV file
df_customer_aggregated.coalesce(1).write.csv(csv_path, mode='overwrite', header='true')
print('csv data written to azure storage blob')

#df_agg.coalesce(1).write.json(json_path, mode='overwrite')


StatementMeta(poolspark3s, 16, 24, Finished, Available)

wasbs_path: wasbs://synapse@cjoakimstorage.blob.core.windows.net/retail/sales/mongo/
csv_path:   wasbs://synapse@cjoakimstorage.blob.core.windows.net/retail/sales/mongo/sales_by_customer_csv
csv data written to azure storage blob