In [None]:
# Load the Synapse Link Customers and Sales data into DataFrames.
# Select just the "sale" document types from the sales container whose
# _ts (timestamp) value is greater than a minimum.

# The documents in CosmosDB OLTP data look like this:
# {
#   "pk": 28923,
#   "id": "8500eb9c-390c-462b-87ce-5fb5b8f6359a",
#   "sale_id": 28923,
#   "doctype": "sale",
#   "date": "2022-01-25",
#   "dow": "tue",
#   "customer_id": 1928,
#   "store_id": 2,
#   "item_count": 3,
#   "total_cost": 4566.03
# }

from pyspark.sql.functions import col
import pyspark.sql.functions as F 

# Lower bound for the document _ts (timestamp); only rows strictly above it are kept.
min_doc_timestamp = 1635168000

# Reader for the "sales" container in the analytical (OLAP) store,
# reached through the CosmosMongoDemoDB linked service.
sales_reader = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", "CosmosMongoDemoDB")
    .option("spark.cosmos.container", "sales")
)

# Load the container and drop documents at or below the timestamp floor.
df_sales_raw = sales_reader.load().filter(col("_ts") > min_doc_timestamp)

# Preview a few rows, then show the schema of the loaded data.
display(df_sales_raw.limit(3))
df_sales_raw.printSchema()


In [None]:
# Select just the pertinent columns.
# Unpack the struct columns with attrname.* syntax.
# Filter by doctype == 'sale'

# Final column names, in output order. Each one is also the name of a struct
# column in df_sales_raw that is expanded below.
new_column_names = ['doctype', 'date', 'customer_id', 'item_count', 'total_cost']

# Expand every struct column with the "<name>.*" selector.
struct_selectors = [col('{}.*'.format(name)) for name in new_column_names]
df_sales_unpacked = df_sales_raw.select(*struct_selectors)

display(df_sales_unpacked.limit(3))
df_sales_unpacked.printSchema()

# Rename the unpacked columns positionally. toDF() needs exactly as many
# columns as names, so this assumes each struct expands to a single field.
df_sales_renamed = df_sales_unpacked.toDF(*new_column_names)

# Keep only the "sale" documents.
df_sales = df_sales_renamed.filter(col("doctype") == "sale")

display(df_sales.limit(3))
df_sales.printSchema()

# Report the shape as rows x columns; count() runs a Spark job.
row_count = df_sales.count()
column_count = len(df_sales.columns)
print('df_sales, shape: {} x {}'.format(row_count, column_count))


In [None]:
# Aggregate Sales by Customer 

import pyspark.sql.functions as F 

# Aggregate expressions evaluated once per customer_id group.
# customer_id is the grouping key, so F.first() just copies it into the
# _id and pk columns; the remaining expressions count and total the sales rows.
customer_aggregates = [
    F.first('customer_id').alias('_id'),
    F.first('customer_id').alias('pk'),
    F.count("customer_id").alias('order_count'),
    F.sum("total_cost").alias("total_dollar_amount"),
    F.sum("item_count").alias("total_item_count"),
]

# One row per customer, ordered by customer_id ascending.
df_customer_aggregated = (
    df_sales
    .groupBy("customer_id")
    .agg(*customer_aggregates)
    .sort("customer_id", ascending=True)
)

display(df_customer_aggregated.limit(4))
df_customer_aggregated.printSchema()


In [None]:
# Write the df_customer_aggregated to Azure Blob Storage as CSV 
# See https://github.com/Azure-Samples/Synapse/tree/main/Notebooks/PySpark

from pyspark.sql import SparkSession
from pyspark.sql.types import *

import pyspark.sql.functions as F 

# Azure storage account info
blob_account_name   = 'cjoakimstorage'
blob_container_name = 'synapse'
blob_relative_path  = 'retail/sales/mongo/'
linked_service_name = 'SecondaryAzureBlobStorage'

# Credential for the linked service; it is registered below as the SAS token
# for the container. Keep it out of notebook output - printing it would expose
# the credential to anyone who can view the notebook.
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

# The container and account names feed both the wasbs URL and the Spark config key.
container_and_account = (blob_container_name, blob_account_name)

# Base wasbs:// location under which the output is written.
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
    container_and_account + (blob_relative_path,))

# Allow Spark to access the blob container remotely using the SAS token.
sas_conf_key = 'fs.azure.sas.%s.%s.blob.core.windows.net' % container_and_account
spark.conf.set(sas_conf_key, blob_sas_token)

csv_path = wasbs_path + 'sales_by_customer_csv'

print('wasbs_path: ' + wasbs_path)
print('csv_path:   ' + csv_path)

# Collapse to a single partition so the output is written as one CSV part
# file, replacing whatever a previous run left at csv_path.
single_partition_df = df_customer_aggregated.coalesce(1)
single_partition_df.write.csv(csv_path, mode='overwrite', header='true')
print('csv data written to azure storage blob')

#df_agg.coalesce(1).write.json(json_path, mode='overwrite')
