# Process the Customers vs Orders Synapse Link Data

In [None]:
# Read the "customers" container from the Cosmos DB analytical store
# (Synapse Link) through the "demoCosmosDB" linked service.

df_customers = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", "demoCosmosDB")
    .option("spark.cosmos.container", "customers")
    .load()
)

# Show a three-row sample to sanity-check the load.
customers_sample = df_customers.limit(3)
display(customers_sample)

In [None]:
# Read the "orders" container from the Cosmos DB analytical store
# (Synapse Link) through the "demoCosmosDB" linked service.

df_orders = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", "demoCosmosDB")
    .option("spark.cosmos.container", "orders")
    .load()
)

# Show a three-row sample to sanity-check the load.
orders_sample = df_orders.limit(3)
display(orders_sample)

In [None]:
# Keep only the documents whose doctype is exactly 'order'.
# Any other document type in the container (the original note mentions
# line_item and delivery documents) is excluded by this filter.

is_order_doc = df_orders["doctype"] == "order"
df_order_docs = df_orders.filter(is_order_doc)

order_docs_sample = df_order_docs.limit(3)
display(order_docs_sample)

In [None]:
# Display the observed schemas of the DataFrames.
# printSchema() writes the schema tree to stdout and returns None, so it is
# called directly; wrapping it in display() would only hand display() a None.

print('=== df_customers')
df_customers.printSchema()

print('=== df_orders')
df_orders.printSchema()

print('=== df_order_docs')
df_order_docs.printSchema()


In [None]:
# Report the (row count, column count) shape of each DataFrame.
# count() is a Spark action, so every iteration triggers a job.

labelled_frames = (
    ('df_customers:', df_customers),
    ('df_orders', df_orders),
    ('df_order_docs', df_order_docs),
)

for label, frame in labelled_frames:
    print(label)
    shape = (frame.count(), len(frame.columns))
    print(shape)


In [84]:
# Create minimal DataFrames for the join: keep only the join key plus the
# columns used by the join output and the aggregations further down.

from pyspark.sql.functions import col

# Customers: join key and display name.
df_customers_minimal = df_customers.select(
    col('customerId'),
    col('name'))

print('df_customers_minimal')
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in display().
df_customers_minimal.printSchema()
print((df_customers_minimal.count(), len(df_customers_minimal.columns)))

# Orders: taken from the filtered 'order' documents only.
df_orders_minimal = df_order_docs.select(
    col('orderId'),
    col('customerId'),
    col('item_count'),
    col('order_total'))

print('df_orders_minimal')
df_orders_minimal.printSchema()
print((df_orders_minimal.count(), len(df_orders_minimal.columns)))

StatementMeta(poolspark3s, 42, 85, Finished, Available)

df_customers_minimal
root
 |-- customerId: string (nullable = true)
 |-- name: string (nullable = true)

(100000, 2)
df_orders_minimal
root
 |-- orderId: long (nullable = true)
 |-- customerId: string (nullable = true)
 |-- item_count: long (nullable = true)
 |-- order_total: double (nullable = true)

(300000, 4)

In [85]:
# Join each order document to its customer on the shared customerId column.
# Passing the key as a list of column names yields a single customerId column
# in the result instead of one per side.
# how='inner' spells out the default join type: orders whose customerId has no
# matching customer row are dropped from the result.

df_joined = df_orders_minimal.join(
    df_customers_minimal,
    on=['customerId'],
    how='inner')

# printSchema() prints the schema and returns None, so call it directly
# rather than passing its None result to display().
df_joined.printSchema()
print((df_joined.count(), len(df_joined.columns)))


StatementMeta(poolspark3s, 42, 86, Finished, Available)

root
 |-- customerId: string (nullable = true)
 |-- orderId: long (nullable = true)
 |-- item_count: long (nullable = true)
 |-- order_total: double (nullable = true)
 |-- name: string (nullable = true)

(300000, 5)

In [86]:
# Preview the first 30 rows of the joined orders/customers DataFrame.
joined_preview = df_joined.limit(30)
display(joined_preview)

StatementMeta(poolspark3s, 42, 87, Finished, Available)

SynapseWidget(Synapse.DataFrame, 5bff4a2b-6177-4810-8117-a4d5d460d643)

In [None]:


# Total order value per customer.
# DataFrame.alias() names the DataFrame itself, not a column, so calling it
# after sum() leaves the aggregated column with its generated name
# "sum(order_total)". Rename that column explicitly to "total_orders".
df_grouped = (
    df_joined
    .groupBy("customerId")
    .sum("order_total")
    .withColumnRenamed("sum(order_total)", "total_orders")
)

# printSchema() prints the schema and returns None, so call it directly
# rather than passing its None result to display().
df_grouped.printSchema()
print((df_grouped.count(), len(df_grouped.columns)))



In [104]:
import pyspark.sql.functions as F

# Per-customer aggregates with explicit, readable column names:
#   order_count         - number of joined order rows for the customer
#   total_dollar_amount - sum of order_total
#   total_item_count    - sum of item_count
# The call is parenthesised, so no backslash line continuations are needed.
df_agg = df_joined.groupBy("customerId").agg(
    F.count("customerId").alias("order_count"),
    F.sum("order_total").alias("total_dollar_amount"),
    F.sum("item_count").alias("total_item_count"),
)

# printSchema() prints the schema and returns None, so call it directly
# rather than passing its None result to display().
df_agg.printSchema()
print((df_agg.count(), len(df_agg.columns)))

StatementMeta(poolspark3s, 42, 104, Finished, Available)

root
 |-- customerId: string (nullable = true)
 |-- order_count: long (nullable = false)
 |-- total_dollar_amount: double (nullable = true)
 |-- total_item_count: long (nullable = true)

(95119, 4)

In [105]:
# Preview the first 30 rows of the per-customer aggregates.
agg_preview = df_agg.limit(30)
display(agg_preview)

StatementMeta(poolspark3s, 42, 105, Finished, Available)

SynapseWidget(Synapse.DataFrame, 5b68980c-d122-4471-b009-9c0ff5aba167)

In [99]:
import pyspark.sql.functions as F

# Same per-customer aggregation expressed with the dict form of agg()
# ({column: aggregate function name}). This form cannot alias its outputs, so
# the result columns carry generated names such as "sum(order_total)".
df_agg2 = df_joined.groupBy("customerId").agg(
    {"order_total": "sum", "item_count": "sum", "customerId": "count"})

# printSchema() prints the schema and returns None, so call it directly
# rather than passing its None result to display().
df_agg2.printSchema()
print((df_agg2.count(), len(df_agg2.columns)))

StatementMeta(poolspark3s, 42, 99, Finished, Available)

root
 |-- customerId: string (nullable = true)
 |-- sum(item_count): long (nullable = true)
 |-- count(customerId): long (nullable = false)
 |-- sum(order_total): double (nullable = true)

(95119, 4)

In [101]:
# Preview the first 30 rows of the dict-style aggregation result.
agg2_preview = df_agg2.limit(30)
display(agg2_preview)

StatementMeta(poolspark3s, 42, 101, Finished, Available)

SynapseWidget(Synapse.DataFrame, f06ed022-96f5-4fd5-9ff5-ca8b47850d52)