# Process the Customers vs Orders Synapse Link Data

In [40]:
# Read the "customers" container through the "demoCosmosDB" linked service
# (Synapse Link, "cosmos.olap" format) into a Spark DataFrame.

df_customers = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", "demoCosmosDB")
    .option("spark.cosmos.container", "customers")
    .load()
)

# Preview only the first three rows.
display(df_customers.limit(3))

StatementMeta(poolspark3s, 42, 41, Finished, Available)

SynapseWidget(Synapse.DataFrame, cc7d65b8-1c00-4e3e-abd5-0e7e64e1bd89)

In [41]:
# Read the "orders" container through the "demoCosmosDB" linked service
# (Synapse Link, "cosmos.olap" format) into a Spark DataFrame.

df_orders = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", "demoCosmosDB")
    .option("spark.cosmos.container", "orders")
    .load()
)

# Preview only the first three rows.
display(df_orders.limit(3))

StatementMeta(poolspark3s, 42, 42, Finished, Available)

SynapseWidget(Synapse.DataFrame, a7650153-10ac-4096-910f-0346dbadeb63)

In [42]:
# Keep only the documents whose doctype is 'order'.
# The line_item and delivery document types are excluded.

df_order_docs = df_orders.where(df_orders.doctype.isin("order"))

# Preview only the first three rows.
display(df_order_docs.limit(3))

StatementMeta(poolspark3s, 42, 43, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6cd7adae-b2ca-4e30-b4c9-f935065e888b)

In [43]:
# Display the observed schemas of the DataFrames.
# printSchema() writes the schema tree to stdout and returns None, so it is
# called directly for every frame instead of being wrapped in display().

print('=== df_customers')
df_customers.printSchema()

print('=== df_orders')
df_orders.printSchema()

print('=== df_order_docs')
df_order_docs.printSchema()


StatementMeta(poolspark3s, 42, 44, Finished, Available)

=== df_customers
root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- pk: string (nullable = true)
 |-- doctype: string (nullable = true)
 |-- customerId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- doc_epoch: long (nullable = true)
 |-- doc_time: string (nullable = true)
 |-- id: string (nullable = true)
 |-- _etag: string (nullable = true)

=== df_orders
root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- pk: long (nullable = true)
 |-- doctype: string (nullable = true)
 |-- orderId: long (nullable = true)
 |-- lineNumber: long (nullable = true)
 |-- customerId: string (nullable = true)
 |-- sku: long (nullable = true)
 |-- name: string 

In [44]:
# Report the shape of each DataFrame as a (row count, column count) tuple,
# printed under a label line naming the frame.

for label, frame in (
    ('df_customers:', df_customers),
    ('df_orders', df_orders),
    ('df_order_docs', df_order_docs),
):
    print(label)
    print((frame.count(), len(frame.columns)))


StatementMeta(poolspark3s, 42, 45, Finished, Available)

df_customers:
(100000, 13)
df_orders
(1049182, 21)
df_order_docs
(300000, 21)

In [45]:
# Inner-join the order documents to the customers on customerId.
#
# NOTE: the join condition is an expression, so the columns of both inputs are
# kept. The two frames share several column names (e.g. _rid, _ts, pk, doctype,
# customerId, name), which therefore appear twice in df_joined; refer to them
# through the originating frame (df_order_docs.name vs. df_customers.name)
# rather than by bare name.

df_joined = df_order_docs.join(
    df_customers,
    df_order_docs.customerId == df_customers.customerId,
    "inner",
)

# printSchema() prints the schema itself and returns None, so it is not
# wrapped in display().
df_joined.printSchema()

# Shape of the joined frame: (row count, column count).
print((df_joined.count(), len(df_joined.columns)))


StatementMeta(poolspark3s, 42, 46, Finished, Available)

root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- pk: long (nullable = true)
 |-- doctype: string (nullable = true)
 |-- orderId: long (nullable = true)
 |-- lineNumber: long (nullable = true)
 |-- customerId: string (nullable = true)
 |-- sku: long (nullable = true)
 |-- name: string (nullable = true)
 |-- qty: long (nullable = true)
 |-- price: double (nullable = true)
 |-- item_total: double (nullable = true)
 |-- doc_epoch: long (nullable = true)
 |-- doc_time: string (nullable = true)
 |-- id: string (nullable = true)
 |-- _etag: string (nullable = true)
 |-- status: string (nullable = true)
 |-- date_time: string (nullable = true)
 |-- item_count: long (nullable = true)
 |-- order_total: double (nullable = true)
 |-- delivery_count: long (nullable = true)
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- pk: string (nullable = true)
 |-- doctype: string (nullable = true)
 |-- customerId: string (nullable = true)
 |-- name: stri