# Intermediate Non-Event Models

In [1]:
import os
from pathlib import Path

from dotenv import load_dotenv
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL

  functions.register_function("flatten", flatten)


In [2]:
# The project root is the fourth ancestor of the resolved working directory.
PROJ_ROOT = Path.cwd().resolve().parents[3]
# Despite the name, this is the path of the `.env` file itself.
env_file_dir = PROJ_ROOT.joinpath('.env')
# Load the variables defined in that file; the return value is not needed.
_ = load_dotenv(dotenv_path=env_file_dir, verbose=True)

## About

### Objective

Develop an **intermediate** model that includes logic to combine the following (non-event related) staging models

1. `stg_postgres_orders`
2. `stg_postgres_order_items`
3. `stg_postgres_promos`

**at the order level**. The model should be named `int_orders_joined_to_addresses_promos`.

### Use-Case

To get this **at the order level**, `order_items` must be aggregated from the product granularity to the order level by `order_id` before joining with the two other models. This `order_items` aggregation will be performed here so it does not need to be repeated by the following marts models

1. `fct_user_orders`
2. `fct_promos`
3. `fct_orders`

all of which will use this intermediate model.

### Notes

1. This notebook supports <kbd>Run</kbd> > <kbd>Run All Cells</kbd>.

## User Inputs

In [3]:
#

In [4]:
# Build the SQLAlchemy engine from a Snowflake URL. Every setting except the
# driver name is read from the environment; os.getenv yields None for any
# variable that is not set.
engine = create_engine(
    URL(
        drivername="driver",
        **{
            url_field: os.getenv(env_name)
            for url_field, env_name in {
                "account": "UPLIMIT_SNOWFLAKE_ACCOUNT",
                "user": "UPLIMIT_SNOWFLAKE_USER",
                "password": "UPLIMIT_SNOWFLAKE_PASS",
                "warehouse": "UPLIMIT_SNOWFLAKE_WAREHOUSE",
                "role": "UPLIMIT_SNOWFLAKE_ROLE",
                "database": "UPLIMIT_SNOWFLAKE_DB_NAME",
                "schema": "UPLIMIT_SNOWFLAKE_SCHEMA",
            }.items()
        },
    )
)

## Connect

Load Jupyter SQL extension

In [5]:
%load_ext sql

Connect to the Snowflake database

In [6]:
%sql engine --alias connection

## Exploratory Data Analysis

In [7]:
%%sql
/* Delivery timing per order, compared against the estimated delivery time */
WITH order_delivery_timing AS (
    SELECT status,
           estimated_delivery_at,
           delivered_at,
           -- seconds between order creation and actual delivery
           DATEDIFF(second, created_at, delivered_at) AS delivery_time_seconds,
           -- seconds between order creation and estimated delivery
           DATEDIFF(second, created_at, estimated_delivery_at) AS estimated_delivery_time_seconds,
           -- lateness in seconds; NULL unless delivery came after the estimate
           CASE
               WHEN delivered_at > estimated_delivery_at
               THEN ABS(DATEDIFF(second, delivered_at, estimated_delivery_at))
           END AS delivery_delay_seconds,
           -- FALSE: delivered after the estimate
           -- NULL:  still shipped or preparing
           -- TRUE:  every other row
           CASE
               WHEN status = 'delivered'
                    AND delivered_at > estimated_delivery_at
               THEN FALSE
               WHEN status IN ('shipped', 'preparing')
               THEN NULL
               ELSE TRUE
           END AS is_on_time_delivery
    FROM stg_postgres_orders
)
SELECT *
FROM order_delivery_timing
WHERE status IN ('delivered', 'shipped', 'preparing')
ORDER BY estimated_delivery_at DESC

status,estimated_delivery_at,delivered_at,delivery_time_seconds,estimated_delivery_time_seconds,delivery_delay_seconds,is_on_time_delivery
preparing,,,,,,
preparing,,,,,,
delivered,,2021-02-17 20:31:00,604800.0,,,True
delivered,,2021-02-12 00:54:07,86400.0,,,True
delivered,,2021-02-11 07:04:30,86400.0,,,True
preparing,,,,,,
delivered,,2021-02-14 13:37:32,345600.0,,,True
delivered,,2021-02-12 08:39:03,86400.0,,,True
delivered,,2021-02-11 20:23:20,86400.0,,,True
preparing,,,,,,


Show the different types of order statuses

In [8]:
%%sql
/* One row per order status that occurs in the orders model */
SELECT DISTINCT status
FROM stg_postgres_orders

status
delivered
shipped
preparing


Show the different address states

In [9]:
%%sql
/* One row per state that occurs in the addresses model */
SELECT DISTINCT state AS state_name
FROM stg_postgres_addresses

state_name
California
Arizona
Maryland
Illinois
Florida
Virginia
Pennsylvania
Alabama
Texas
West Virginia


Show the first few rows of the `addresses` table

In [10]:
%%sql
/* Preview only: there is no ORDER BY, so which 4 rows come back is not guaranteed */
SELECT *
FROM stg_postgres_addresses
LIMIT 4

address_id,address,zipcode,state,country
299ec70d-f1e7-4807-9d0e-6fd86e3ebcab,220 Artisan Park,92056,California,United States
957af504-cee3-4f3a-8f5a-eb5a5bd23b16,85717 Carberry Street,85062,Arizona,United States
ff8d0181-f21e-4235-bf7c-e5f917c6b55f,5423 Tennyson Alley,21282,Maryland,United States
b5d85492-b403-4f52-a42b-4363716f4cfb,4 Sutherland Alley,62723,Illinois,United States


Show the first few rows of the `users` table

In [11]:
%%sql
/* Preview only: there is no ORDER BY, so which 4 rows come back is not guaranteed */
SELECT *
FROM stg_postgres_users
LIMIT 4

user_id,first_name,last_name,email,phone_number,created_at,updated_at,address_id
bbe51ac6-6687-4cbe-9178-27d96f90836b,Keslie,Hearmon,khearmon0@netvibes.com,831-155-1615,2020-10-23 20:21:57,2021-01-30 22:49:31,7a4821e6-4e7a-4894-bb35-70ffcf0c3aa8
eb8ac155-9558-47a2-ad5d-3a2518ec563c,Gerhard,Kernan,gkernan1@51.la,879-706-2066,2020-09-24 05:33:20,2020-10-10 06:11:13,dd1048ba-46a5-4d80-a7e1-b1592c161a4c
7091fe90-07da-424c-8da6-24e024503be2,Edithe,Petricek,epetricek2@purevolume.com,407-316-0158,2020-02-20 05:46:49,2020-05-13 08:09:47,4a9bcc05-a07b-4507-9d42-a2325a4844fa
ab21c3fa-34c7-4858-bee5-193775913caf,Danella,Wrankmore,dwrankmore3@europa.eu,118-153-0530,2020-04-01 11:01:25,2020-04-01 12:19:52,6b49d7c6-6ede-4a8a-ade1-ec5888c7ce14


Count the number of users in all the models with `user_id` in them

1. `stg_postgres_users`
2. `stg_postgres_orders`
3. `stg_postgres_events`

In [12]:
%%sql
/* Distinct user_id count per staging model, one result row per model */
SELECT COUNT(DISTINCT user_id) AS num_users,
       'stg_postgres_users' AS model_name
FROM stg_postgres_users

UNION ALL

SELECT COUNT(DISTINCT user_id) AS num_users,
       'stg_postgres_orders' AS model_name
FROM stg_postgres_orders

UNION ALL

SELECT COUNT(DISTINCT user_id) AS num_users,
       'stg_postgres_events' AS model_name
FROM stg_postgres_events

num_users,model_name
130,stg_postgres_users
124,stg_postgres_orders
124,stg_postgres_events


Show the first few rows of the `orders` table

In [13]:
%%sql
/* Preview only: there is no ORDER BY, so which 4 rows come back is not guaranteed */
SELECT *
FROM stg_postgres_orders
LIMIT 4

order_id,user_id,promo_id,address_id,created_at,order_cost,shipping_cost,order_total,tracking_id,shipping_service,estimated_delivery_at,delivered_at,status
8329a65b-7ddf-4250-aeee-bd625f8a401a,b3367c91-53bd-4aac-ab6d-0a596fe382c2,,d2fbe240-64ac-4feb-a360-8a9197f8b8ae,2021-02-11 23:30:34,551.9,5.16,557.06,1b2b3cff-dec1-47f6-a507-a64de9ddc663,ups,2021-02-12 23:30:34,2021-02-17 23:30:34,delivered
f3ab84e6-dec5-43b4-9aa7-45b9988bff58,5faf7aa9-ca1a-44b6-bc00-5954bc39cc91,,4a9bcc05-a07b-4507-9d42-a2325a4844fa,2021-02-11 15:13:09,121.0,2.72,123.72,02039212-f3fa-4bac-abf3-d2ea8365b837,usps,2021-02-16 15:13:09,2021-02-13 15:13:09,delivered
8385cfcd-2b3f-443a-a676-9756f7eb5404,96aa719e-c5a3-4645-ba89-16c304fb59b0,,1ceb9167-9852-45a7-8109-57b077d8a2e0,2021-02-11 02:19:17,205.5,1.44,206.94,19be4f7f-ae92-494e-99bd-6813bb41d592,ups,2022-10-18 10:15:26,,shipped
55164b42-78d3-4c4e-9dd3-cce61b956403,aa2bd310-8be1-4b6c-a852-2789d8652ac7,,c8e06217-c625-471f-8cdc-88c3e39f6b2c,2021-02-11 23:35:14,45.0,8.07,53.07,23c92052-e262-45bf-86af-312cb12ac472,usps,2021-02-14 23:35:14,2021-02-15 23:35:14,delivered


Show the first few rows of the `order_items` table

In [14]:
%%sql
/* Preview only: there is no ORDER BY, so which 8 rows come back is not guaranteed */
SELECT *
FROM stg_postgres_order_items
LIMIT 8

order_id,product_id,quantity
5e75b8f4-e03e-462f-8a91-027bfaf3e8b4,c7050c3b-a898-424d-8d98-ab0aaad7bef4,3
c615ea16-2b87-471c-a40e-f1a1b81df308,e18f33a6-b89a-4fbc-82ad-ccba5bb261cc,2
1747e0c2-2649-4b8b-8048-540425302a8f,80eda933-749d-4fc6-91d5-613d29eb126f,2
44f19d9b-6f8d-4c04-8c1e-ee05a171a48a,689fb64e-a4a2-45c5-b9f2-480c2155624d,4
906ac891-d7a7-4d50-b4c6-c982fae2fc2d,e18f33a6-b89a-4fbc-82ad-ccba5bb261cc,4
885ebd5d-6a82-4802-a241-2e52e223df5a,64d39754-03e4-4fa0-b1ea-5f4293315f67,2
05202733-0e17-4726-97c2-0520c024ab85,689fb64e-a4a2-45c5-b9f2-480c2155624d,4
97c51e08-8026-44cb-af4b-15f8011e4931,bb19d194-e1bd-4358-819e-cd1f1b401c0c,4


Show the number of

1. orders and products
2. distinct orders and distinct products

In [15]:
%%sql
/* Row counts versus distinct counts for the two keys of order_items */
SELECT COUNT(DISTINCT order_id) AS num_distinct_orders,
       COUNT(order_id) AS num_orders,
       COUNT(DISTINCT product_id) AS num_distinct_products,
       COUNT(product_id) AS num_products
FROM stg_postgres_order_items

num_distinct_orders,num_orders,num_distinct_products,num_products
361,862,30,862


Show number of orders that meet any of the following criteria

1. `order_cost` = 0
1. `order_total` = 0

In [16]:
%%sql
/* Count orders whose order_cost or order_total is zero */
SELECT COUNT(*) AS num_zero_cost_orders
FROM stg_postgres_orders
WHERE (order_total = 0)
   OR (order_cost = 0)

num_zero_cost_orders
0


Get the first and last order date from the `orders` table

In [17]:
%%sql
/* Earliest and latest order creation timestamps, reported as dates */
WITH order_window AS (
    SELECT MIN(created_at) AS first_created_at,
           MAX(created_at) AS last_created_at
    FROM stg_postgres_orders
)
SELECT TO_DATE(first_created_at) AS first_order_date,
       TO_DATE(last_created_at) AS last_order_date
FROM order_window

first_order_date,last_order_date
2021-02-10,2021-02-11


Get the addresses with multiple users

In [18]:
%%sql
/* Addresses that appear on more than one user record */
SELECT address_id,
       COUNT(*) AS num_users
FROM stg_postgres_users
GROUP BY address_id
HAVING COUNT(*) > 1

address_id,num_users
4a9bcc05-a07b-4507-9d42-a2325a4844fa,2
6b49d7c6-6ede-4a8a-ade1-ec5888c7ce14,2
ed96dbd0-193e-4906-ae8b-904f50bcb57a,2
b2b4d6cd-3c94-40e0-99aa-9dd9ba011ca6,2
96a40097-0030-4509-bb52-b3251fdef2b6,4
965dbeea-a6d5-467d-9683-914b744ad1ef,2
7ae529bf-b9d0-47ee-acd0-d55e0e6a0d32,2
4934ebb8-8950-4bd0-87ba-a1532716e54d,3
9b6a4ed6-6c27-4102-8d5f-5db7435e80ee,2
f79a5736-1e58-436a-aca9-bc4cf21fef09,2


Show the number of unique

1. users and addresses in the `stg_postgres_users` table
2. addresses in the `stg_postgres_addresses` table
3. users (who have ordered) in the `stg_postgres_orders` table

In [19]:
%%sql
/* Distinct key counts, one result row per (column, table) pair */
SELECT COUNT(DISTINCT user_id) AS number,
       'user_id' AS column_name,
       'stg_postgres_users' AS table_name
FROM stg_postgres_users

UNION ALL

SELECT COUNT(DISTINCT address_id) AS number,
       'address_id' AS column_name,
       'stg_postgres_users' AS table_name
FROM stg_postgres_users

UNION ALL

SELECT COUNT(DISTINCT address_id) AS number,
       'address_id' AS column_name,
       'stg_postgres_addresses' AS table_name
FROM stg_postgres_addresses

UNION ALL

SELECT COUNT(DISTINCT user_id) AS number,
       'user_id' AS column_name,
       'stg_postgres_orders' AS table_name
FROM stg_postgres_orders

number,column_name,table_name
130,user_id,stg_postgres_users
89,address_id,stg_postgres_users
150,address_id,stg_postgres_addresses
124,user_id,stg_postgres_orders


Show users without an address

In [20]:
%%sql
/* Data-quality check: list users whose address_id is missing */
SELECT *
FROM stg_postgres_users
WHERE address_id IS NULL

user_id,first_name,last_name,email,phone_number,created_at,updated_at,address_id


Show orders with a missing delivery address

In [21]:
%%sql
/* Data-quality check: list orders whose address_id is missing */
SELECT *
FROM stg_postgres_orders
WHERE address_id IS NULL

order_id,user_id,promo_id,address_id,created_at,order_cost,shipping_cost,order_total,tracking_id,shipping_service,estimated_delivery_at,delivered_at,status


## Models

### `int_orders_joined_to_addresses_promos`

In [22]:
%%sql
/*
Order profile per user. Every user in stg_postgres_users appears at least once
(users without orders get NULL order columns). Each row carries the order's
costs, promotion discount, item summary, delivery timestamps, the state of the
user's address and an on-time-delivery flag.
*/
/* get orders */
WITH orders AS (
    SELECT *
    FROM stg_postgres_orders
),
/* get users */
users AS (
    SELECT user_id,
           address_id
    FROM stg_postgres_users
),
/* get order items */
order_items AS (
    SELECT *
    FROM stg_postgres_order_items
),
/* get promotion discount in dollars */
promos AS (
    SELECT promo_id,
           discount
    FROM stg_postgres_promos
),
/* get state in which user's address is located */
addresses AS (
    SELECT address_id,
           state AS state_name
    FROM stg_postgres_addresses
),
/* get order item summary per order */
order_items_by_order AS (
    SELECT order_id,
           -- get number of unique greenery products per order
           COUNT(DISTINCT product_id) AS num_unique_products,
           -- get total quantity of products per order
           SUM(quantity) AS total_order_size
    FROM order_items
    GROUP BY order_id
),
/* create order profile from combination of orders, order items and promotion
discount */
order_summary AS (
    SELECT
           -- take the key from orders (not from the item summary) so it stays
           -- populated for an order that has no rows in order_items
           o.order_id,
           o.created_at,
           u.user_id,
           -- address stored on the user record; used to look up state_name
           u.address_id,
           o.order_cost,
           o.shipping_cost,
           p.promo_id,
           -- if no discount is offered then the discount value should be zero
           ZEROIFNULL(p.discount) AS discount,
           o.order_total,
           oi.total_order_size,
           oi.num_unique_products,
           o.status,
           o.delivered_at,
           o.estimated_delivery_at
    FROM users u
    -- use LEFT JOINs starting from users to capture all available users,
    -- including those that have not yet placed orders and so do not yet have
    -- any itemized orders
    LEFT JOIN orders o USING (user_id)
    LEFT JOIN order_items_by_order oi USING (order_id)
    -- use LEFT JOIN to keep orders that have no matching promotion
    LEFT JOIN promos p USING (promo_id)
),
/* append state name to combined order profile */
order_summary_with_state AS (
    SELECT os.order_id,
           os.created_at,
           os.user_id,
           a.state_name,
           os.order_cost,
           os.shipping_cost,
           os.discount,
           os.order_total,
           os.total_order_size,
           os.num_unique_products,
           os.status,
           os.delivered_at,
           os.estimated_delivery_at,
           -- on-time flag, only decided for delivered orders:
           --   FALSE = delivered later than the estimated delivery timestamp
           --   TRUE  = delivered, and not later than the estimate
           --   NULL  = not delivered yet (shipped / preparing) or the row is
           --           a user without any order
           CASE
               WHEN os.status = 'delivered'
                    AND os.delivered_at > os.estimated_delivery_at
               THEN FALSE
               WHEN os.status = 'delivered'
               THEN TRUE
               ELSE NULL
           END AS is_on_time_delivery
    FROM order_summary os
    INNER JOIN addresses a USING (address_id)
)
SELECT *
FROM order_summary_with_state

order_id,created_at,user_id,state_name,order_cost,shipping_cost,discount,order_total,total_order_size,num_unique_products,status,delivered_at,estimated_delivery_at,is_on_time_delivery
8329a65b-7ddf-4250-aeee-bd625f8a401a,2021-02-11 23:30:34,b3367c91-53bd-4aac-ab6d-0a596fe382c2,District of Columbia,551.9,5.16,0,557.06,13,5,delivered,2021-02-17 23:30:34,2021-02-12 23:30:34,False
f3ab84e6-dec5-43b4-9aa7-45b9988bff58,2021-02-11 15:13:09,5faf7aa9-ca1a-44b6-bc00-5954bc39cc91,Texas,121.0,2.72,0,123.72,6,2,delivered,2021-02-13 15:13:09,2021-02-16 15:13:09,True
8385cfcd-2b3f-443a-a676-9756f7eb5404,2021-02-11 02:19:17,96aa719e-c5a3-4645-ba89-16c304fb59b0,Illinois,205.5,1.44,0,206.94,3,2,shipped,,2022-10-18 10:15:26,
55164b42-78d3-4c4e-9dd3-cce61b956403,2021-02-11 23:35:14,aa2bd310-8be1-4b6c-a852-2789d8652ac7,Texas,45.0,8.07,0,53.07,3,1,delivered,2021-02-15 23:35:14,2021-02-14 23:35:14,False
e1ecd50e-033d-4d0c-bcff-137c2eb494dd,2021-02-11 07:08:04,97fec509-ddfd-468b-8c39-ece964fdbcf1,California,150.0,3.18,0,153.18,4,3,shipped,,2021-02-16 07:08:04,
1747e0c2-2649-4b8b-8048-540425302a8f,2021-02-10 10:35:38,e512189a-15a2-4325-b41d-6092f5c03bc9,Colorado,562.9,6.11,0,569.01,10,3,delivered,2021-02-12 10:35:38,2021-02-13 10:35:38,True
1eb77dda-3387-4242-ba28-0b371ea785cf,2021-02-10 15:22:56,393de08c-725c-457a-a591-16030480eb80,New York,373.85,3.23,0,377.08,7,3,shipped,,2021-02-13 15:22:56,
529bcc4d-8fcc-42b5-b4c5-a0e2c8886686,2021-02-10 07:33:27,f47ac0f5-2afa-4e71-8328-bd881947159b,California,82.0,6.37,0,88.37,4,1,delivered,2021-02-15 07:33:27,2021-02-11 07:33:27,False
d0cc9fd6-0ce6-42a4-a77e-fabb1b0962da,2021-02-10 20:17:35,7025daf6-54d9-44be-8282-7923def85169,Virginia,565.0,7.48,13,559.48,7,2,delivered,2021-02-13 20:17:35,2021-02-12 20:17:35,False
e24985f3-2fb3-456e-a1aa-aaf88f490d70,2021-02-11 10:11:14,b9287f6c-c865-49bb-a3c9-da47bbdf3a90,Arizona,256.25,7.94,0,264.19,6,3,shipped,,2022-10-18 10:15:26,


**Notes**

1. Using
   ```sql
   SELECT COUNT(DISTINCT(user_id))
   FROM order_summary_with_state
   ```
   gives all the users in the `stg_postgres_users` table.
2. Using `INNER JOIN` (to only capture orders from known addresses) is not necessary since all orders have an `address_id`.

## Disconnect

Close connection

In [23]:
%sql --close connection