# Corrected Answers to Business Questions

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL

In [None]:
# Locate the project root relative to the current working directory and load
# environment variables from the `.env` file found there.
# NOTE(review): `parents[3]` assumes the notebook is run from a directory four
# levels below the project root — confirm against the repository layout.
PROJ_ROOT = Path(".").resolve().parents[3]
env_file_dir = PROJ_ROOT.joinpath(".env")
_ = load_dotenv(dotenv_path=env_file_dir, verbose=True)

## About

Corrections to answers to business questions 2, 3 and 5. All queries were run using DBT models created in my personal Snowflake schema for week 1 project.

No changes are made to the queries that answer questions 1 and 4.

### Notes

1. This notebook supports <kbd>Run</kbd> > <kbd>Run All Cells</kbd>.

## User Inputs

In [None]:
#

In [None]:
# Snowflake connection settings, read from the UPLIMIT_SNOWFLAKE_* environment
# variables.
# NOTE(review): `os.getenv` returns None for any variable that is not set —
# confirm all of them are defined before running this cell.
# NOTE(review): confirm that the literal `drivername="driver"` is intended.
snowflake_connection_params = {
    "drivername": "driver",
    "account": os.getenv("UPLIMIT_SNOWFLAKE_ACCOUNT"),
    "user": os.getenv("UPLIMIT_SNOWFLAKE_USER"),
    "password": os.getenv("UPLIMIT_SNOWFLAKE_PASS"),
    "warehouse": os.getenv("UPLIMIT_SNOWFLAKE_WAREHOUSE"),
    "role": os.getenv("UPLIMIT_SNOWFLAKE_ROLE"),
    "database": os.getenv("UPLIMIT_SNOWFLAKE_DB_NAME"),
    "schema": os.getenv("UPLIMIT_SNOWFLAKE_SCHEMA"),
}

# SQLAlchemy engine used by the SQL magic cells further down
engine = create_engine(URL(**snowflake_connection_params))

## Connect

Load Jupyter SQL extension

In [None]:
%load_ext sql

Connect to the Snowflake database

In [None]:
%sql engine --alias connection

## Exploratory Data Analysis

### Question 1

**How many users do we have?**

In [None]:
%%sql
/* preview a few rows of the staged users model */
SELECT * FROM stg_postgres_users LIMIT 4

In [None]:
%%sql
/* list the primary key column(s) declared on the source users table */
SHOW PRIMARY KEYS IN TABLE raw.public.users

In [None]:
%%sql
/* every row of the users model is one user, so the row count is the number
of users */
SELECT COUNT(*) AS num_users
FROM stg_postgres_users

**Notes**

1. Since the `user_id` is the primary key of the source `raw.public.users` table, each row in that column is unique. So
   ```sql
   COUNT(DISTINCT(user_id))
   ```
   is not required to get the number of users. Instead, we can simply count the number of rows (to get the number of users) using
   ```sql
   COUNT(*)
   ```

### Question 2

**On average, how many orders do we receive per hour?**

In [None]:
%%sql
/* preview a few rows of the staged orders model */
SELECT * FROM stg_postgres_orders LIMIT 4

In [None]:
%%sql
/* list the primary key column(s) declared on the source orders table */
SHOW PRIMARY KEYS IN TABLE raw.public.orders

In [None]:
%%sql
/* rows with a missing order ID = all rows minus the rows whose order ID is
not NULL (count(column) skips NULLs) */
SELECT count(*) - count(order_id) AS num_missing_order_ids
FROM stg_postgres_orders

In [None]:
%%sql
/* for every order, show its ID together with the year, month, day and hour
at which the order was created */
SELECT
    order_id,
    date_part('year', created_at)  AS order_created_at_year,
    date_part('month', created_at) AS order_created_at_month,
    date_part('day', created_at)   AS order_created_at_day,
    date_part('hour', created_at)  AS order_created_at_hour
FROM stg_postgres_orders

In [None]:
%%sql
/* truncate each order's creation timestamp to the calendar hour (year, month,
day and hour) in which the order was created */
WITH orders_by_hour AS (
    SELECT order_id,
           date_trunc('hour', created_at) AS order_created_at_hour
    FROM stg_postgres_orders
),
/* count the distinct orders created within each calendar hour */
hourly_order_counts AS (
    SELECT order_created_at_hour,
           count(DISTINCT order_id) AS num_orders
    FROM orders_by_hour
    GROUP BY order_created_at_hour
),
/* average the hourly order counts over all hours in which at least one order
was created */
hourly_order_average AS (
    SELECT avg(num_orders) AS avg_num_orders_per_hour,
           round(avg(num_orders)) AS avg_num_orders_per_hour_rounded
    FROM hourly_order_counts
)
SELECT *
FROM hourly_order_average

**Notes**

1. Since the `order_id` is the primary key of the source `raw.public.orders` table, each row in that column is unique. So
   ```sql
   COUNT(DISTINCT(order_id))
   ```
   is not required to get the number of orders. Instead, we can simply count the number of rows (to get the number of orders) using
   ```sql
   COUNT(*)
   ```

### Question 3

**On average, how long does an order take from being placed to being delivered?**

In [None]:
%%sql
/* preview a few rows of the staged orders model, including the created_at
and delivered_at timestamps used below */
SELECT * FROM stg_postgres_orders LIMIT 4

Some orders are missing an order delivery timestamp

In [None]:
%%sql
/* count the orders that have no delivery timestamp; the column is named
after what is actually counted (missing delivered_at values, not missing
order IDs) */
SELECT count(*) AS num_orders_missing_delivered_at
FROM stg_postgres_orders
WHERE delivered_at IS NULL

If the order delivery timestamp is missing then the order status indicates it has not been delivered

In [None]:
%%sql
/* list each status value that occurs among orders without a delivery
timestamp */
SELECT status
FROM stg_postgres_orders
WHERE delivered_at IS NULL
GROUP BY status

In [None]:
%%sql
/* for each delivered order, get delivery time as difference in seconds
between order creation and delivery timestamps */
WITH delivery_times AS (
    SELECT
        datediff('second', created_at, delivered_at) AS delivery_time_seconds
    FROM stg_postgres_orders
    WHERE delivered_at IS NOT NULL
),
/* get the average delivery time across all delivered orders, rounded to a
whole number of seconds; rounding once here means the days/hours/minutes/
seconds shown below always add up to the reported total */
avg_delivery_time AS (
    SELECT round(avg(delivery_time_seconds)) AS avg_delivery_time_seconds
    FROM delivery_times
),
/* split the (whole) average number of seconds into days, hours, minutes and
seconds */
avg_delivery_time_formatted AS (
    SELECT -- average delivery time in whole seconds
           avg_delivery_time_seconds,
           -- whole days: 86400 seconds per day
           floor(avg_delivery_time_seconds / 86400) AS days,
           -- whole hours left over after removing whole days
           floor((avg_delivery_time_seconds % 86400) / 3600) AS hours,
           -- whole minutes left over after removing whole hours
           floor((avg_delivery_time_seconds % 3600) / 60) AS minutes,
           -- seconds left over after removing whole minutes
           avg_delivery_time_seconds % 60 AS seconds
    FROM avg_delivery_time
)
SELECT *
FROM avg_delivery_time_formatted

**Notes**

1. Since the `order_id` is the primary key of the source `raw.public.orders` table, each row in that column corresponds to a unique order. We can calculate the delivery time for each (unique) order by calculating the time difference between the (order) `created_at` and (order) `delivered_at` columns. Then, the overall average delivery time is the average of the delivery time for each order.
2. The average delivery time should be calculated from delivered orders, which have a non-`NULL` delivery time (`delivered_at`). Without the `WHERE` clause
   ```sql
   WHERE delivered_at IS NOT NULL
   ```
   orders that are not delivered give a `delivery_time_seconds` that is `NULL`. Snowflake's `AVG` function ignores `NULL` values when calculating the average. So, the correct average delivery time in seconds is calculated even if this `WHERE` clause is omitted. However, for clarity, this `WHERE` clause is kept in the query.

### Question 4

**How many users have only made one purchase? Two purchases? Three+ purchases? Note: you should consider a purchase to be a single order. In other words, if a user places one order for 3 products, they are considered to have made 1 purchase.**

As seen earlier, the `order_id` is the primary key of the source `raw.public.orders` table. So, each row in that column corresponds to a unique order. Since a purchase is a single (unique) order, we get the number of purchases per user by counting the number of rows in the `orders` table per user.

This is shown below

In [None]:
%%sql
/* get number of purchases (rows in the orders model) per user */
WITH num_purchases_per_user AS (
    SELECT user_id,
           COUNT(*) AS num_purchases
    FROM stg_postgres_orders
    GROUP BY user_id
),
/* bin number of user purchases into bins of 1, 2, 3+ purchases */
num_user_purchases_binned AS (
    SELECT *,
           (
               CASE
                   WHEN num_purchases IN (1,2)
                   THEN CAST(num_purchases AS TEXT)
                   ELSE '3+'
               END
           ) AS num_purchases_binned
    FROM num_purchases_per_user
),
/* count the number of users in each bin of user purchases */
num_users_per_bin AS (
    SELECT num_purchases_binned AS num_purchases,
           COUNT(*) AS num_users
    FROM num_user_purchases_binned
    GROUP BY num_purchases_binned
)
/* sort at the top level: an ORDER BY inside a CTE is not guaranteed to
determine the row order of the final result */
SELECT *
FROM num_users_per_bin
ORDER BY num_purchases

### Question 5

**On average, how many unique sessions do we have per hour?**

In [None]:
%%sql
/* preview a few rows of the staged events model */
SELECT * FROM stg_postgres_events LIMIT 4

In [None]:
%%sql
/* list the primary key column(s) declared on the source events table */
SHOW PRIMARY KEYS IN TABLE raw.public.events

In [None]:
%%sql
/* rows with a missing session ID = all rows minus the rows whose session ID
is not NULL (count(column) skips NULLs) */
SELECT count(*) - count(session_id) AS num_missing_session_ids
FROM stg_postgres_events

In [None]:
%%sql
/* for every event, get its session ID and the year, month, day and hour at
which the event was created */
WITH events_with_datetime_attributes AS (
    SELECT session_id,
           date_part('year', created_at) AS event_created_at_year,
           date_part('month', created_at) AS event_created_at_month,
           date_part('day', created_at) AS event_created_at_day,
           date_part('hour', created_at) AS event_created_at_hour
    FROM stg_postgres_events
),
/* get the number of unique sessions per day per hour; a session with events
in more than one hour is counted once in each of those hours */
num_sessions_per_day_per_hour AS (
    SELECT event_created_at_year,
           event_created_at_month,
           event_created_at_day,
           event_created_at_hour,
           count(distinct(session_id)) AS num_sessions
    FROM events_with_datetime_attributes
    GROUP BY ALL
),
/* get the average of hourly sessions, across all hours that have at least
one event */
avg_hourly_sessions AS (
    SELECT avg(num_sessions) AS avg_num_sessions_per_hour,
           round(avg(num_sessions)) AS avg_num_sessions_per_hour_rounded
    FROM num_sessions_per_day_per_hour
)
SELECT *
FROM avg_hourly_sessions

**Notes**

1. Since `session_id` is **not** the primary key of the `raw.public.events` table, the values in that column are not unique (one session can span many event rows). So we cannot simply count the number of rows using
   ```sql
   COUNT(*)
   ```
   to get the number of sessions, since this count would give the number of events. Instead
   ```sql
   COUNT(DISTINCT(session_id))
   ```
   is required to get the number of (unique) sessions.

## Disconnect

Close connection

In [None]:
%sql --close connection

## Links

1. [Convert total seconds into days, hours, minutes and seconds](https://stackoverflow.com/a/75761703)