# Business Questions

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL

In [None]:
# Project root: entry 3 of the working directory's ancestors, i.e. four
# directory levels above the directory this notebook is run from
PROJ_ROOT = Path.cwd().resolve().parents[3]
# Location of the .env file at the project root
env_file_dir = PROJ_ROOT.joinpath('.env')
# Load the variables defined in the .env file; the returned flag is discarded
_ = load_dotenv(dotenv_path=env_file_dir, verbose=True)

## About

Answer the five business questions about the greenery e-commerce data by querying the dbt models created in my personal Snowflake schema for the week 1 project.

### Notes

1. This notebook supports <kbd>Run</kbd> > <kbd>Run All Cells</kbd>.

## User Inputs

In [None]:
# (placeholder) no inputs are set in this cell

In [None]:
# Snowflake connection parameter -> name of the environment variable holding it
_SNOWFLAKE_ENV_VARS = {
    "account": "UPLIMIT_SNOWFLAKE_ACCOUNT",
    "user": "UPLIMIT_SNOWFLAKE_USER",
    "password": "UPLIMIT_SNOWFLAKE_PASS",
    "warehouse": "UPLIMIT_SNOWFLAKE_WAREHOUSE",
    "role": "UPLIMIT_SNOWFLAKE_ROLE",
    "database": "UPLIMIT_SNOWFLAKE_DB_NAME",
    "schema": "UPLIMIT_SNOWFLAKE_SCHEMA",
}

# Fail fast with a readable message when a variable is unset. os.getenv()
# returns None for an unset variable, and passing None into the connection URL
# only surfaces later as a hard-to-diagnose connection problem.
_missing_env_vars = [
    env_var
    for env_var in _SNOWFLAKE_ENV_VARS.values()
    if os.getenv(env_var) is None
]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing Snowflake environment variable(s): "
        + ", ".join(_missing_env_vars)
    )

# Build the SQLAlchemy engine used by the SQL magic to query Snowflake
engine = create_engine(
    URL(
        # NOTE(review): drivername is kept exactly as in the original cell;
        # confirm whether the Snowflake URL helper actually needs it
        drivername="driver",
        **{
            param: os.environ[env_var]
            for param, env_var in _SNOWFLAKE_ENV_VARS.items()
        },
    )
)

## Connect

Load Jupyter SQL extension

In [None]:
# load the `sql` IPython extension
%load_ext sql

Connect to database

In [None]:
# hand `engine` to the SQL magic and give the connection the alias "connection"
%sql engine --alias connection

## Exploratory Data Analysis

### Question 1

**How many users do we have?**

In [None]:
%%sql
/* preview up to four rows of stg_postgres_users to inspect its columns */
SELECT *
FROM stg_postgres_users
LIMIT 4

In [None]:
%%sql
/* list the primary key(s) declared on the source users table */
SHOW PRIMARY KEYS IN raw.public.users

In [None]:
%%sql
/* every row of stg_postgres_users is one (unique) user, so counting the rows
gives the number of users */
SELECT COUNT(*) AS num_users
FROM stg_postgres_users

**Notes**

1. Since `user_id` is the primary key of the source `raw.public.users` table, each value in that column is unique and identifies exactly one user. So
   ```sql
   COUNT(DISTINCT(user_id))
   ```
   is not required to get the number of users. Instead, we can simply count the number of rows (to get the number of users) using
   ```sql
   COUNT(*)
   ```

### Question 2

**On average, how many orders do we receive per hour?**

In [None]:
%%sql
/* preview up to four rows of stg_postgres_orders to inspect its columns */
SELECT *
FROM stg_postgres_orders
LIMIT 4

In [None]:
%%sql
/* list the primary key(s) declared on the source orders table */
SHOW PRIMARY KEYS IN raw.public.orders

In [None]:
%%sql
/* get the (unique) order ID and the calendar hour (date + hour) in which it
was created (received) */
WITH order_hour AS (
    SELECT order_id,
           -- truncate to the hour instead of extracting the hour-of-day:
           -- date_part('hour', ...) returns 0-23, which would merge the same
           -- hour from different days into one bucket and inflate the average
           date_trunc('hour', created_at) AS order_received_hour
    FROM stg_postgres_orders
),
/* get number of orders per hour from the number of order IDs per hour */
num_orders_per_hour AS (
    SELECT order_received_hour,
           count(*) AS num_orders
    FROM order_hour
    GROUP BY order_received_hour
),
/* get the average of all hourly orders (only hours with at least one order
appear in num_orders_per_hour, so empty hours are not part of the average) */
avg_hourly_orders AS (
    -- round number of orders to show it as an integer
    SELECT round(avg(num_orders)) AS avg_num_hourly_orders_received
    FROM num_orders_per_hour
)
SELECT *
FROM avg_hourly_orders

**Notes**

1. Since `order_id` is the primary key of the source `raw.public.orders` table, each value in that column is unique and identifies exactly one order. So
   ```sql
   COUNT(DISTINCT(order_id))
   ```
   is not required to get the number of orders. Instead, we can simply count the number of rows (to get the number of orders) using
   ```sql
   COUNT(*)
   ```

### Question 3

**On average, how long does an order take from being placed to being delivered?**

In [None]:
%%sql
/* preview up to four rows of stg_postgres_orders, which holds the created_at
and delivered_at timestamps used for the delivery time */
SELECT *
FROM stg_postgres_orders
LIMIT 4

In [None]:
%%sql
/* for each delivered order, get delivery time as difference in seconds between
order creation and delivery timestamps */
WITH delivery_times AS (
    SELECT
        datediff('second', created_at, delivered_at) AS delivery_time_seconds
    FROM stg_postgres_orders
    -- an order without a delivery timestamp has no delivery time yet; avg()
    -- would skip the resulting NULL anyway, the filter makes that explicit
    WHERE delivered_at IS NOT NULL
),
/* get the average delivery time in (whole) seconds across all orders */
avg_delivery_time AS (
    SELECT
        -- round average delivery time from fractional to whole seconds
        round(avg(delivery_time_seconds)) AS avg_delivery_time_seconds
    FROM delivery_times
),
/* split the average delivery time into whole days plus an HH:MM:SS remainder.
to_time() on a seconds string only keeps the time-of-day part, so any delivery
time of 24 hours or more would silently lose its whole days if the days were
not reported separately */
avg_delivery_time_formatted AS (
    SELECT -- average delivery time in seconds
           avg_delivery_time_seconds,
           -- whole days contained in the average delivery time
           floor(avg_delivery_time_seconds / 86400) AS avg_delivery_time_days,
           -- remaining hours:minutes:seconds after removing the whole days
           to_time(
               cast(mod(avg_delivery_time_seconds, 86400) AS VARCHAR)
           ) AS avg_delivery_time_hhmmss
    FROM avg_delivery_time
)
SELECT *
FROM avg_delivery_time_formatted

**Notes**

1. Since the `order_id` is the primary key of the source `raw.public.orders` table, each row in that column corresponds to a unique order. We can calculate the delivery time for each (unique) order by calculating the time difference between the (order) `created_at` and (order) `delivered_at` columns. Then, the overall average delivery time is the average of the delivery time for each order.

### Question 4

**How many users have only made one purchase? Two purchases? Three+ purchases? Note: you should consider a purchase to be a single order. In other words, if a user places one order for 3 products, they are considered to have made 1 purchase.**

As seen earlier, the `order_id` is the primary key of the source `raw.public.orders` table. So, each row in that column corresponds to a unique order. Since a purchase is a single (unique) order, we get the number of purchases per user by counting the number of rows in the `orders` table per user.

This is shown below

In [None]:
%%sql
/* get number of purchases (orders) per user */
WITH num_purchases_per_user AS (
    SELECT user_id,
           COUNT(*) AS num_purchases
    FROM stg_postgres_orders
    GROUP BY user_id
),
/* bin number of user purchases into bins of 1, 2, 3+ purchases */
num_user_purchases_binned AS (
    SELECT *,
           (
               CASE
                   -- one or two purchases keep their own bin label
                   WHEN num_purchases IN (1,2)
                   THEN CAST(num_purchases AS TEXT)
                   -- a user only appears here with at least one order, so
                   -- everything else is three or more purchases
                   ELSE '3+'
               END
           ) AS num_purchases_binned
    FROM num_purchases_per_user
),
/* count the number of users in each bin of user purchases */
num_users_per_bin AS (
    SELECT num_purchases_binned AS num_purchases,
           COUNT(*) AS num_users
    FROM num_user_purchases_binned
    GROUP BY num_purchases_binned
)
SELECT *
FROM num_users_per_bin
-- sort in the final query: an ORDER BY inside a CTE does not guarantee the
-- order of the rows returned by the outer SELECT
ORDER BY num_purchases

### Question 5

**On average, how many unique sessions do we have per hour?**

In [None]:
%%sql
/* preview up to four rows of stg_postgres_events to inspect its columns */
SELECT *
FROM stg_postgres_events
LIMIT 4

In [None]:
%%sql
/* list the primary key(s) declared on the source events table */
SHOW PRIMARY KEYS IN raw.public.events

In [None]:
%%sql
/* for every event, get its session ID and the calendar hour (date + hour) in
which the event was created */
WITH session_hour AS (
    SELECT session_id,
           -- truncate to the hour instead of extracting the hour-of-day:
           -- date_part('hour', ...) returns 0-23, which would merge the same
           -- hour from different days into one bucket and inflate the average
           date_trunc('hour', created_at) AS session_start_hour
    FROM stg_postgres_events
),
/* get number of sessions per hour from the number of distinct session IDs
seen in that hour (a session has one row per event, so rows are not counted) */
num_sessions_per_hour AS (
    SELECT session_start_hour,
           count(distinct(session_id)) AS num_sessions
    FROM session_hour
    GROUP BY session_start_hour
),
/* get the average of all hourly sessions (only hours with at least one event
appear in num_sessions_per_hour, so empty hours are not part of the average) */
avg_hourly_sessions AS (
    -- round number of sessions to show it as an integer
    SELECT round(avg(num_sessions)) AS avg_num_hourly_sessions
    FROM num_sessions_per_hour
)
SELECT *
FROM avg_hourly_sessions

**Notes**

1. Since `session_id` is **not** the primary key of the `raw.public.events` table, the same `session_id` can appear in more than one row (one row per event). So we cannot simply count the number of rows using
   ```sql
   COUNT(*)
   ```
   to get the number of sessions, since this count would give the number of events. Instead
   ```sql
   COUNT(DISTINCT(session_id))
   ```
   is required to get the number of (unique) sessions.

## Disconnect

Close connection

In [None]:
# close the connection that was registered under the alias "connection"
%sql --close connection