In [1]:
%%configure
{"vCores": 64}

In [3]:
# Upgrade DuckDB to the newest pre-release build. The version is not pinned,
# so the build that gets installed can differ from one run to the next.
!pip install duckdb --pre --upgrade
import sys
# Exit on purpose: the runtime restarts the interpreter after sys.exit, so
# later cells import the freshly installed duckdb instead of the old one.
sys.exit(0)

Collecting duckdb
  Downloading duckdb-1.4.0.dev2447-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (2.0 kB)
Downloading duckdb-1.4.0.dev2447-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (22.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/22.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/22.2 MB[0m [31m146.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/22.2 MB[0m [31m143.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m15.6/22.2 MB[0m [31m156.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m20.6/22.2 MB[0m [31m155.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m22.2/22.2 MB[0m [31m156.9 MB

Installing collected packages: duckdb
  Attempting uninstall: duckdb
    Found existing installation: duckdb 1.2.0
    Uninstalling duckdb-1.2.0:


sys.exit called with value 0. The interpreter will be restarted.
      Successfully uninstalled duckdb-1.2.0
Successfully installed duckdb-1.4.0.dev2447


In [4]:
# Benchmark parameters shared by every cell below.
# num_rows : number of base orders generated; it is also part of the schema
#            name (coffee{num_rows}) used by the write and query cells.
# ws / lh  : OneLake workspace and lakehouse names.
# db_path  : DuckLake metadata catalog file.
#            NOTE(review): the file name is spelled "coffe" (one e); it is kept
#            as-is because renaming would point at a different catalog file.
# results  : Delta table that collects the timings of each benchmark run.
num_rows =   2_000_000_000
ws       =   "largedata"
lh       =   "coffee"
db_path  =   '/lakehouse/default/Files/coffe_meta.db'
# Derived from ws/lh so the results table follows the workspace and lakehouse
# settings instead of repeating them as literals.
results  =   f'abfss://{ws}@onelake.dfs.fabric.microsoft.com/{lh}.Lakehouse/Tables/dbo/results'

In [5]:
import os
import time
from   datetime           import datetime
from   deltalake.writer   import write_deltalake
import pandas as pd
from   psutil import *
import duckdb

In [None]:
# CPU count of the host; cpu_count is brought in by the `from psutil import *`
# wildcard. The value is stored with each benchmark run (the `cpu` column).
core = cpu_count()

# ***Write Data***

In [6]:
%%time
# adapted from here, converted from pyspark to duckdb sql https://www.linkedin.com/pulse/databricks-vs-snowflake-fabric-test-details-josue-a-bogran-zcpke/
if not os.path.exists(f"/lakehouse/default/Tables/coffee{num_rows}"):
    con = duckdb.connect()
    con.sql(f"""
    ATTACH or replace 'ducklake:{db_path}' AS db (DATA_PATH '/lakehouse/default/Tables');
    USE db ;
    create schema if not exists coffee{num_rows} ;
    USE coffee{num_rows} ;
    SET preserve_insertion_order = false;
    SET temp_directory=   '/lakehouse/default/Files/tmp' ;

    -- Step 0: Defined Dimensional Tables
    create table if not exists dim_locations as select * from 'https://raw.githubusercontent.com/JosueBogran/coffeeshopdatagenerator/refs/heads/main/Dim_Locations_Table.csv' ;
    create table if not exists dim_products  as select * from 'https://raw.githubusercontent.com/JosueBogran/coffeeshopdatagenerator/refs/heads/main/Dim_Products_Table.csv' ;


    -- Step 1: Generate base orders
    CREATE OR REPLACE view base_orders AS
    WITH base AS (
        SELECT
            id,
            md5(CAST(id AS VARCHAR) || '_' || random()) AS Order_ID,
            DATE '2023-01-01' + CAST(random() * 730 AS INTEGER) AS Order_Date,
            random() AS rand_lines,
            random() AS rand_tod,
            random() AS rand_loc
        FROM generate_series(0, {num_rows} - 1) tbl(id)
    )
    SELECT
        *,
        EXTRACT('month' FROM Order_Date) AS Month,
        CASE
            WHEN EXTRACT('month' FROM Order_Date) IN (12, 1, 2) THEN 'winter'
            WHEN EXTRACT('month' FROM Order_Date) IN (3, 4, 5) THEN 'spring'
            WHEN EXTRACT('month' FROM Order_Date) IN (6, 7, 8) THEN 'summer'
            ELSE 'fall'
        END AS Season,
        CASE
            WHEN rand_lines < 0.60 THEN 1
            WHEN rand_lines < 0.90 THEN 2
            WHEN rand_lines < 0.95 THEN 3
            WHEN rand_lines < 0.96 THEN 4
            ELSE 5
        END AS Num_Lines,
        CASE
            WHEN rand_tod < 0.50 THEN 'Morning'
            WHEN rand_tod < 0.80 THEN 'Afternoon'
            ELSE 'Night'
        END AS Time_Of_Day,
        CASE
            WHEN rand_loc < 0.30 THEN FLOOR(random() * 50) + 1
            WHEN rand_loc < 0.80 THEN FLOOR(random() * 150) + 51
            WHEN rand_loc < 0.95 THEN FLOOR(random() * 300) + 201
            ELSE FLOOR(random() * 500) + 501
        END AS Location_ID
    FROM base;

    ------------------------------------------------------------
    -- Step 2: Explode orders by Num_Lines
    CREATE OR REPLACE view exploded_orders AS
    SELECT
        b.*,
        s.value AS Line_Val
    FROM base_orders b
    JOIN LATERAL generate_series(1, b.Num_Lines) s(value) ON TRUE;

    -- Step 3: Add line-level randomness
    CREATE OR REPLACE view final_data AS
    SELECT
        *,
        Order_ID || '_' || CAST(Line_Val AS VARCHAR) AS Order_Line_ID,

        -- Quantity
        CASE
            WHEN random() < 0.40 THEN 1
            WHEN random() < 0.70 THEN 2
            WHEN random() < 0.85 THEN 3
            WHEN random() < 0.95 THEN 4
            ELSE 5
        END AS Quantity,

        -- Discount Rate
        CASE
            WHEN random() < 0.80 THEN 0
            ELSE FLOOR(random() * 15 + 1)
        END AS Discount_Rate,

        -- Product_ID distribution by season (cast indexes to BIGINT!)
        CASE
            WHEN Season = 'summer' THEN
                CASE
                    WHEN random() < 0.40 THEN (CASE FLOOR(random() * 2) WHEN 0 THEN 5 ELSE 6 END)
                    WHEN random() < 0.90 THEN (ARRAY[1,2,3,4,7,8,9,10])[CAST(FLOOR(random() * 8) + 1 AS BIGINT)]
                    ELSE (ARRAY[11,12,13])[CAST(FLOOR(random() * 3) + 1 AS BIGINT)]
                END
            ELSE
                CASE
                    WHEN random() < 0.70 THEN (ARRAY[1,2,3,4,7,8,9,10])[CAST(FLOOR(random() * 8) + 1 AS BIGINT)]
                    WHEN random() < 0.80 THEN (CASE FLOOR(random() * 2) WHEN 0 THEN 5 ELSE 6 END)
                    ELSE (ARRAY[11,12,13])[CAST(FLOOR(random() * 3) + 1 AS BIGINT)]
                END
        END AS Product_ID
    FROM exploded_orders;

    -- Step 4: final Data
    CREATE table if not exists fact_sales as
    SELECT
    a.Order_ID ,
    a.order_line_id ,
    a.order_date ,
    a.time_Of_day ,
    a.season ,
    b.location_id ,
    c.name AS product_name ,
    a.quantity ,
    (c.standard_price * ((100-discount_rate)/100)) * a.Quantity AS sales_amount ,
    a.discount_rate AS discount_percentage
    FROM final_data AS a
    LEFT JOIN dim_locations AS b ON (a.Location_ID = b.record_id)
    LEFT JOIN dim_products  AS c ON (a.Product_ID = c.product_id AND a.Order_Date BETWEEN c.from_date AND c.to_date) ;
    """)
    con.close()
    !pip install -q ducklake-delta-exporter
    from ducklake_delta_exporter import generate_latest_delta_log
    generate_latest_delta_log('/lakehouse/default/Files/coffe_meta.db')
else :
    print("data exists already")

data exists already
CPU times: user 1.32 ms, sys: 0 ns, total: 1.32 ms
Wall time: 218 ms


# ***Query Data***

**<mark>SQL</mark>**

In [7]:
# Benchmark workload: ten analytical statements, each preceded by a "-- N)"
# label and terminated by ';'. execute_query splits this script on ';' and
# numbers the pieces from 1, so the reported query index matches the label.
# Keep ';' out of SQL comments and string literals, or the numbering shifts.
# Note: the label of query 6 mentions creating a table, but the statement
# itself is a plain SELECT.
sql ="""
-- 1) Calculate total daily sales for each city and a 7-day rolling average.
SELECT
    f.order_date,
    l.city,
    SUM(f.sales_amount) AS total_sales,
    AVG(SUM(f.sales_amount)) OVER (
        PARTITION BY l.city
        ORDER BY f.order_date
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS rolling_7day_avg
FROM fact_sales f
JOIN dim_locations l
    ON f.location_id = l.location_id
GROUP BY
    f.order_date,
    l.city
ORDER BY
    l.city,
    f.order_date;



-- 2) For each month, rank products by total sales amount, with 1 being the highest.
WITH monthly_sales AS (
    SELECT
        DATE_TRUNC('month', f.order_date) AS sales_month,
        f.product_name,
        SUM(f.sales_amount) AS total_sales
    FROM fact_sales f
    GROUP BY
        DATE_TRUNC('month', f.order_date),
        f.product_name
)
SELECT
    sales_month,
    product_name,
    total_sales,
    RANK() OVER (PARTITION BY sales_month ORDER BY total_sales DESC) AS sales_rank
FROM monthly_sales
ORDER BY sales_month, sales_rank;



-- 3) Find the locations in each season with the highest average discount, limited to top 3.
WITH season_discount AS (
    SELECT
        l.city,
        l.state,
        f.season,
        AVG(f.discount_percentage) AS avg_discount
    FROM fact_sales f
    JOIN dim_locations l
        ON f.location_id = l.location_id
    GROUP BY
        l.city,
        l.state,
        f.season
)
SELECT
    city,
    state,
    season,
    avg_discount,
    discount_rank
FROM (
    SELECT
        city,
        state,
        season,
        avg_discount,
        DENSE_RANK() OVER (PARTITION BY season ORDER BY avg_discount DESC) AS discount_rank
    FROM season_discount
) t
WHERE discount_rank <= 3
ORDER BY season, discount_rank;



-- 4) Compare actual daily sales to standard_price and standard_cost, to show total margin.
--    Join on product_name and date range.
SELECT
    f.order_date,
    f.product_name,
    p.standard_price,
    p.standard_cost,
    SUM(f.quantity) AS total_quantity_sold,
    SUM(f.sales_amount) AS total_sales_amount,
    (p.standard_price - p.standard_cost) * SUM(f.quantity) AS theoretical_margin
FROM fact_sales f
JOIN dim_products p
    ON f.product_name = p.name
    AND f.order_date BETWEEN p.from_date AND p.to_date
GROUP BY
    f.order_date,
    f.product_name,
    p.standard_price,
    p.standard_cost
ORDER BY
    f.order_date,
    f.product_name;



-- 5) Use a window function to calculate a 30-day rolling total quantity sold per city.
WITH daily_city_qty AS (
    SELECT
        f.order_date,
        l.city,
        SUM(f.quantity) AS daily_qty
    FROM fact_sales f
    JOIN dim_locations l
        ON f.location_id = l.location_id
    GROUP BY
        f.order_date,
        l.city
)
SELECT
    order_date,
    city,
    daily_qty,
    SUM(daily_qty) OVER (
        PARTITION BY city
        ORDER BY order_date
        ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
    ) AS rolling_30day_qty
FROM daily_city_qty
ORDER BY city, order_date;



-- 6) Create or replace a table that stores monthly revenue by product category.
WITH monthly_cat AS (
    SELECT
        DATE_TRUNC('month', f.order_date) AS sales_month,
        p.category,
        SUM(f.sales_amount) AS monthly_revenue
    FROM fact_sales f
    JOIN dim_products p
        ON f.product_name = p.name
        AND f.order_date BETWEEN p.from_date AND p.to_date
    GROUP BY
        DATE_TRUNC('month', f.order_date),
        p.category
)
SELECT
    sales_month,
    category,
    monthly_revenue
FROM monthly_cat;



-- 7) Compare total sales by location in 2023 vs. 2024.
WITH yearly_sales AS (
    SELECT
        l.location_id,
        l.city,
        l.state,
        YEAR(f.order_date) AS sales_year,
        SUM(f.sales_amount) AS total_sales_year
    FROM fact_sales f
    JOIN dim_locations l
        ON f.location_id = l.location_id
    GROUP BY
        l.location_id,
        l.city,
        l.state,
        YEAR(f.order_date)
)
SELECT
    city,
    state,
    SUM(CASE WHEN sales_year = 2023 THEN total_sales_year ELSE 0 END) AS sales_2023,
    SUM(CASE WHEN sales_year = 2024 THEN total_sales_year ELSE 0 END) AS sales_2024,
    (SUM(CASE WHEN sales_year = 2024 THEN total_sales_year ELSE 0 END)
     - SUM(CASE WHEN sales_year = 2023 THEN total_sales_year ELSE 0 END)) AS yoy_diff
FROM yearly_sales
GROUP BY
    city,
    state
ORDER BY
    city,
    state;



-- 8) For each city and quarter, rank subcategories by total sales amount.
WITH city_quarter_subcat AS (
    SELECT
        l.city,
        DATE_TRUNC('quarter', f.order_date) AS sales_quarter,
        p.subcategory,
        SUM(f.sales_amount) AS total_sales
    FROM fact_sales f
    JOIN dim_locations l
        ON f.location_id = l.location_id
    JOIN   dim_products p
        ON f.product_name = p.name
        AND f.order_date BETWEEN p.from_date AND p.to_date
    GROUP BY
        l.city,
        DATE_TRUNC('quarter', f.order_date),
        p.subcategory
)
SELECT
    city,
    sales_quarter,
    subcategory,
    total_sales,
    RANK() OVER (PARTITION BY city, sales_quarter ORDER BY total_sales DESC) AS subcat_rank
FROM city_quarter_subcat
ORDER BY city, sales_quarter, subcat_rank;



-- 9) Show average discount by day, and a running cumulative average discount per city.
WITH daily_discount AS (
    SELECT
        l.city,
        f.order_date,
        AVG(f.discount_percentage) AS avg_discount
    FROM fact_sales f
    JOIN dim_locations l
        ON f.location_id = l.location_id
    GROUP BY
        l.city,
        f.order_date
)
SELECT
    city,
    order_date,
    avg_discount,
    AVG(avg_discount) OVER (
        PARTITION BY city
        ORDER BY order_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS cumulative_avg_discount
FROM daily_discount
ORDER BY city, order_date;



-- 10) 90-day rolling count of distinct orders in each city.
WITH daily_orders AS (
    SELECT
        f.order_date,
        l.city,
        COUNT(DISTINCT f.order_id) AS daily_distinct_orders
    FROM fact_sales f
    JOIN dim_locations l
        ON f.location_id = l.location_id
    GROUP BY
        f.order_date,
        l.city
)
SELECT
    order_date,
    city,
    daily_distinct_orders,
    SUM(daily_distinct_orders) OVER (
        PARTITION BY city
        ORDER BY order_date
        ROWS BETWEEN 89 PRECEDING AND CURRENT ROW
    ) AS rolling_90d_distinct_orders
FROM daily_orders
ORDER BY city, order_date;
"""

In [8]:
def execute_query(engine, sql_script, exclude_list=()):
    """Run every ';'-separated statement of a SQL script and time each one.

    The script is split on ';' (a plain text split, so a ';' inside a SQL
    comment or string literal would also split a statement). Pieces are
    numbered from 1 in script order; blank pieces and excluded positions are
    skipped but still consume their number.

    Parameters
    ----------
    engine : object
        Anything exposing ``sql(text)`` whose return value has ``show()``.
    sql_script : str
        One or more SQL statements separated by ';'.
    exclude_list : collection of int, optional
        1-based statement positions to skip. Defaults to skipping nothing.

    Returns
    -------
    pandas.DataFrame
        One row per executed statement with columns ``dur`` (seconds, NaN when
        the statement raised) and ``query`` (the 1-based position). The two
        columns are present even when nothing was executed.
    """
    excluded = frozenset(exclude_list or ())
    timings = []  # not named `results`: that is the notebook-level output path

    for index, statement in enumerate(sql_script.split(";"), start=1):
        if index in excluded or not statement.strip():
            continue

        print('query' + str(index))
        # perf_counter is monotonic, so a wall-clock adjustment during a long
        # query cannot distort (or make negative) the measured duration.
        started = time.perf_counter()
        try:
            engine.sql(statement).show()
            duration = time.perf_counter() - started
        except Exception as er:
            # Best effort: report the failure and record NaN so the remaining
            # statements still run.
            print(er)
            duration = float('nan')
        print(duration)
        timings.append({'dur': duration, 'query': index})

    return pd.DataFrame(timings, columns=['dur', 'query'])

In [10]:
# Open a fresh DuckDB session, register the OneLake access token as a secret,
# attach the DuckLake catalog over abfss and switch to the benchmark schema.
con = duckdb.connect()
con.sql(f"""
    SET temp_directory=   '/lakehouse/default/Files/tmp' ;
    CREATE or replace SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{notebookutils.credentials.getToken('storage')}')   ;
    ATTACH or replace 'ducklake:{db_path}' AS db (DATA_PATH 'abfss://{ws}@onelake.dfs.fabric.microsoft.com/{lh}.Lakehouse/Tables');
    USE db ;
    create schema if not exists coffee{num_rows} ;
    USE coffee{num_rows} ;
    """)

# Time every query of the workload, then tag the timings with run metadata.
# The timestamp is taken after the queries finish, i.e. it marks the end of
# the run.
df = execute_query(con, sql, [])
df = df.assign(
    Engine='duckdb',
    time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    sf=num_rows,
    cpu=core,
    test='coffee',
)

# Append this run to the shared results Delta table.
write_deltalake(results, df, mode="append")

query1


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┬─────────────┬────────────────────┬───────────────────┐
│ Order_Date │    city     │    total_sales     │ rolling_7day_avg  │
│    date    │   varchar   │       double       │      double       │
├────────────┼─────────────┼────────────────────┼───────────────────┤
│ 2023-01-01 │ Austin      │  4159120.985000061 │ 4159120.985000061 │
│ 2023-01-02 │ Austin      │  8323460.895499824 │ 6241290.940249942 │
│ 2023-01-03 │ Austin      │ 8321262.6839998225 │ 6934614.854833235 │
│ 2023-01-04 │ Austin      │  8321460.773999821 │ 7281326.334624882 │
│ 2023-01-05 │ Austin      │  8307517.620499828 │ 7486564.591799872 │
│ 2023-01-06 │ Austin      │  8310185.337499822 │ 7623834.716083196 │
│ 2023-01-07 │ Austin      │  8315671.108499822 │ 7722668.486428429 │
│ 2023-01-08 │ Austin      │  8311736.441999823 │ 8315899.265999824 │
│ 2023-01-09 │ Austin      │   8321641.38699983 │  8315639.33621411 │
│ 2023-01-10 │ Austin      │  8309658.940999826 │ 8313981.658642682 │
│     ·      │   ·  

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────┬────────────────────────┬────────────────────┬────────────┐
│ sales_month │      product_name      │    total_sales     │ sales_rank │
│    date     │        varchar         │       double       │   int64    │
├─────────────┼────────────────────────┼────────────────────┼────────────┤
│ 2023-01-01  │ Frappe                 │ 167449234.46173248 │          1 │
│ 2023-01-01  │ Iced Latte             │ 156068174.62499315 │          2 │
│ 2023-01-01  │ Cappuccino             │ 101406005.72902037 │          3 │
│ 2023-01-01  │ Latte                  │   93079633.5449915 │          4 │
│ 2023-01-01  │ Americano              │  91004841.84000716 │          5 │
│ 2023-01-01  │ Macchiato              │  82760732.36000197 │          6 │
│ 2023-01-01  │ Chamomile              │   72385874.7150018 │          7 │
│ 2023-01-01  │ Chai                   │  66199898.71997284 │          8 │
│ 2023-01-01  │ Green                  │  64134566.03889785 │          9 │
│ 2023-01-01  │ Black    

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────┬─────────┬─────────┬────────────────────┬───────────────┐
│    city     │  state  │ Season  │    avg_discount    │ discount_rank │
│   varchar   │ varchar │ varchar │       double       │     int64     │
├─────────────┼─────────┼─────────┼────────────────────┼───────────────┤
│ Austin      │ TX      │ fall    │  1.600157506307998 │             1 │
│ Los Angeles │ CA      │ fall    │ 1.6000653580580642 │             2 │
│ Charlotte   │ NC      │ fall    │ 1.6000598563776658 │             3 │
│ Los Angeles │ CA      │ spring  │ 1.6005384573476664 │             1 │
│ Charlotte   │ NC      │ spring  │ 1.6000905638645815 │             2 │
│ Houston     │ TX      │ spring  │ 1.6000760234395943 │             3 │
│ Austin      │ TX      │ summer  │ 1.6002353056741372 │             1 │
│ Houston     │ TX      │ summer  │  1.600077690303798 │             2 │
│ Charlotte   │ NC      │ summer  │ 1.5995421232564269 │             3 │
│ Charlotte   │ NC      │ winter  │ 1.6001244341964

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┬────────────────────────┬────────────────┬───────────────┬─────────────────────┬────────────────────┬────────────────────┐
│ Order_Date │      product_name      │ standard_price │ standard_cost │ total_quantity_sold │ total_sales_amount │ theoretical_margin │
│    date    │        varchar         │     double     │    double     │       int128        │       double       │       double       │
├────────────┼────────────────────────┼────────────────┼───────────────┼─────────────────────┼────────────────────┼────────────────────┤
│ 2023-01-01 │ Americano              │            4.4 │           1.5 │              344970 │ 1493489.8439999977 │ 1000413.0000000001 │
│ 2023-01-01 │ Biscotti               │            3.9 │           1.2 │               78963 │  303025.0080000008 │           213200.1 │
│ 2023-01-01 │ Black                  │            3.0 │           0.3 │              344969 │  1018459.319999999 │           931416.3 │
│ 2023-01-01 │ Cappuccino             │  

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┬─────────────┬───────────┬───────────────────┐
│ Order_Date │    city     │ daily_qty │ rolling_30day_qty │
│    date    │   varchar   │  int128   │      int128       │
├────────────┼─────────────┼───────────┼───────────────────┤
│ 2023-01-01 │ Austin      │    990440 │            990440 │
│ 2023-01-02 │ Austin      │   1982126 │           2972566 │
│ 2023-01-03 │ Austin      │   1981264 │           4953830 │
│ 2023-01-04 │ Austin      │   1980353 │           6934183 │
│ 2023-01-05 │ Austin      │   1978525 │           8912708 │
│ 2023-01-06 │ Austin      │   1978569 │          10891277 │
│ 2023-01-07 │ Austin      │   1979879 │          12871156 │
│ 2023-01-08 │ Austin      │   1978500 │          14849656 │
│ 2023-01-09 │ Austin      │   1981410 │          16831066 │
│ 2023-01-10 │ Austin      │   1978795 │          18809861 │
│     ·      │   ·         │      ·    │              ·    │
│     ·      │   ·         │      ·    │              ·    │
│     ·      │   ·      

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────┬──────────┬────────────────────┐
│ sales_month │ category │  monthly_revenue   │
│    date     │ varchar  │       double       │
├─────────────┼──────────┼────────────────────┤
│ 2023-06-01  │ Cold     │   576763984.096597 │
│ 2024-05-01  │ Hot      │  771719759.5368739 │
│ 2024-09-01  │ Hot      │   746782775.604385 │
│ 2023-06-01  │ Shelf    │  21869515.52700626 │
│ 2024-03-01  │ Hot      │  771689682.0098672 │
│ 2024-08-01  │ Hot      │  595329969.6204528 │
│ 2024-12-01  │ Hot      │  759152711.5736321 │
│ 2023-05-01  │ Hot      │  771724889.6808722 │
│ 2023-12-01  │ Hot      │  771657279.3798622 │
│ 2023-04-01  │ Hot      │  746849942.4913827 │
│     ·       │  ·       │          ·         │
│     ·       │  ·       │          ·         │
│     ·       │  ·       │          ·         │
│ 2024-11-01  │ Shelf    │ 21858923.043006256 │
│ 2024-11-01  │ Cold     │ 346054525.50461763 │
│ 2024-04-01  │ Cold     │  346133580.7266161 │
│ 2023-11-01  │ Baked    │ 38609191.1749

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────┬─────────┬────────────────────┬────────────────────┬───────────────────┐
│    city     │  state  │     sales_2023     │     sales_2024     │     yoy_diff      │
│   varchar   │ varchar │       double       │       double       │      double       │
├─────────────┼─────────┼────────────────────┼────────────────────┼───────────────────┤
│ Austin      │ TX      │  3489031742.487912 │ 3577832408.4984536 │ 88800666.01054144 │
│ Charlotte   │ NC      │  3451980594.309409 │   3539621662.01545 │ 87641067.70604086 │
│ Houston     │ TX      │  3488797478.245913 │  3577504910.880454 │ 88707432.63454103 │
│ Los Angeles │ CA      │ 3452522344.5384083 │ 3539957745.3264494 │ 87435400.78804111 │
└─────────────┴─────────┴────────────────────┴────────────────────┴───────────────────┘

4.965765476226807
query8


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────┬───────────────┬─────────────┬────────────────────┬─────────────┐
│    city     │ sales_quarter │ subcategory │    total_sales     │ subcat_rank │
│   varchar   │     date      │   varchar   │       double       │    int64    │
├─────────────┼───────────────┼─────────────┼────────────────────┼─────────────┤
│ Austin      │ 2023-01-01    │ Coffee      │  534320011.5902452 │           1 │
│ Austin      │ 2023-01-01    │ Tea         │ 210581206.30392453 │           2 │
│ Austin      │ 2023-01-01    │ Pastries    │  41113018.89450484 │           3 │
│ Austin      │ 2023-04-01    │ Coffee      │  623131892.7157575 │           1 │
│ Austin      │ 2023-04-01    │ Tea         │  225166575.4642985 │           2 │
│ Austin      │ 2023-04-01    │ Pastries    │  46090134.95499359 │           3 │
│ Austin      │ 2023-07-01    │ Coffee      │  665082998.2034734 │           1 │
│ Austin      │ 2023-07-01    │ Tea         │ 208284302.76941824 │           2 │
│ Austin      │ 2023-07-01  

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────┬────────────┬────────────────────┬─────────────────────────┐
│    city     │ Order_Date │    avg_discount    │ cumulative_avg_discount │
│   varchar   │    date    │       double       │         double          │
├─────────────┼────────────┼────────────────────┼─────────────────────────┤
│ Austin      │ 2023-01-01 │ 1.6005649841315057 │      1.6005649841315057 │
│ Austin      │ 2023-01-02 │ 1.5941461956105583 │      1.5973555898710319 │
│ Austin      │ 2023-01-03 │ 1.6017724560080764 │        1.59882787858338 │
│ Austin      │ 2023-01-04 │ 1.5987332346591085 │       1.598804217602312 │
│ Austin      │ 2023-01-05 │ 1.5971677211575799 │      1.5984769183133656 │
│ Austin      │ 2023-01-06 │ 1.5953807735760066 │      1.5979608941904724 │
│ Austin      │ 2023-01-07 │  1.608884884632693 │      1.5995214642536466 │
│ Austin      │ 2023-01-08 │ 1.6075183365703187 │      1.6005210732932307 │
│ Austin      │ 2023-01-09 │ 1.5935432692790445 │      1.5997457617360988 │
│ Austin    

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┬─────────────┬───────────────────────┬─────────────────────────────┐
│ Order_Date │    city     │ daily_distinct_orders │ rolling_90d_distinct_orders │
│    date    │   varchar   │         int64         │           int128            │
├────────────┼─────────────┼───────────────────────┼─────────────────────────────┤
│ 2023-01-01 │ Austin      │                344531 │                      344531 │
│ 2023-01-02 │ Austin      │                689211 │                     1033742 │
│ 2023-01-03 │ Austin      │                689386 │                     1723128 │
│ 2023-01-04 │ Austin      │                689354 │                     2412482 │
│ 2023-01-05 │ Austin      │                688607 │                     3101089 │
│ 2023-01-06 │ Austin      │                688262 │                     3789351 │
│ 2023-01-07 │ Austin      │                689244 │                     4478595 │
│ 2023-01-08 │ Austin      │                688157 │                     5166752 │
│ 20

In [11]:
# Sanity check: total number of rows in the generated fact table.
con.sql("select count(*) from fact_sales")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│   3180026783 │
└──────────────┘

In [12]:
# Release the DuckDB connection that was opened for the benchmark run.
con.close()

# Results

In [None]:
# Summarise every recorded run at the current scale: total duration, estimated
# cost, and the per-query durations ordered by query number.
# The scale filter uses num_rows instead of a repeated literal so the summary
# follows the configured benchmark size.
# NOTE(review): in the cost formula, cpu/2 and 0.1075 are unexplained
# constants -- presumably a cores-to-capacity-unit conversion and an hourly
# unit price; confirm and move them into the config cell.
duckdb.sql(f""" with xxx as (
     select time,sf,cpu,sum(dur) as duration,list(round(dur,1) order by query) as values, from delta_scan('{results}') where sf ={num_rows}  group by all)
     select time,sf,cpu,round(duration,1) as dur,round((duration/3600) * (cpu/2) * 0.1075,2) as cost,values  from xxx """)

┌─────────────────────┬────────────┬───────┬────────┬────────┬──────────────────────────────────────────────────────────┐
│        time         │     sf     │  cpu  │  dur   │  cost  │                          values                          │
│       varchar       │   int64    │ int64 │ double │ double │                         double[]                         │
├─────────────────────┼────────────┼───────┼────────┼────────┼──────────────────────────────────────────────────────────┤
│ 2025-08-15 05:46:09 │ 2000000000 │    64 │  231.1 │   0.22 │ [51.3, 13.6, 22.3, 15.0, 3.8, 7.4, 5.0, 10.4, 3.7, 98.5] │
│ 2025-08-15 04:37:38 │ 2000000000 │    64 │  226.6 │   0.22 │ [48.8, 13.8, 22.1, 15.5, 3.6, 7.3, 5.0, 10.4, 3.7, 96.3] │
└─────────────────────┴────────────┴───────┴────────┴────────┴──────────────────────────────────────────────────────────┘