In [30]:
# Third-party dependencies for this session, imported together up front.
import duckdb
import lmstudio as lms

# Handle to the "qwen3-14b" model obtained through the lmstudio client.
model = lms.llm("qwen3-14b")

# Open a DuckDB connection, attach the DS10 database file read-only under the
# alias `ds`, and switch to it so later queries resolve against its tables.
con = duckdb.connect()
con.sql(""" attach or replace '/lakehouse/default/Files/DS10.duckdb' as ds(read_only) ; use ds """)
def textto_sql(text):
    """Send ``text`` to the loaded model and return its reply without reasoning.

    The reply may start with a ``<think>...</think>`` block. The whole block
    (tags *and* the text between them) is removed, because the notebook passes
    the returned string straight to ``con.sql(...)`` and any leftover reasoning
    prose would not be valid SQL.

    Parameters
    ----------
    text : str
        Natural-language question forwarded unchanged to ``model.respond``.

    Returns
    -------
    str
        The model's reply with reasoning blocks removed and leading/trailing
        whitespace stripped.
    """
    import re  # local import keeps this definition self-contained

    reply = model.respond(text).content
    # DOTALL lets `.` span newlines; the non-greedy match stops at the first
    # closing tag so several blocks are removed independently.
    reply = re.sub(r"<think>.*?</think>", "", reply, flags=re.DOTALL)
    # Remove any unpaired tag that is left over (e.g. a truncated reply).
    reply = reply.replace("<think>", "").replace("</think>", "")
    return reply.strip()

In [31]:
%%time
# Ask the model for a yearly return-rate query and keep the reply in `result`
# so the next cell can execute it; printing shows the generated text for review.
result = textto_sql("return rate per year")
print(result)





-- return rate per year  
WITH SalesYear AS (  
    SELECT  
        d.d_year AS sale_year,  
        SUM(ss.ss_sales_price * ss.ss_quantity) AS total_sales_amt -- Corresponds to total_sales measure  
    FROM store_sales AS ss  
    JOIN date_dim AS d ON ss.ss_sold_date_sk = d.d_date_sk  
    GROUP BY d.d_year  
), ReturnsYear AS (  
    SELECT  
        d.d_year AS return_year,  
        SUM(sr.sr_return_amt) AS total_returns_amt -- Corresponds to total_returns measure  
    FROM store_returns AS sr  
    JOIN date_dim AS d ON sr.sr_returned_date_sk = d.d_date_sk  
    GROUP BY d.d_year  
)  
SELECT  
    COALESCE(sy.sale_year, ry.return_year) AS year,  
    COALESCE(sy.total_sales_amt, 0) AS total_sales,  
    COALESCE(ry.total_returns_amt, 0) AS total_returns,  
    (COALESCE(ry.total_returns_amt, 0) / NULLIF(COALESCE(sy.total_sales_amt, 0), 0)) * 100 AS return_rate -- Corresponds to return_rate measure  
FROM SalesYear AS sy  
FULL OUTER JOIN ReturnsYear AS ry ON sy.sale_year 

In [32]:
# Execute the generated SQL held in `result` on the DuckDB connection and print the result table.
con.sql(result).show()

┌───────┬────────────────┬───────────────┬───────────────────┐
│ year  │  total_sales   │ total_returns │    return_rate    │
│ int32 │ decimal(38,2)  │ decimal(38,2) │      double       │
├───────┼────────────────┼───────────────┼───────────────────┤
│  1998 │ 10054130684.38 │  303602905.62 │ 3.019683303815362 │
│  1999 │ 10158280504.79 │  532423267.25 │ 5.241273530485232 │
│  2000 │ 10185411410.38 │  533625372.77 │ 5.239114565624516 │
│  2001 │ 10090278333.54 │  526318126.14 │ 5.216091258756688 │
│  2002 │ 10158546224.68 │  531506273.52 │ 5.232109612581329 │
│  2003 │   110060783.52 │  229645059.45 │ 208.6529389537459 │
└───────┴────────────────┴───────────────┴───────────────────┘



In [36]:
%%time
# Same flow with a new question: overwrite `result` with the model's reply
# for the age-group breakdown and print it before running it.
result = textto_sql(" What is the return rate by customer age group? ")
print(result)





-- What is the return rate by customer age group?
WITH SalesAge AS (
    SELECT
        CASE
            WHEN (d.d_year - c.c_birth_year) < 20 THEN '< 20'
            WHEN (d.d_year - c.c_birth_year) BETWEEN 20 AND 29 THEN '20-29'
            WHEN (d.d_year - c.c_birth_year) BETWEEN 30 AND 39 THEN '30-39'
            WHEN (d.d_year - c.c_birth_year) BETWEEN 40 AND 49 THEN '40-49'
            WHEN (d.d_year - c.c_birth_year) BETWEEN 50 AND 59 THEN '50-59'
            WHEN (d.d_year - c.c_birth_year) >= 60 THEN '60+'
            ELSE 'Unknown'
        END AS age_group,
        SUM(ss.ss_sales_price * ss.ss_quantity) AS total_sales_amt
    FROM store_sales AS ss
    JOIN date_dim AS d ON ss.ss_sold_date_sk = d.d_date_sk
    JOIN customer AS c ON ss.ss_customer_sk = c.c_customer_sk
    WHERE c.c_birth_year IS NOT NULL AND d.d_year IS NOT NULL
    GROUP BY 1
), ReturnsAge AS (
    SELECT
        CASE
            WHEN (d.d_year - c.c_birth_year) < 20 THEN '< 20'
            WHEN (d.d_yea

In [37]:
# Run the age-group query currently stored in `result` and print its rows.
con.sql(result).show()

┌────────────────────┬────────────────┬───────────────┬───────────────────┐
│ customer_age_group │  total_sales   │ total_returns │    return_rate    │
│      varchar       │ decimal(38,2)  │ decimal(38,2) │      double       │
├────────────────────┼────────────────┼───────────────┼───────────────────┤
│ 20-29              │  7058791678.27 │  368351132.47 │ 5.218331256381221 │
│ 30-39              │  7056482302.28 │  368207942.19 │  5.21800985841103 │
│ 40-49              │  7067611102.86 │  369547881.69 │ 5.228752350853851 │
│ 50-59              │  7055230584.75 │  368557846.07 │ 5.223895117852619 │
│ 60+                │ 11998907167.66 │  640297628.71 │ 5.336299545976648 │
│ < 20               │  8442653921.70 │  425835300.01 │ 5.043855924444365 │
└────────────────────┴────────────────┴───────────────┴───────────────────┘



In [38]:
%%time
# Open-ended prompt: the choice of statistical method is left to the model.
# The reply replaces `result` and is printed for inspection.
result = textto_sql(" any days with unusual return rate?, use fancy statistics")
print(result)





-- Identify days with unusual return rates using statistical methods (e.g., Z-score or IQR)

WITH daily_return_rate AS (
    SELECT
        d.d_date,
        SUM(sr.sr_return_amt) AS total_returns,
        SUM(ss.ss_sales_price * ss.ss_quantity) AS total_sales,
        (SUM(sr.sr_return_amt) / NULLIF(SUM(ss.ss_sales_price * ss.ss_quantity), 0)) * 100 AS return_rate
    FROM store_returns sr
    JOIN date_dim d ON sr.sr_returned_date_sk = d.d_date_sk
    JOIN store st ON sr.sr_store_sk = st.s_store_sk
    JOIN store_sales ss ON st.s_store_sk = ss.ss_store_sk AND d.d_date_sk = ss.ss_sold_date_sk
    GROUP BY d.d_date
),
-- Calculate average and standard deviation of return rates for the entire dataset
stats AS (
    SELECT
        AVG(return_rate) AS avg_return_rate,
        STDDEV_SAMP(return_rate) AS std_dev_return_rate
    FROM daily_return_rate
),
-- Flag days with return rate more than 3 standard deviations from the mean (unusual)
unusual_days AS (
    SELECT
        d.d_date,
 

In [39]:
# Execute the outlier-detection query stored in `result` and print the flagged days.
con.sql(result).show()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┬───────────┬───────────┬────────┬───────┬───────┬────────────────────┬─────────────────────┐
│   d_date   │ d_weekend │ d_holiday │ d_year │ d_moy │ d_dow │    return_rate     │       z_score       │
│    date    │  varchar  │  varchar  │ int32  │ int32 │ int32 │       double       │       double        │
├────────────┼───────────┼───────────┼────────┼───────┼───────┼────────────────────┼─────────────────────┤
│ 1998-01-06 │ N         │ N         │   1998 │     1 │     2 │   6.86699102560809 │ -15.676552497343172 │
│ 1998-01-10 │ Y         │ N         │   1998 │     1 │     6 │  35.46659364143699 │ -5.5922741289427575 │
│ 1998-01-13 │ N         │ N         │   1998 │     1 │     2 │ 61.323886931190245 │   3.525060363073109 │
│ 1998-01-15 │ N         │ N         │   1998 │     1 │     4 │ 37.872126239157424 │  -4.744078415683152 │
│ 1998-01-16 │ Y         │ N         │   1998 │     1 │     5 │ 26.080679822640047 │  -8.901766535602244 │
│ 1998-01-17 │ Y         │ N         

In [40]:
%%time
# The prompt spells the birth country as 'USA'; in the recorded run the model
# copies that literal into the WHERE clause of the generated SQL.
result = textto_sql(" Identify the top 10 item categories with the highest total return amount from customers born in 'USA' who made returns in 2001.")
print(result)





-- Identify the top 10 item categories with the highest total return amount from customers born in 'USA' who made returns in 2001.
SELECT 
    i.i_category,
    SUM(sr.sr_return_amt) AS total_return_amount
FROM store_returns sr
JOIN date_dim d ON sr.sr_returned_date_sk = d.d_date_sk
JOIN customer c ON sr.sr_customer_sk = c.c_customer_sk
JOIN item i ON sr.sr_item_sk = i.i_item_sk
WHERE 
    d.d_year = 2001 AND 
    c.c_birth_country = 'USA'
GROUP BY i.i_category
ORDER BY total_return_amount DESC
LIMIT 10;
CPU times: total: 297 ms
Wall time: 1min 41s


In [41]:
# Run the top-categories query; the recorded run returned 0 rows for this filter.
con.sql(result).show()

┌────────────┬─────────────────────┐
│ i_category │ total_return_amount │
│  varchar   │    decimal(38,2)    │
├────────────┴─────────────────────┤
│              0 rows              │
└──────────────────────────────────┘



In [56]:
%%time
# Follow-up to the empty result: ask for the list of birth-country values
# so the spelling used in the data can be checked.
result = textto_sql(" list customer country of birth order alphabetically")
print(result)





-- List customer countries of birth ordered alphabetically  
SELECT DISTINCT c.c_birth_country  
FROM customer AS c  
ORDER BY c.c_birth_country;
CPU times: total: 15.6 ms
Wall time: 14 s


In [57]:
# Run the country-list query; max_rows=1000 is passed so more rows are printed
# than the default display would show (presumably enough for the full list).
con.sql(result).show(max_rows=1000)

┌──────────────────────┐
│   c_birth_country    │
│       varchar        │
├──────────────────────┤
│ AFGHANISTAN          │
│ ALAND ISLANDS        │
│ ALBANIA              │
│ ALGERIA              │
│ AMERICAN SAMOA       │
│ ANDORRA              │
│ ANGOLA               │
│ ANGUILLA             │
│ ANTARCTICA           │
│ ANTIGUA AND BARBUDA  │
│ ARGENTINA            │
│ ARMENIA              │
│ ARUBA                │
│ AUSTRALIA            │
│ AUSTRIA              │
│ AZERBAIJAN           │
│ BAHAMAS              │
│ BAHRAIN              │
│ BANGLADESH           │
│ BARBADOS             │
│ BELARUS              │
│ BELGIUM              │
│ BELIZE               │
│ BENIN                │
│ BERMUDA              │
│ BHUTAN               │
│ BOLIVIA              │
│ BOTSWANA             │
│ BOUVET ISLAND        │
│ BRAZIL               │
│ BRUNEI DARUSSALAM    │
│ BULGARIA             │
│ BURKINA FASO         │
│ BURUNDI              │
│ CAMBODIA             │
│ CAMEROON             │


In [58]:
%%time
# Repeat the 'USA' question and additionally ask the model whether it relied on
# the sample values when choosing the country spelling.
result = textto_sql(""" Identify the top 10 item categories with the highest total return amount from customers born in 'USA'
 who made returns in 2001, for that question did you use the sample values to guess the correct country spelling""")
print(result)





-- Identify the top 10 item categories with the highest total return amount from customers born in 'USA' who made returns in 2001  
SELECT 
    i.i_category,
    SUM(sr.sr_return_amt) AS total_return_amount
FROM store_returns sr
JOIN date_dim d ON sr.sr_returned_date_sk = d.d_date_sk
JOIN customer c ON sr.sr_customer_sk = c.c_customer_sk
JOIN item i ON sr.sr_item_sk = i.i_item_sk
WHERE 
    d.d_year = 2001 AND 
    c.c_birth_country = 'USA'
GROUP BY i.i_category
ORDER BY total_return_amount DESC
LIMIT 10;

-- Answer: Yes, the country spelling "USA" was based on sample values from the semantic model (c_birth_country has sample values like "USA", "INDIA", "CAMEROON").
CPU times: total: 266 ms
Wall time: 1min 26s


Actually, the model was correct; my sample values were wrong! 'USA' should have been 'UNITED STATES'.