# Execute on different backends

In [1]:
from IPython.display import HTML, display

import pandas as pd
# Lift every pandas display limit (columns, rows, cell width) so that result
# frames are rendered in full rather than truncated.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# TODO: Set the minimal code for this, within the notebook for clarity
import datafusion
from libs.helpers.utils_db import query_trino, query_duckdb, query_bigquery, query_datafusion

In [2]:
import featuremesh
%reload_ext featuremesh

from libs.helpers.utils import get_featuremesh_config

fm_config = get_featuremesh_config()

# Access to local endpoint
# The access token is read exclusively from the FeatureMesh configuration.
# Never paste a literal token into this cell: the notebook is shared with its
# contents, and a token embeds the identity of whoever generated it.
__ACCESS_TOKEN__ = fm_config['access_token']

featuremesh.set_default('registry.host', fm_config['registry.host'])

# One offline client per SQL backend, each paired with the matching query
# executor. `client_duckdb_anon` is built with `None` in place of a token.
client_duckdb_anon = featuremesh.OfflineClient(None, featuremesh.Backend.DUCKDB, query_duckdb)
client_duckdb = featuremesh.OfflineClient(__ACCESS_TOKEN__, featuremesh.Backend.DUCKDB, query_duckdb)
client_trino = featuremesh.OfflineClient(__ACCESS_TOKEN__, featuremesh.Backend.TRINO, query_trino)
client_bigquery = featuremesh.OfflineClient(__ACCESS_TOKEN__, featuremesh.Backend.BIGQUERY, query_bigquery)
client_datafusion = featuremesh.OfflineClient(__ACCESS_TOKEN__, featuremesh.Backend.DATAFUSION, query_datafusion)

# The online client takes only the token (no backend, no local executor).
client_online = featuremesh.OnlineClient(access_token=__ACCESS_TOKEN__)

# DuckDB (authenticated) is registered as the default client.
featuremesh.set_default('client', client_duckdb)

## Check that the executors run correctly

In [3]:
# Disabled debug helper: sends SIGTERM to a process identified by its PID.
# NOTE(review): 63768 is a hard-coded PID, presumably from a past session —
# replace it with the current PID before re-enabling these lines.
# import os
# import signal
# os.kill(63768, signal.SIGTERM)

In [4]:
# Log in to BigQuery (Application Default Credentials) using either:
# gcloud auth application-default login --quiet --project=your_project
# or
# gcloud auth application-default login
# gcloud auth application-default set-quota-project your_project

In [5]:
# !pip install datafusion==48.0.0

In [6]:
# Show the version of the datafusion package loaded in this kernel.
# NOTE(review): the commented-out install command above pins 48.0.0, while the
# saved output of this cell reports 50.1.0 — confirm the intended version.
datafusion.__version__

'50.1.0'

In [7]:
# Smoke test: send the same trivial statement to each raw SQL executor and
# render every result under a heading naming the executor function.
query = "SELECT 1+2 AS COL1"
sql_executors = (query_duckdb, query_trino, query_bigquery, query_datafusion)
for run_sql in sql_executors:
    heading_html = f"<h3>Execute query with {run_sql.__name__}</h3>"
    display(HTML(heading_html))
    display(run_sql(query))

Unnamed: 0,COL1
0,3


Unnamed: 0,COL1
0,3


Unnamed: 0,COL1
0,3


Unnamed: 0,col1
0,3


## Execute with local client and magic commands

In [8]:
%%featureql

WITH 
    INPUT(BIGINT) AS INPUT1, 
    INPUT(BIGINT) AS INPUT2
SELECT 
    -- Bind three (INPUT1, INPUT2) pairs inline, then return their sum per pair as RESULT.
    -- NOTE(review): the Python cell further down binds the same pairs with BIND_TABLE;
    -- confirm which of the two forms is the current one for multi-input binding.
    BIND_VALUES(ARRAY[ROW(2,3), ROW(2,4), ROW(2,5)]) AS (INPUT1, INPUT2), 
    INPUT1 + INPUT2 AS RESULT
;

Unnamed: 0,INPUT1,INPUT2,RESULT
0,2,3,5
1,2,4,6
2,2,5,7


In [9]:
%%featureql

WITH
    customers := ENTITY(),
    customer_id := INPUT(BIGINT#customers),
    orders := ENTITY(),
    order_id := INPUT(BIGINT#orders),
    products_source := EXTERNAL_SQL(
        `SELECT customer_id, orders_list
        FROM (VALUES
            (1, ARRAY[
                ROW(10, ARRAY[
                    ROW('item01', 'A', 10e0),
                    ROW('item02', 'B', 20e0)
                ])
                ]
            ),
            (2, ARRAY[
                ROW(11, ARRAY[
                    ROW('item01', 'A', 10e0),
                    ROW('item02', 'B', 20e0),
                ]), 
                ROW(12, ARRAY[
                    ROW('item03', 'B', 30e0),
                    ROW('item11', 'A', 110e0),
                    ROW('item12', 'A', 120e0)
                ])
            ])
        ) as t(customer_id, orders_list)`
        ON `SELF.customer_id=%CUSTOMER_ID`
        AS ROW(customer_id BIGINT#CUSTOMERS, orders_list ARRAY(ROW(order_id BIGINT#orders, lineitems ARRAY(ROW(item VARCHAR, category VARCHAR, price DOUBLE)))))
    ),
    orders_list := products_source[orders_list],
    customer_id := BIND_VALUES(ARRAY[1,2]),
    orders_list.transform(`SELECT array_merge(ZIP(REPEAT(order_id, cardinality(lineitems)) as order_id), lineitems)`).unwrap().flatten_func() as lineitems_unnested,
    UNNEST(array_merge(ZIP(REPEAT(customer_id, cardinality(lineitems_unnested)) as customer_id), lineitems_unnested)) as final_unnested_row,

    customer_id_f := final_unnested_row[customer_id],
    order_id_f := final_unnested_row[order_id],
    item := final_unnested_row[item],
    category := final_unnested_row[category],
    price := final_unnested_row[price],
SELECT
    -- Pipeline recap: products_source is an inline VALUES table (customers 1 and 2) whose
    -- orders_list column nests orders and their line items. order_id, then customer_id, is
    -- repeated onto every line item and the result is unnested into final_unnested_row, whose
    -- fields are exposed as customer_id_f, order_id_f, item, category and price.
    -- The active projection returns the unnested line-item fields plus num_orders, i.e.
    -- CARDINALITY(orders_list) attached through ADD_FIELDS by binding cid to customer_id.
    -- NOTE(review): the inline data holds 7 line items, yet the saved output shows 10 rows with
    -- repeated (customer, order, item) triples — confirm whether these duplicates are expected.
    -- The commented-out lines are alternative formulations kept for reference.
    -- CUSTOMER_ID_F, SUM(PRICE) group by CUSTOMER_ID_F
    CUSTOMER_ID_F, order_id_f, item, num_orders := ADD_FIELDS(CARDINALITY(orders_list) as num_orders TO ROW(customer_id_f as cid) BINDING FIELDS cid WITH customer_id)[num_orders]
    -- CUSTOMER_ID_F, order_id_f, item, num_orders := EXTEND(ROW(customer_id_f as cid) WITH CARDINALITY(orders_list) as num_orders VIA cid)[num_orders],
    -- CUSTOMER_ID_F, order_id_f, item, num_orders := RELATED(CARDINALITY(orders_list) VIA customer_id_f),
    -- customer_id, ADD_FIELDS(SUM(PRICE) group by CUSTOMER_ID_F as sum_orders TO ROW(customer_id as cid) BINDING FIELDS cid WITH CUSTOMER_ID_F)  -- does not work

Unnamed: 0,CUSTOMER_ID_F,ORDER_ID_F,ITEM,NUM_ORDERS
0,1,10,item02,1
1,2,12,item12,2
2,1,10,item01,1
3,2,12,item11,2
4,1,10,item02,1
5,2,12,item03,2
6,1,10,item01,1
7,2,11,item02,2
8,2,11,item01,2
9,2,12,item12,2


## Execute on 4+1 different backends with local client

In [10]:
# Send one FeatureQL statement through every offline client, then through the
# online client, printing the generated SQL and displaying the resulting frame.
query = """
    WITH 
        INPUT(BIGINT) AS INPUT1, 
        INPUT(BIGINT) AS INPUT2
    SELECT 
        BIND_TABLE(ARRAY[ROW(2,3), ROW(2,4), ROW(2,5)]) AS (INPUT1, INPUT2), 
        INPUT1 + INPUT2 AS RESULT
"""

for client in [client_duckdb, client_trino, client_bigquery, client_datafusion]:
    display(HTML(f"<h3>Execute query with {client.backend}</h3>"))
    try:
        response = client.query(query)
        print(response.sql)
        display(response.dataframe)
    except Exception as e:
        # Best effort: report the failure and carry on with the next backend.
        print(f'A problem happened {e=}')

display(HTML("<h3>Execute query online</h3>"))
try:
    response = client_online.query(query)
    print(response.sql)
    display(response.dataframe)
except Exception as e:
    # Catch Exception (not a bare except) so that KeyboardInterrupt/SystemExit
    # still propagate, and report the cause just as the offline loop does.
    print(f'A problem happened {e=}')

None


None

None


None

None


None

None


None

None


None