In [1]:
import coiled

In [2]:
# Spin up the Coiled cluster that the rest of the notebook runs on.
cluster = coiled.Cluster(
    name="coiled-snowflake",
    software="coiled-examples/snowflake",
    n_workers=20,
    # The cluster is deliberately NOT shut down when this object is closed, so
    # it keeps running (and billing) after the kernel exits.
    # NOTE(review): presumably the scheduler idle timeout below is what
    # eventually stops it -- confirm.
    shutdown_on_close=False,
    scheduler_options={"idle_timeout": "2 hours"},
    # Pass a real boolean: the string 'True' only works by accident of
    # truthiness (the string 'False' would be truthy as well).
    backend_options={"spot": True},
)

In [3]:
# Attach a Dask client to the Coiled cluster created above. The trailing
# expression makes the cell display the dashboard URL.
from dask.distributed import Client

client = Client(address=cluster)
client.dashboard_link


+---------+--------+-----------+---------+
| Package | client | scheduler | workers |
+---------+--------+-----------+---------+
| pandas  | 1.3.4  | 1.3.5     | 1.3.5   |
+---------+--------+-----------+---------+


'http://44.199.253.100:8787'

In [4]:
import os
import snowflake.connector

In [5]:
# Credentials must never be committed in a notebook. Read them from the
# environment and prompt for anything that is missing; the password prompt
# does not echo. Any password that was previously hardcoded in this cell must
# be treated as leaked and rotated.
import getpass

# Non-secret default; export SNOWFLAKE_WAREHOUSE before starting Jupyter to override.
os.environ.setdefault("SNOWFLAKE_WAREHOUSE", "dask_snowflake_wh")

for _name, _ask in (
    ("SNOWFLAKE_USER", input),
    ("SNOWFLAKE_ACCOUNT", input),
    ("SNOWFLAKE_PASSWORD", getpass.getpass),
):
    # Only prompt when the variable is absent or empty, so a pre-configured
    # environment runs non-interactively.
    if not os.environ.get(_name):
        os.environ[_name] = _ask(f"{_name}: ")

In [6]:
# Open a direct Snowflake connector session from the credentials held in the
# environment (user, password, account).
# NOTE(review): no cell visible in this notebook uses `ctx` afterwards and it
# is never closed -- confirm whether this connection is still needed.
ctx = snowflake.connector.connect(
    **{
        field: os.environ[f"SNOWFLAKE_{field.upper()}"]
        for field in ("user", "password", "account")
    }
)

In [7]:
# Fully qualified database.schema prefix interpolated into every table name below.
SCHEMA = "SNOWFLAKE_SAMPLE_DATA.TPCH_SF100"

# Per-customer aggregate: joins CUSTOMER -> ORDERS -> LINEITEM -> PART -> PARTSUPP,
# keeps rows with PS_SUPPLYCOST > 11, and groups by customer key and name to
# produce sum_qty, sum_avail_qty and max_retail_price.
# NOTE(review): PARTSUPP is joined on the part key only. If PARTSUPP holds
# several rows per part, each line item is repeated once per matching row and
# SUM(L_QUANTITY) is inflated accordingly -- confirm this fan-out is intended.
example_query=f"""
SELECT
    
    C_CUSTKEY,
    C_NAME,
    SUM(L_QUANTITY) AS sum_qty,
    SUM(PS_AVAILQTY) AS sum_avail_qty,
    MAX(P_RETAILPRICE) AS max_retail_price
    
    FROM {SCHEMA}.CUSTOMER
    
        JOIN {SCHEMA}.ORDERS
            ON C_CUSTKEY = O_CUSTKEY
            
            JOIN {SCHEMA}.LINEITEM
                ON L_ORDERKEY = O_ORDERKEY
                
                JOIN {SCHEMA}.PART
                    ON P_PARTKEY = L_PARTKEY
                    
                    JOIN {SCHEMA}.PARTSUPP
                        ON P_PARTKEY = PS_PARTKEY
    
    WHERE PS_SUPPLYCOST > 11

GROUP BY C_CUSTKEY, C_NAME
"""

In [8]:
# Connection settings handed to read_snowflake: one entry per SNOWFLAKE_*
# environment variable, keyed by the lower-case suffix.
connection_kwargs = {
    key: os.environ[f"SNOWFLAKE_{key.upper()}"]
    for key in ("user", "password", "account", "warehouse")
}

In [9]:
from dask_snowflake import read_snowflake

In [10]:
%%time
ddf = read_snowflake(
    query=example_query,
    connection_kwargs=connection_kwargs,
)

CPU times: user 639 ms, sys: 31.6 ms, total: 671 ms
Wall time: 4min 1s


In [11]:
# Peek at the first five rows of the query result.
ddf.head(n=5)

Unnamed: 0,C_CUSTKEY,C_NAME,SUM_QTY,SUM_AVAIL_QTY,MAX_RETAIL_PRICE
0,5198456,Customer#005198456,2760.0,555933,2028.21
1,6471499,Customer#006471499,7537.0,1404030,2033.51
2,6732655,Customer#006732655,7221.0,1263469,2053.13
3,12159119,Customer#012159119,3740.0,696425,2080.68
4,5038030,Customer#005038030,8988.0,1712961,1966.08


In [12]:
# Per-column memory footprint in bytes (index included, shallow measurement).
ddf.memory_usage(index=True, deep=False).compute()

C_CUSTKEY           39999328
C_NAME              79998656
Index                  11264
MAX_RETAIL_PRICE    79998656
SUM_AVAIL_QTY       79998656
SUM_QTY             79998656
dtype: int64

This is roughly 360 MB in total. Seems to me a wall time of 4 min 1 s is a long time for that to load.

## Machine Learning

In [13]:
import xgboost as xgb

In [14]:
# Two aggregate columns serve as features and SUM_QTY is the regression
# target; both are wrapped in a DaskDMatrix for distributed XGBoost.
X = ddf[["SUM_AVAIL_QTY", "MAX_RETAIL_PRICE"]]
y = ddf["SUM_QTY"]
dtrain = xgb.dask.DaskDMatrix(client, data=X, label=y)

In [15]:
%%time
output = xgb.dask.train(
    client,
    {
        'verbosity': 2,
        'tree_method': 'hist',
        'objective': 'reg:squarederror'
    },
    dtrain,
    num_boost_round=10,
    evals=[(dtrain, 'train')]
)

CPU times: user 89.9 ms, sys: 8.67 ms, total: 98.6 ms
Wall time: 2.07 s


In [16]:
# Score the feature frame with the trained booster. This predicts on the same
# rows the model was trained on.
y_pred = xgb.dask.predict(client, model=output["booster"], data=X)

In [17]:
# Materialize the predictions on the client and display them as the cell output.
y_pred.compute()

0         2765.138184
1         6957.069336
2         6278.103516
3         3472.858887
4         8494.183594
             ...     
256833     803.115723
256834     803.115723
256835     803.115723
256836     893.050781
256837    1177.331055
Name: 0, Length: 9999832, dtype: float32