# SciKit vs Snowflake ML

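A pre-processing performance comparison: the same ordinal-encoding and min-max-scaling steps are timed with scikit-learn on a pandas DataFrame and with Snowflake ML on a Snowpark DataFrame.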

## Setup

In [9]:
import random, string
import pandas as pd
import time, math
from IPython.display import display
from sklearn.datasets import make_regression
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Build the Snowpark session from the login options identified by "sanju",
# then select the warehouse, database and schema for this session.
session = (
    Session.builder
    .configs(SnowflakeLoginOptions("sanju"))
    .create()
)
session.use_warehouse("ADHOC_WH")
session.use_database("DATA_ALCHEMIST")
session.use_schema("CORTEX")



Data Generation

In [None]:
# Row-count presets. The sizes noted next to them are presumably the pandas
# in-memory footprint printed at the bottom of this cell -- confirm when
# switching presets.
# Size: 0.10 GB
ONE_MN = 1_000_000
# Size: 1.0 GB
TEN_MN = 10_000_000
# Size: 10.98 GB
HUNDRED_MN = 100_000_000

# Two numeric features; random_state pins them so every run yields the same values.
features, _ = make_regression(n_samples=TEN_MN, n_features=2, noise=0.1, random_state=0)
n_rows = features.shape[0]

# Two categorical features made of random two-letter uppercase codes.
# A dedicated, seeded generator is used (instead of the unseeded global
# `random` module) so the categorical columns are reproducible too and the
# global random state is left untouched.
cat_rng = random.Random(0)
cat_features = {
    col: ["".join(cat_rng.choices(string.ascii_uppercase, k=2))
          for _ in range(n_rows)]
    for col in ["C1", "C2"]
}

X = pd.DataFrame(features, columns=["N1", "N2"]).assign(**cat_features)
# Report the deep (string contents included) memory footprint in GiB.
print(X.memory_usage(deep=True).sum() / (1024**3), "GB")

1.0989607609808445 GB


Write data to Snowflake table

In [14]:
# Turn the pandas frame into a Snowpark DataFrame, save it as the
# REGRESSION_DATASET table (overwrite mode), and print a preview.
df = session.create_dataframe(X)
(
    df.write
    .mode("overwrite")
    .save_as_table("REGRESSION_DATASET")
)
df.show()

------------------------------------------------------------
|"N1"                 |"N2"                  |"C1"  |"C2"  |
------------------------------------------------------------
|-0.1610354267346038  |-0.11352353535248658  |HL    |NZ    |
|0.7008022206416491   |-1.0317932437794488   |OC    |MV    |
|-0.3657678917190336  |-0.12838701413357242  |OT    |LY    |
|1.05510945502436     |1.510300053501725     |DX    |KB    |
|0.4859850819476572   |-1.561051070692365    |KQ    |FN    |
|1.6046941346780645   |-0.5767635469431348   |GA    |LE    |
|1.1944185899921502   |-0.9244430746602714   |XV    |BP    |
|0.341584518980672    |0.27194306628987974   |HP    |YX    |
|-0.0420072830840601  |-0.7003844447530871   |TH    |EG    |
|1.305520348980537    |0.06788646665668029   |QG    |SO    |
------------------------------------------------------------



## SciKit

In [None]:
# Pull the whole table into a local pandas DataFrame. This download happens
# before the timer starts, so it is not part of the measurement.
X = session.table("REGRESSION_DATASET").to_pandas()

# ========================================================================
# Runs in 5 secs, 10 MM rows w/ XSmall WH
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Step 1: map each categorical value to an ordinal code.
X[["C1O", "C2O"]] = OrdinalEncoder().fit_transform(X[["C1", "C2"]])
# Step 2: min-max scale the raw numerics together with the ordinal codes.
X[["N1FO", "N2FO", "C1FO", "C2FO"]] = MinMaxScaler().fit_transform(
    X[["N1", "N2", "C1O", "C2O"]])

# Whole seconds only; the fractional part is dropped, not rounded.
total_time = math.trunc(time.perf_counter() - start_time)
print(f"{total_time} seconds")
display(X)


5 seconds


Unnamed: 0,N1,N2,C1,C2,C1O,C2O,N1FO,N2FO,C1FO,C2FO
0,0.745417,-1.070801,CZ,YC,77.0,626.0,0.585036,0.382994,0.114074,0.927407
1,-1.159729,0.707654,PB,KB,391.0,261.0,0.407116,0.548089,0.579259,0.386667
2,0.109150,1.101181,BN,SO,39.0,482.0,0.525616,0.584620,0.057778,0.714074
3,2.575776,-1.284722,VW,HJ,568.0,191.0,0.755973,0.363136,0.841481,0.282963
4,1.541702,-0.388306,GX,RK,179.0,452.0,0.659401,0.446351,0.265185,0.669630
...,...,...,...,...,...,...,...,...,...,...
9999995,-0.472871,-1.971240,HY,NY,206.0,362.0,0.471261,0.299406,0.305185,0.536296
9999996,0.655903,0.370141,GT,XD,175.0,601.0,0.576677,0.516758,0.259259,0.890370
9999997,0.409867,1.010092,UJ,AA,529.0,0.0,0.553700,0.576164,0.783704,0.000000
9999998,-0.976495,-1.732982,ZA,RC,650.0,444.0,0.424228,0.321524,0.962963,0.657778


## Snowflake ML

In [None]:
# Import the Snowflake ML transformers under Snow* aliases so they do not
# shadow the scikit-learn OrdinalEncoder / MinMaxScaler imported in the setup
# cell. Without the aliases, re-running the SciKit cell after this one would
# silently use the Snowflake ML classes instead of scikit-learn's.
from snowflake.ml.modeling.preprocessing import (
    MinMaxScaler as SnowMinMaxScaler,
    OrdinalEncoder as SnowOrdinalEncoder,
)
from snowflake.ml.modeling.pipeline import Pipeline

# Set the session's query tag before running the benchmark queries.
session.query_tag = "transformers-new"
df = session.table("REGRESSION_DATASET")

# ========================================================================
# Runs in 20 secs, 10 MM rows w/ XSmall WH. The dataset is not large enough to benefit from Snowflake's distributed compute
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Same two steps as the scikit-learn cell: ordinal-encode the categoricals,
# then min-max scale the numerics together with the encoded columns.
pipe = Pipeline(steps=[
    ("encoder", SnowOrdinalEncoder(
        input_cols=["C1", "C2"],
        output_cols=["C1O", "C2O"])),
    ("scaler", SnowMinMaxScaler(
        input_cols=["N1", "N2", "C1O", "C2O"],
        output_cols=["N1FO", "N2FO", "C1FO", "C2FO"]))])
pipe.fit(df)
# NOTE(review): Snowpark DataFrames are evaluated lazily, so the transform may
# not be fully executed until df.show() below, which is outside the timed
# region -- confirm the timing covers the work you intend to compare.
df = pipe.transform(df)

# Whole seconds only; the fractional part is dropped, not rounded.
total_time = math.trunc(time.perf_counter() - start_time)
print(f"{total_time} seconds")
df.show()



20 seconds
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"N1FO"               |"N2FO"               |"C1FO"                |"C2FO"               |"C1O"  |"C2O"  |"N1"                 |"N2"                  |"C1"  |"C2"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0.49784626928976883  |0.4075627655244988   |0.9703703703703703    |0.46074074074074073  |655.0  |311.0  |-0.1882010395200003  |-0.8061422304663725   |ZF    |LZ    |
|0.40552547557475926  |0.4420339251847886   |0.9555555555555555    |0.1925925925925926   |645.0  |130.0  |-1.1767572358692975  |-0.43480722189945925  |YV    |FA    |
|0.41718187087774633  |0.5962268761940934   |0.8592592592592592    |0.23555555555555555  |580.0  |159.0  |-1.0519424315313308  |1.2262114774498765    |WI    |G