# SciKit vs Snowflake ML

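Model-training performance comparison: time to fit an XGBoost classifier locally (scikit-learn API, pandas data) versus with Snowflake ML (Snowpark data).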

## Setup

In [None]:
import random, string
import pandas as pd
import time, math
from sklearn.datasets import make_classification

from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Build the Snowpark session from the "sanju" login options (presumably a
# named connection profile -- confirm it exists on the machine running this),
# then pin the warehouse, database and schema used by every later cell.
login_options = SnowflakeLoginOptions("sanju")
session = Session.builder.configs(login_options).create()

session.use_warehouse("ADHOC_WH")
session.use_database("DATA_ALCHEMIST")
session.use_schema("CORTEX")



### Data Generation

In [5]:
# Candidate dataset sizes (row counts). The size noted beside each is the
# in-memory footprint of the six-column feature frame, as printed at the end
# of this cell.
ONE_MN = 1_000_000        # size: 0.04 GB
TEN_MN = 10_000_000       # size: 0.44 GB
HUNDRED_MN = 100_000_000  # size: 4.47 (unit missing in the original note; presumably GB)

# Feature column labels: X1 .. X6.
feature_names = [f"X{i}" for i in range(1, 7)]

# Generate a synthetic classification dataset; random_state fixes the data so
# repeated runs produce the same rows.
X, y = make_classification(
    n_samples=TEN_MN,
    n_features=6,
    n_informative=4,
    n_redundant=1,
    random_state=0,
    shuffle=True,
)

# Wrap the raw arrays in labelled frames and glue features + label together;
# `pdf` is what gets uploaded to Snowflake in the next cell.
X = pd.DataFrame(X, columns=feature_names)
y = pd.DataFrame(y, columns=["Y"])
pdf = pd.concat([X, y], axis=1)

# Report the feature frame's memory footprint in GiB (label column excluded).
bytes_used = X.memory_usage(deep=True).sum()
print(bytes_used / (1024**3), "GB")

0.44703495875000954 GB


### Write data to Snowflake table

In [7]:
# Turn the generated pandas frame into a Snowpark DataFrame and persist it,
# replacing any existing CLASSIFICATION_DATASET table.
df = session.create_dataframe(pdf)
table_writer = df.write.mode("overwrite")
table_writer.save_as_table("CLASSIFICATION_DATASET")

# Print a preview of the rows as a sanity check on the upload.
df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------
|"X1"                  |"X2"                  |"X3"                 |"X4"                  |"X5"                   |"X6"                  |"Y"  |
-------------------------------------------------------------------------------------------------------------------------------------------------
|0.5899129227145623    |-0.20009392338453252  |-1.1199912533109633  |-0.543343483520465    |-0.007069701452958155  |1.1015018678084316    |1    |
|-0.15020340877272498  |1.4493874853246336    |1.0226614372265614   |-0.34165974367123275  |0.7816172388344352     |0.17615008194516846   |1    |
|0.40113266747430476   |0.11880749460345463   |-1.4879795421927695  |0.04405706515417407   |-1.4963080012633612    |2.2300562361511806    |1    |
|0.08489148379494038   |-1.8155771969299426   |2.5927323817045487   |0.47690806433115174   |0.2954813260809698     |1.113196

## SciKit

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Pull the whole table into a local pandas DataFrame; training below runs on
# the client with the open-source XGBoost classifier.
df = session.table("CLASSIFICATION_DATASET").to_pandas()

# Select by label so a missing or renamed column raises KeyError. The previous
# pd.DataFrame(df, columns=[...]) form silently creates an all-NaN column for
# any label that is not present.
X = df[["X1", "X2", "X3", "X4", "X5", "X6"]]
y = df[["Y"]]

# Keep 90% of the rows for training. random_state pins the split so repeated
# runs time the same rows (the Snowflake ML cell also uses a fixed seed of 0).
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=0)

# ========================================================================
# 29 secs for 10M (X-Small)
# Only model fitting is timed; loading the table and splitting are excluded.
# perf_counter() is monotonic and high-resolution, so the measurement cannot
# be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

clf = XGBClassifier()
clf.fit(X_train, y_train)

# Report whole elapsed seconds (fraction truncated, not rounded).
total_time = math.trunc(time.perf_counter() - start_time)
print(f"{total_time} seconds")


29 seconds


## Snowflake ML

In [None]:
from snowflake.ml.modeling.xgboost import XGBClassifier

# Reference the table as a Snowpark DataFrame. Unlike the local benchmark
# there is no to_pandas() call, so rows are not pulled to the client here.
df = session.table("CLASSIFICATION_DATASET")

# 90/10 split with a fixed seed; only the 90% part is used for training.
train_data, _ = df.random_split(weights=[0.9, 0.1], seed=0)

# ========================================================================
# 85 secs for 10M (X-Small)
# The timed section covers estimator construction and fit().
# perf_counter() is monotonic and high-resolution, so the measurement cannot
# be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
clf = XGBClassifier(
    input_cols=["X1", "X2", "X3", "X4", "X5", "X6"],
    label_cols=["Y"],
    output_cols=["PREDICTIONS"],
)
clf.fit(train_data)

# Report whole elapsed seconds (fraction truncated, not rounded).
total_time = math.trunc(time.perf_counter() - start_time)
print(f"{total_time} seconds")