In [1]:
# see https://docs.snowflake.com/en/developer-guide/snowpark-ml/snowpark-ml-modeling#distributed-hyperparameter-optimization

# connect to your Snowflake account
from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Resolve login parameters for the connection entry called "test_conn".
# NOTE(review): presumably this reads a local connection config file - confirm where
# "test_conn" is defined. The recorded output of this cell warns that
# SnowflakeLoginOptions() is in private preview and not meant for production use.
pars = SnowflakeLoginOptions("test_conn")

# Scaling experiment knob (left disabled): override the warehouse before the session
# is created, e.g. XSmall first and then LARGE, when going from 10K to 1M samples.
# pars["warehouse"] = ...

# Open the Snowpark session that every later cell uses.
session = (
    Session.builder
    .configs(pars)
    .create()
)

# Tag all queries issued through this session with this label.
session.query_tag = "1-hpo-scale"

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


In [2]:
import pandas as pd
from sklearn.datasets import make_classification

# Number of synthetic rows to generate.
# Author's timing note - also try w/ 1M samples: 2m15s on LARGE vs 4m5s on XSMALL.
N_SAMPLES = 10000

# Synthetic classification problem: 6 features, of which 2 are informative and none
# redundant. random_state=0 pins the generator so the data is the same on every run.
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=6,
    n_informative=2,
    n_redundant=0,
    random_state=0,
    shuffle=True,
)

# Re-wrap the generated arrays as labelled pandas frames; the column names defined
# here are the ones the grid-search cell refers to as input/label columns.
X = pd.DataFrame(X, columns=["X1", "X2", "X3", "X4", "X5", "X6"])
y = pd.DataFrame(y, columns=["Y"])

# Put features and label side by side (axis=1) and hand the combined pandas frame to
# the session, so that `df` is the Snowpark DataFrame used by the following cells.
df = session.create_dataframe(pd.concat([X, y], axis=1))

# Print a preview of the resulting rows.
df.show()

---------------------------------------------------------------------------------------------------------------------------------------------
|"X1"                 |"X2"                 |"X3"                  |"X4"                 |"X5"                  |"X6"                 |"Y"  |
---------------------------------------------------------------------------------------------------------------------------------------------
|-0.8419764862142489  |-0.8236131319464177  |0.9075940977271297    |-2.237446137540834   |0.41812713757991454   |0.14206445780823243  |0    |
|0.2495179452165687   |-1.0261383115544833  |1.159807100525451     |-1.0200756637883104  |-0.1421750090153774   |0.3737017851614627   |0    |
|-0.7159294113018737  |0.8144495926630666   |0.8071326534973498    |0.1831886415975208   |0.4489324382592919    |-2.0545884582594405  |1    |
|-0.8336672327523885  |1.267946398948371    |-0.24525647900063868  |1.7112130463794246   |-0.1391163247361221   |-0.9085732375134297  |1    |
|0.148

In [3]:
from snowflake.ml.modeling.model_selection.grid_search_cv import GridSearchCV
from snowflake.ml.modeling.xgboost import XGBClassifier

# Grid search over an XGBoost classifier: 2 values of n_estimators x 3 learning rates
# = 6 candidate parameter combinations.
# NOTE(review): per the documentation linked at the top of the notebook, the search is
# expected to be distributed by Snowpark ML - confirm for the installed version.
model = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid={
        "n_estimators": [10, 50],
        "learning_rate": [0.01, 0.1, 0.2],
    },
    # Feature columns, label column and the name of the column that will hold predictions.
    input_cols=["X1", "X2", "X3", "X4", "X5", "X6"],
    label_cols=["Y"],
    output_cols=["PREDICTIONS"],
)

# Fit on the Snowpark DataFrame. As the last expression of the cell, the returned
# GridSearchCV object is echoed as the cell output.
model.fit(df)



  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


<snowflake.ml.modeling.model_selection.grid_search_cv.GridSearchCV at 0x1cefef95040>

In [4]:
# Score the very same frame the search was fitted on, so these are predictions on the
# training data, not on a held-out set.
preds = model.predict(df)

# Preview only the prediction column (named via output_cols when the search was built).
preds.select(["PREDICTIONS"]).show()



-----------------
|"PREDICTIONS"  |
-----------------
|0.0            |
|0.0            |
|1.0            |
|1.0            |
|0.0            |
|1.0            |
|0.0            |
|1.0            |
|0.0            |
|1.0            |
-----------------



In [5]:
# Convert the fitted Snowpark ML wrapper into a local object via to_sklearn().
# NOTE(review): presumably this is a native scikit-learn GridSearchCV - confirm.
skl = model.to_sklearn()
# Print the best score found by the search.
# NOTE(review): in scikit-learn, best_score_ is the mean cross-validated score of the
# best parameter combination - confirm the same holds for the converted object.
print(skl.best_score_)

0.9099999999999999
