In [1]:
# Reference: distributed hyperparameter optimization with Snowpark ML
# https://docs.snowflake.com/en/developer-guide/snowpark-ml/snowpark-ml-modeling#distributed-hyperparameter-optimization

from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Read the login options stored under "test_conn", then override the
# warehouse so the search below runs on the "large" one.
pars = SnowflakeLoginOptions("test_conn")
pars.update(warehouse="large")

# Open the Snowpark session and tag its queries so they are easy to find
# in the query history.
session = Session.builder.configs(pars).create()
session.query_tag = "classifier-hpo"

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


In [2]:
import pandas as pd
from sklearn.datasets import make_classification

# Synthetic binary-classification data: 1,000,000 rows, 6 features of which
# 2 are informative. Timing note kept from the original author for 1M
# samples: 2m15s on LARGE vs 4m5s on XSMALL (presumably the grid search
# run -- TODO confirm what was timed).
X, y = make_classification(
    n_samples=1000000,
    n_features=6,
    n_informative=2,
    n_redundant=0,
    random_state=0,
    shuffle=True,
)

# Wrap the raw arrays in labelled pandas frames: X1..X6 for features, Y for the label.
X = pd.DataFrame(X, columns=[f"X{i}" for i in range(1, 7)])
y = pd.DataFrame(y, columns=["Y"])

# Join features and label side by side, hand the result to the session to
# build the Snowpark DataFrame, and preview it.
df = session.create_dataframe(pd.concat([X, y], axis=1))
df.show()

----------------------------------------------------------------------------------------------------------------------------------------------
|"X1"                 |"X2"                  |"X3"                 |"X4"                  |"X5"                 |"X6"                  |"Y"  |
----------------------------------------------------------------------------------------------------------------------------------------------
|-1.4243890323508717  |-0.05520220724696402  |-0.9702604834090749  |0.9460507691411167    |-1.6786647910060337  |0.8621464464088906    |1    |
|3.9638449599299856   |1.496715391172496     |-1.1731766541464825  |-2.213897324527216    |-2.016722848401358   |0.852839138511081     |0    |
|-0.7233404104035888  |-0.8302200800755797   |-0.8603016078760195  |-1.1139846901829347   |1.6108394747169368   |0.7645601354678879    |0    |
|-0.7238329090343465  |-0.6040304500869302   |0.987029296766597    |0.4273923686374005    |0.06354824512344817  |-0.07859504911618904  |1    |

In [3]:
from snowflake.ml.modeling.model_selection.grid_search_cv import GridSearchCV
from snowflake.ml.modeling.xgboost import XGBClassifier

# Search space: 2 values of n_estimators x 3 learning rates = 6 candidates.
param_grid = {
    "n_estimators": [10, 50],
    "learning_rate": [0.01, 0.1, 0.2],
}

# Grid search over an XGBoost classifier with cv=5; the six X columns are the
# inputs, Y is the label, and predictions are written to PREDICTIONS.
model = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=4,
    input_cols=["X1", "X2", "X3", "X4", "X5", "X6"],
    label_cols=["Y"],
    output_cols=["PREDICTIONS"],
)
model.fit(df)



  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


<snowflake.ml.modeling.model_selection.grid_search_cv.GridSearchCV at 0x1feb8bf0430>

In [4]:
# Run the fitted search on the same frame it was trained on (so this is not a
# held-out evaluation) and preview only the PREDICTIONS column.
preds = model.predict(df)
preds.select("PREDICTIONS").show()



-----------------
|"PREDICTIONS"  |
-----------------
|0.0            |
|0.0            |
|0.0            |
|1.0            |
|0.0            |
|1.0            |
|1.0            |
|1.0            |
|1.0            |
|0.0            |
-----------------



In [5]:
# Convert the fitted search via to_sklearn() and print the best_score_
# attribute of the resulting object.
skl = model.to_sklearn()
best_score = skl.best_score_
print(best_score)

0.9282779999999999
