Connect to Snowflake

In [1]:
from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Resolve the login options first, then open the Snowpark session with them.
# NOTE(review): "test_conn" is presumably a named connection profile in the
# local Snowflake config -- confirm before running elsewhere.
connection_options = SnowflakeLoginOptions("test_conn")
session = Session.builder.configs(connection_options).create()

# Sanity check: print the warehouse / database / schema the session landed in.
context_rows = session.sql('select current_warehouse(), current_database(), current_schema()').collect()
print(context_rows)

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='TEST', CURRENT_SCHEMA()='PUBLIC')]


Load the Gamma Telescope data from the Telescope table and split it into train/test datasets (90/10)

In [2]:
# The SQL adds a numeric LABEL column: 1.0 where CLASS is 'g', 0.0 otherwise.
query = "SELECT *, IFF(CLASS='g', 1.0, 0.0) AS LABEL FROM Telescope"
labelled_rows = session.sql(query)

# CLASS is dropped once LABEL has been derived from it.
df = labelled_rows.drop("CLASS")

# 90% / 10% random split with a fixed seed; the first part is used for
# training and the second for evaluation.
split_parts = df.random_split(weights=[0.9, 0.1], seed=0)
train_data, test_data = split_parts

# Print a preview of the labelled data.
df.show()

-------------------------------------------------------------------------------------------------------------------------------------
|"F_LENGTH"  |"F_WIDTH"  |"F_SIZE"  |"F_CONC"  |"F_CONC1"  |"F_ASYM"  |"F_M3_LONG"  |"F_M3_TRANS"  |"F_ALPHA"  |"F_DIST"  |"LABEL"  |
-------------------------------------------------------------------------------------------------------------------------------------
|28.7967     |16.0021    |2.6449    |0.3918    |0.1982     |27.7004   |22.0110      |-8.2027       |40.0920    |81.8828   |1        |
|31.6036     |11.7235    |2.5185    |0.5303    |0.3773     |26.2722   |23.8238      |-9.9574       |6.3609     |205.2610  |1        |
|162.0520    |136.0310   |4.0612    |0.0374    |0.0187     |116.7410  |-64.8580     |-45.2160      |76.9600    |256.7880  |1        |
|23.8172     |9.5728     |2.3385    |0.6147    |0.3922     |27.2107   |-6.4633      |-7.1513       |10.4490    |116.7370  |1        |
|75.1362     |30.9205    |3.1611    |0.3168    |0.1832     |-5

In [3]:
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.pipeline import Pipeline

# Feature columns: every column of the training frame except the target.
COLS = [name for name in train_data.columns if name != "LABEL"]

# Pipeline stages, applied in order. The imputer and scaler write their
# results back into the same column names they read from.
pipeline_steps = [
    ("imputer", SimpleImputer(input_cols=COLS, output_cols=COLS)),
    ("scaler", StandardScaler(input_cols=COLS, output_cols=COLS)),
    ("clf", XGBClassifier(input_cols=COLS, label_cols=["LABEL"])),
]
model = Pipeline(steps=pipeline_steps)

# Kept as the cell's last expression so the fitted pipeline is displayed.
model.fit(train_data)



  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



<snowflake.ml.modeling.pipeline.pipeline.Pipeline at 0x19fc42a7be0>

In [4]:
from snowflake.ml.modeling.metrics import accuracy_score

def _label_accuracy(predictions):
    """Return accuracy_score comparing LABEL against OUTPUT_LABEL in `predictions`."""
    return accuracy_score(
        df=predictions,
        y_true_col_names=["LABEL"],
        y_pred_col_names=["OUTPUT_LABEL"],
    )


# Score the fitted pipeline on the data it was trained on.
train_data_pred = model.predict(train_data)
training_accuracy = _label_accuracy(train_data_pred)
print(f"Training accuracy: {training_accuracy}")

# Score it on the held-out split.
test_data_pred = model.predict(test_data)
eval_accuracy = _label_accuracy(test_data_pred)
print(f"Eval accuracy: {eval_accuracy}")



Training accuracy: 0.962837




Eval accuracy: 0.878279
