In [1]:
# see https://github.com/Snowflake-Labs/sfguide-intro-to-machine-learning-with-snowpark-ml-for-python/blob/main/2_snowpark_ml_feature_transformations.ipynb

import numpy as np
from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from snowflake.snowpark.types import DecimalType
from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Start from the login options stored under the "test_conn" connection name,
# then override the target database and schema for this notebook.
pars = SnowflakeLoginOptions("test_conn")
pars.update({"database": "test", "schema": "diamonds"})

# Open the Snowpark session with those settings.
session = (
    Session.builder
    .configs(pars)
    .create()
)

# Reference the cleaned diamonds table and preview its first rows.
df = session.table('diamonds_clean')
df.show()

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


----------------------------------------------------------------------------------------------------
|"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"CARAT"  |"X"   |"Y"   |"Z"   |"DEPTH"  |"TABLE_PCT"  |
----------------------------------------------------------------------------------------------------
|E        |SI2        |326      |IDEAL      |0.23     |3.95  |3.98  |2.43  |61.5     |55.0         |
|E        |SI1        |326      |PREMIUM    |0.21     |3.89  |3.84  |2.31  |59.8     |61.0         |
|E        |VS1        |327      |GOOD       |0.23     |4.05  |4.07  |2.31  |56.9     |65.0         |
|I        |VS2        |334      |PREMIUM    |0.29     |4.2   |4.23  |2.63  |62.4     |58.0         |
|J        |SI2        |335      |GOOD       |0.31     |4.34  |4.35  |2.75  |63.3     |58.0         |
|J        |VVS2       |336      |VERY_GOOD  |0.24     |3.94  |3.96  |2.48  |62.8     |57.0         |
|I        |VVS1       |336      |VERY_GOOD  |0.24     |3.95  |3.98  |2.47  |62.3     |57.0 

In [2]:
# Min-max scale CARAT into a new CARAT_NORM column.
pipe = MinMaxScaler(
    input_cols=["CARAT"],
    output_cols=["CARAT_NORM"],
)
fitted_scaler = pipe.fit(df)
df = fitted_scaler.transform(df)

# Cast the scaled values to DECIMAL(7, 6) so they display with six decimals.
carat_norm = df.col("CARAT_NORM").cast(DecimalType(7, 6))
df = df.with_column("CARAT_NORM", carat_norm)
df.show()

-------------------------------------------------------------------------------------------------------------------
|"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"CARAT"  |"X"   |"Y"   |"Z"   |"DEPTH"  |"TABLE_PCT"  |"CARAT_NORM"  |
-------------------------------------------------------------------------------------------------------------------
|E        |SI2        |326      |IDEAL      |0.23     |3.95  |3.98  |2.43  |61.5     |55.0         |0.006237      |
|E        |SI1        |326      |PREMIUM    |0.21     |3.89  |3.84  |2.31  |59.8     |61.0         |0.002079      |
|E        |VS1        |327      |GOOD       |0.23     |4.05  |4.07  |2.31  |56.9     |65.0         |0.006237      |
|I        |VS2        |334      |PREMIUM    |0.29     |4.2   |4.23  |2.63  |62.4     |58.0         |0.018711      |
|J        |SI2        |335      |GOOD       |0.31     |4.34  |4.35  |2.75  |63.3     |58.0         |0.022869      |
|J        |VVS2       |336      |VERY_GOOD  |0.24     |3.94  |3.96  |2.4

In [3]:
# Explicit category orderings: the position of each value in its array is the
# ordinal index the encoder assigns to it (see the printed state table).
cut_order = np.array(["IDEAL", "PREMIUM", "VERY_GOOD", "GOOD", "FAIR"])
clarity_order = np.array(
    ["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]
)
categories = {"CUT": cut_order, "CLARITY": clarity_order}

# Ordinal-encode CUT and CLARITY into CUT_OE / CLARITY_OE using those orderings.
enc = OrdinalEncoder(
    input_cols=["CUT", "CLARITY"],
    output_cols=["CUT_OE", "CLARITY_OE"],
    categories=categories,
)
fitted_encoder = enc.fit(df)
df = fitted_encoder.transform(df)

# NOTE(review): `_state_pandas` is a private attribute (leading underscore);
# it is read here only to inspect the fitted column/category/index mapping.
print(enc._state_pandas)
df.show()

   _COLUMN_NAME  _CATEGORY  _INDEX
0           CUT      IDEAL       0
1           CUT    PREMIUM       1
2           CUT  VERY_GOOD       2
3           CUT       GOOD       3
4           CUT       FAIR       4
5       CLARITY         IF       0
6       CLARITY       VVS1       1
7       CLARITY       VVS2       2
8       CLARITY        VS1       3
9       CLARITY        VS2       4
10      CLARITY        SI1       5
11      CLARITY        SI2       6
12      CLARITY         I1       7
13      CLARITY         I2       8
14      CLARITY         I3       9
---------------------------------------------------------------------------------------------------------------------------------------------
|"CUT_OE"  |"CLARITY_OE"  |"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"CARAT"  |"X"   |"Y"   |"Z"   |"DEPTH"  |"TABLE_PCT"  |"CARAT_NORM"  |
---------------------------------------------------------------------------------------------------------------------------------------------
|0.0       |6.

In [4]:
# One-hot encode the three categorical columns. Each output name acts as the
# prefix of the generated per-category columns (e.g. CUT_OHE_IDEAL).
enc = OneHotEncoder(
    input_cols=["CUT", "COLOR", "CLARITY"],
    output_cols=["CUT_OHE", "COLOR_OHE", "CLARITY_OHE"])
df = enc.fit(df).transform(df)

# A bare expression is only echoed when it is the last statement of a cell,
# so the column list has to be printed explicitly to be visible here.
print(np.array(df.columns))
df.show()

# Persist the transformed frame, replacing any existing table of that name.
df.write.mode('overwrite').save_as_table('diamonds_transform')

  success, nchunks, nrows, ci_output = write_pandas(
  success, nchunks, nrows, ci_output = write_pandas(


---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CUT_OHE_FAIR"  |"CUT_OHE_GOOD"  |"CUT_OHE_IDEAL"  |"CUT_OHE_PREMIUM"  |"CUT_OHE_VERY_GOOD"  |"COLOR_OHE_D"  |"COLOR_OHE_E"  |"COLOR_OHE_F"  |"COLOR_OHE_G"  |"COLOR_OHE_H"  |"COLOR_OHE_I"  |"COLOR_OHE_J"  |"CLARITY_OHE_I1"  |"CLARITY_OHE_IF"  |"CLARITY_OHE_SI1"  |"CLARITY_OHE_SI2"  |"CLARITY_OHE_VS1"  |"CLARITY_OHE_VS2"  |"CLARITY_OHE_VVS1"  |"CLARITY_OHE_VVS2"  |"CUT_OE"  |"CLARITY_OE"  |"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"CARAT"  |"X"   |"Y"   |"Z"   |"DEPTH"  |"TABLE_PCT"  |