# MLOps with Snowflake ML

### Full Life-Cycle using the Diamond dataset (loaded via data_science/test_data_generation/load_datasets_snowflake.sql)

Connect to Snowflake and load the diamond dataset

In [None]:
from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Build the login options for the connection named "sanju" and open a
# Snowpark session with them.
conn = SnowflakeLoginOptions("sanju")
session = Session.builder.configs(conn).create()
# Select the working context used by the rest of the notebook. The database is
# selected before the schema, presumably so that DIAMONDS resolves inside
# DATA_ALCHEMIST -- keep this order.
session.use_warehouse("ADHOC_WH")
session.use_database("DATA_ALCHEMIST")
session.use_schema("DIAMONDS")


In [8]:
# Show which files are present on the external stage.
session.sql("LS @EXT_STAGE_LIST;").show()

# CSV reader options: comma-separated, optionally double-quoted fields, with
# the header row parsed and the column types inferred.
pars = dict(
    field_delimiter=",",
    field_optionally_enclosed_by='"',
    infer_schema=True,
    parse_header=True,
)

# Read the staged file(s) into a Snowpark DataFrame, then preview the rows and
# the summary statistics.
df = session.read.options(pars).csv("@EXT_STAGE_LIST")
df.show()
df.describe().show()

-----------------------------------------------------------------------------------------------------------------------------------
|"name"                                              |"size"   |"md5"                             |"last_modified"                |
-----------------------------------------------------------------------------------------------------------------------------------
|s3://sfquickstarts/intro-to-machine-learning-wi...  |2772143  |4d3d1d4bbad5e0806dbaec425cf90196  |Mon, 10 Jul 2023 22:04:47 GMT  |
-----------------------------------------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------
|"carat"  |"cut"      |"color"  |"clarity"  |"depth"  |"table"  |"price"  |"x"   |"y"   |"z"   |
------------------------------------------------------------------------------------------------
|0.23     |Ideal      |E        |SI2        |61.

Pre-Processing

In [15]:
import numpy as np
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import DoubleType
from snowflake.snowpark.types import DecimalType

# Normalise the column names: the CSV headers come through as quoted
# lower-case identifiers, so upper-case each one. "table" is renamed to
# TABLE_PCT instead (presumably to avoid clashing with the SQL keyword TABLE
# -- confirm).
for colname in df.columns:
    new_colname = "TABLE_PCT" if colname == '"table"' else colname.upper()
    df = df.with_column_renamed(colname, new_colname)


# Canonicalise the CUT labels: replace each run of non-alphanumeric characters
# with "_" and upper-case the result, so multi-word labels come out like
# VERY_GOOD.
for col in ["CUT"]:
    cleaned = F.regexp_replace(F.col(col), '[^a-zA-Z0-9]+', '_')
    df = df.with_column(col, F.upper(cleaned))


# Cast the numeric measurement columns to DoubleType.
for colname in ["CARAT", "X", "Y", "Z", "DEPTH", "TABLE_PCT"]:
    as_double = df[colname].cast(DoubleType())
    df = df.with_column(colname, as_double)
df.show()

----------------------------------------------------------------------------------------------------
|"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"CARAT"  |"X"   |"Y"   |"Z"   |"DEPTH"  |"TABLE_PCT"  |
----------------------------------------------------------------------------------------------------
|E        |SI2        |326      |IDEAL      |0.23     |3.95  |3.98  |2.43  |61.5     |55.0         |
|E        |SI1        |326      |PREMIUM    |0.21     |3.89  |3.84  |2.31  |59.8     |61.0         |
|E        |VS1        |327      |GOOD       |0.23     |4.05  |4.07  |2.31  |56.9     |65.0         |
|I        |VS2        |334      |PREMIUM    |0.29     |4.2   |4.23  |2.63  |62.4     |58.0         |
|J        |SI2        |335      |GOOD       |0.31     |4.34  |4.35  |2.75  |63.3     |58.0         |
|J        |VVS2       |336      |VERY_GOOD  |0.24     |3.94  |3.96  |2.48  |62.8     |57.0         |
|I        |VVS1       |336      |VERY_GOOD  |0.24     |3.95  |3.98  |2.47  |62.3     |57.0 

Transformation

In [16]:
from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
# Min-max scale CARAT into a new CARAT_NORM column; the scaler is fitted on
# the same frame it transforms.
pipe = MinMaxScaler(input_cols=["CARAT"], output_cols=["CARAT_NORM"])
pipe.fit(df)
df = pipe.transform(df)

# Store the scaled value as fixed-point DecimalType(7, 6).
carat_norm = df["CARAT_NORM"].cast(DecimalType(7, 6))
df = df.with_column("CARAT_NORM", carat_norm)
df.show()

-------------------------------------------------------------------------------------------------------------------
|"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"      |"CARAT"  |"X"   |"Y"   |"Z"   |"DEPTH"  |"TABLE_PCT"  |"CARAT_NORM"  |
-------------------------------------------------------------------------------------------------------------------
|E        |SI2        |326      |IDEAL      |0.23     |3.95  |3.98  |2.43  |61.5     |55.0         |0.006237      |
|E        |SI1        |326      |PREMIUM    |0.21     |3.89  |3.84  |2.31  |59.8     |61.0         |0.002079      |
|E        |VS1        |327      |GOOD       |0.23     |4.05  |4.07  |2.31  |56.9     |65.0         |0.006237      |
|I        |VS2        |334      |PREMIUM    |0.29     |4.2   |4.23  |2.63  |62.4     |58.0         |0.018711      |
|J        |SI2        |335      |GOOD       |0.31     |4.34  |4.35  |2.75  |63.3     |58.0         |0.022869      |
|J        |VVS2       |336      |VERY_GOOD  |0.24     |3.94  |3.96  |2.4

In [17]:
# Explicit category order per column: an entry's position in its array is the
# ordinal code it is encoded to (IDEAL -> 0, PREMIUM -> 1, ...).
categories = {
    "CUT": np.array(["IDEAL", "PREMIUM", "VERY_GOOD", "GOOD", "FAIR"]),
    "CLARITY": np.array(["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]),
    "COLOR": np.array(["D", "E", "F", "G", "H", "I", "J"]),
}

# Encode every column listed in `categories` into a matching <COLUMN>_OE
# output column.
enc = OrdinalEncoder(
    input_cols=list(categories),
    output_cols=[f"{name}_OE" for name in categories],
    categories=categories,
)
enc.fit(df)
df = enc.transform(df)

# Print the fitted category -> index table.
# NOTE(review): `_state_pandas` is a private attribute of the encoder and may
# change between library versions.
print(enc._state_pandas)
df.show()

   _COLUMN_NAME  _CATEGORY  _INDEX
0           CUT      IDEAL       0
1           CUT    PREMIUM       1
2           CUT  VERY_GOOD       2
3           CUT       GOOD       3
4           CUT       FAIR       4
5       CLARITY         IF       0
6       CLARITY       VVS1       1
7       CLARITY       VVS2       2
8       CLARITY        VS1       3
9       CLARITY        VS2       4
10      CLARITY        SI1       5
11      CLARITY        SI2       6
12      CLARITY         I1       7
13      CLARITY         I2       8
14      CLARITY         I3       9
15        COLOR          D       0
16        COLOR          E       1
17        COLOR          F       2
18        COLOR          G       3
19        COLOR          H       4
20        COLOR          I       5
21        COLOR          J       6
------------------------------------------------------------------------------------------------------------------------------------------------------
|"CUT_OE"  |"CLARITY_OE"  |"COLOR_OE"  |"COL

In [18]:
# One-hot encode the three categorical columns. Each input column expands into
# one output column per category, named <PREFIX>_<CATEGORY>
# (e.g. CUT_OHE_FAIR, COLOR_OHE_D).
# NOTE: `enc` is rebound here; from this cell on it refers to the
# OneHotEncoder, not the OrdinalEncoder fitted earlier.
enc = OneHotEncoder(
    input_cols=["CUT", "COLOR", "CLARITY"],
    output_cols=["CUT_OHE", "COLOR_OHE", "CLARITY_OHE"])
df = enc.fit(df).transform(df)
# The original cell evaluated `np.array(df.columns)` here as a bare mid-cell
# expression; its value was discarded and never displayed, so it was dropped.
df.show()
# Persist the fully transformed frame, replacing any existing table.
df.write.mode('overwrite').save_as_table('diamonds_transform')

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CUT_OHE_FAIR"  |"CUT_OHE_GOOD"  |"CUT_OHE_IDEAL"  |"CUT_OHE_PREMIUM"  |"CUT_OHE_VERY_GOOD"  |"COLOR_OHE_D"  |"COLOR_OHE_E"  |"COLOR_OHE_F"  |"COLOR_OHE_G"  |"COLOR_OHE_H"  |"COLOR_OHE_I"  |"COLOR_OHE_J"  |"CLARITY_OHE_I1"  |"CLARITY_OHE_IF"  |"CLARITY_OHE_SI1"  |"CLARITY_OHE_SI2"  |"CLARITY_OHE_VS1"  |"CLARITY_OHE_VS2"  |"CLARITY_OHE_VVS1"  |"CLARITY_OHE_VVS2"  |"CUT_OE"  |"CLARITY_OE"  |"COLOR_OE"  |"COLOR"  |"CLARITY"  |"PRICE"  |"CUT"  |"CARAT"  |"X"   |"Y"   |"Z"   |"DEPTH

Model Training

In [19]:
from snowflake.ml.modeling.ensemble import RandomForestRegressor

# Tag every query issued from here on so the training / registry workload can
# be identified in the query history.
session.query_tag = "model-registry-demo"

# 90/10 train/test split with a fixed seed.
train_df, test_df = df.random_split(weights=[0.9, 0.1], seed=0)

# Random forest on the ordinal-encoded categoricals plus the raw numeric
# measurements. `random_state` is pinned (matching the split seed) so the
# estimator's own randomness no longer varies between runs; without it every
# re-run of this cell produced a different forest and different predictions.
model = RandomForestRegressor(
    input_cols=["CUT_OE", "COLOR_OE", "CLARITY_OE", "CARAT", "DEPTH", "TABLE_PCT", "X", "Y", "Z"],
    label_cols=['PRICE'],
    output_cols=['PREDICTED_PRICE'],
    random_state=0)
model.fit(train_df)

# Score the held-out rows and compare actual vs. predicted price.
pred = model.predict(test_df)
pred.select("PRICE", "PREDICTED_PRICE").show()


2025-05-05 12:44:58.090849: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)


-------------------------------
|"PRICE"  |"PREDICTED_PRICE"  |
-------------------------------
|1960     |2049.34            |
|1107     |1035.33            |
|506      |477.57             |
|394      |400.1              |
|17841    |16864.77           |
|3478     |3565.01            |
|3539     |3424.09            |
|2759     |2414.76            |
|2893     |2750.83            |
|2056     |2080.0             |
-------------------------------



Model Registry

In [23]:
from snowflake.ml.registry import Registry

# Open the model registry through the current session.
registry = Registry(session=session)

# Log the fitted regressor as version "v1" of Diamond_Price_Predictor and keep
# the returned handle to that version in `model_ref`.
model_ref = registry.log_model(
    model,
    model_name="Diamond_Price_Predictor",
    version_name="v1",
    conda_dependencies=["scikit-learn"],
    comment="Random Forest Regressor for Diamonds",
)

# Last expression of the cell: display the models currently in the registry.
registry.show_models()

  self.manifest.save(


Unnamed: 0,created_on,name,model_type,database_name,schema_name,comment,owner,default_version_name,versions,aliases
0,2025-05-04 19:55:44.027000-07:00,DIAMOND_PRICE_PREDICTOR,USER_MODEL,DATA_ALCHEMIST,DIAMONDS,,ACCOUNTADMIN,V1,"[""V1""]","{""DEFAULT"":""V1"",""FIRST"":""V1"",""LAST"":""V1""}"
