In [131]:
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.metrics import accuracy_score, precision_score, recall_score
from snowflake.ml.modeling.preprocessing import OneHotEncoder
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.ml.registry import model_registry
from snowflake.ml._internal.utils import identifier
from snowflake.snowpark import Session, types as T
from snowflake.snowpark.functions import col
import pandas as pd

import warnings

warnings.simplefilter(action="ignore", category=UserWarning)

In [132]:
session = Session.builder.configs(SnowflakeLoginOptions()).getOrCreate()

In [133]:
titanic_df = session.table("titanic")

In [134]:
titanic_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SURVIVED"  |"PCLASS"  |"AGE"  |"SIBSP"  |"PARCH"  |"FARE"   |"ADULT_MALE"  |"DECK"  |"ALIVE"  |"ALONE"  |"SEX"   |"EMBARKED"  |"CLASS"  |"WHO"  |"EMBARK_TOWN"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0           |3         |22.00  |1        |0        |7.2500   |True          |NULL    |False    |False    |MALE    |S           |THIRD    |MAN    |SOUTHAMPTON    |
|1           |1         |38.00  |1        |0        |71.2833  |False         |C       |True     |False    |FEMALE  |C           |FIRST    |WOMAN  |CHERBOURG      |
|1           |3         |26.00  |0        |0        |7.9250   |False         |NULL    |True     |True     |FEMALE  |S           |THIRD    |WOMAN  |SOUTHAMPTON    |
|1           |1 

In [135]:
# Columns with null values and their respective counts
null_counts = [
    (col_name, titanic_df.where(col(col_name).isNull()).count())
    for col_name in titanic_df.columns
]
null_counts

[('SURVIVED', 0),
 ('PCLASS', 0),
 ('AGE', 177),
 ('SIBSP', 0),
 ('PARCH', 0),
 ('FARE', 0),
 ('ADULT_MALE', 0),
 ('DECK', 688),
 ('ALIVE', 0),
 ('ALONE', 0),
 ('SEX', 0),
 ('EMBARKED', 2),
 ('CLASS', 0),
 ('WHO', 0),
 ('EMBARK_TOWN', 2)]

In [136]:
titanic_df = titanic_df.drop(
    ["AGE", "DECK", "ALIVE", "ADULT_MALE", "EMBARKED", "SEX", "PCLASS", "ALONE"]
)

In [137]:
titanic_df = titanic_df.withColumn("FARE", titanic_df["FARE"].astype(T.FloatType()))

titanic_df.show()

------------------------------------------------------------------------------
|"SURVIVED"  |"SIBSP"  |"PARCH"  |"CLASS"  |"WHO"  |"EMBARK_TOWN"  |"FARE"   |
------------------------------------------------------------------------------
|0           |1        |0        |THIRD    |MAN    |SOUTHAMPTON    |7.25     |
|1           |1        |0        |FIRST    |WOMAN  |CHERBOURG      |71.2833  |
|1           |0        |0        |THIRD    |WOMAN  |SOUTHAMPTON    |7.925    |
|1           |1        |0        |FIRST    |WOMAN  |SOUTHAMPTON    |53.1     |
|0           |0        |0        |THIRD    |MAN    |SOUTHAMPTON    |8.05     |
|0           |0        |0        |THIRD    |MAN    |QUEENSTOWN     |8.4583   |
|0           |0        |0        |FIRST    |MAN    |SOUTHAMPTON    |51.8625  |
|0           |3        |1        |THIRD    |CHILD  |SOUTHAMPTON    |21.075   |
|1           |0        |2        |THIRD    |WOMAN  |SOUTHAMPTON    |11.1333  |
|1           |1        |0        |SECOND   |CHILD  |

In [138]:
cat_cols = ["CLASS", "WHO", "EMBARK_TOWN"]
num_cols = ["SIBSP", "PARCH", "FARE"]

In [139]:
impute_cat = SimpleImputer(
    input_cols=cat_cols,
    output_cols=cat_cols,
    strategy="most_frequent",
    drop_input_cols=True,
)

titanic_df = impute_cat.fit(titanic_df).transform(titanic_df)
titanic_df.show()

------------------------------------------------------------------------------
|"CLASS"  |"WHO"  |"EMBARK_TOWN"  |"SURVIVED"  |"SIBSP"  |"PARCH"  |"FARE"   |
------------------------------------------------------------------------------
|THIRD    |MAN    |SOUTHAMPTON    |0           |1        |0        |7.25     |
|FIRST    |WOMAN  |CHERBOURG      |1           |1        |0        |71.2833  |
|THIRD    |WOMAN  |SOUTHAMPTON    |1           |0        |0        |7.925    |
|FIRST    |WOMAN  |SOUTHAMPTON    |1           |1        |0        |53.1     |
|THIRD    |MAN    |SOUTHAMPTON    |0           |0        |0        |8.05     |
|THIRD    |MAN    |QUEENSTOWN     |0           |0        |0        |8.4583   |
|FIRST    |MAN    |SOUTHAMPTON    |0           |0        |0        |51.8625  |
|THIRD    |CHILD  |SOUTHAMPTON    |0           |3        |1        |21.075   |
|THIRD    |WOMAN  |SOUTHAMPTON    |1           |0        |2        |11.1333  |
|SECOND   |CHILD  |CHERBOURG      |1           |1   

In [140]:
OHE = OneHotEncoder(
    input_cols=cat_cols,
    output_cols=cat_cols,
    drop_input_cols=True,
    drop="first",
    handle_unknown="ignore",
)

titanic_df = OHE.fit(titanic_df).transform(titanic_df)
titanic_df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CLASS_SECOND"  |"CLASS_THIRD"  |"WHO_MAN"  |"WHO_WOMAN"  |"EMBARK_TOWN_QUEENSTOWN"  |"EMBARK_TOWN_SOUTHAMPTON"  |"SURVIVED"  |"SIBSP"  |"PARCH"  |"FARE"   |
--------------------------------------------------------------------------------------------------------------------------------------------------------------
|0.0             |1.0            |1.0        |0.0          |0.0                       |1.0                        |0           |1        |0        |7.25     |
|0.0             |0.0            |0.0        |1.0          |0.0                       |0.0                        |1           |1        |0        |71.2833  |
|0.0             |1.0            |0.0        |1.0          |0.0                       |1.0                        |1           |0        |0        |7.925    |
|0.0             |0.0            |0.0        |

In [141]:
train_df, test_df = titanic_df.random_split(weights=[0.8, 0.2], seed=25)

In [142]:
parameters = {
    "n_estimators": [100, 200, 300, 400, 500],
    "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5],
}

In [143]:
session.sql(
    f"ALTER WAREHOUSE {session.get_current_warehouse()[1:-1]} SET WAREHOUSE_SIZE=LARGE;"
).collect()

[Row(status='Statement executed successfully.')]

In [144]:
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=parameters,
    n_jobs=-1,
    scoring="accuracy",
    input_cols=train_df.drop("SURVIVED").columns,
    label_cols="SURVIVED",
    output_cols="PRED_SURVIVED",
)

# Train
grid_search.fit(train_df)



<snowflake.ml.modeling.model_selection.grid_search_cv.GridSearchCV at 0x7f8f82455d00>

In [145]:
session.sql(
    f"ALTER WAREHOUSE {session.get_current_warehouse()[1:-1]} SET WAREHOUSE_SIZE=XSMALL;"
).collect()

[Row(status='Statement executed successfully.')]

In [146]:
result = grid_search.predict(test_df)

In [147]:
accuracy = accuracy_score(
    df=result, y_true_col_names="SURVIVED", y_pred_col_names="PRED_SURVIVED"
)

print(f"Accuracy: {accuracy}")

Accuracy: 0.819095


In [148]:
# Print each combination of hyperparameters with their accuracy
results = grid_search.to_sklearn().cv_results_
data = {"accuracy": results["mean_test_score"]}
for i, param in enumerate(results["params"]):
    for key, value in param.items():
        if key not in data:
            data[key] = [None] * len(results["params"])
        data[key][i] = value

# Create DataFrame
hp_df = pd.DataFrame(data).sort_values(by='accuracy', ascending=False)
hp_df.head()

Unnamed: 0,accuracy,learning_rate,n_estimators
0,0.828099,0.1,100
14,0.826723,0.3,500
13,0.825274,0.3,400
1,0.825242,0.1,200
17,0.823845,0.4,300


# Model Registry


In [149]:
optimal_model = grid_search.to_sklearn().best_estimator_
optimal_n_estimators = optimal_model.n_estimators
optimal_learning_rate = optimal_model.learning_rate
optimal_accuracy = hp_df["accuracy"][0]

In [150]:
#create function to add one to our model number if it already exists
def model_version_update(df, name):
    filtered_df = df.filter(col('NAME') == name)
    if df.count() == 0:
        return 1
    else:
        filtered_df = filtered_df.withColumn("VERSION", filtered_df["VERSION"].cast("int"))
        max_version = filtered_df.agg({"VERSION": "max"}).collect()[0][0]
        return max_version + 1

In [151]:
# Get sample input data to pass into the registry logging function
X = train_df.drop("SURVIVED").limit(100)

db = identifier._get_unescaped_name(session.get_current_database())
schema = identifier._get_unescaped_name(session.get_current_schema())

# Define model name and version
model_name = "titanic"

# Create a registry and log the model
registry = model_registry.ModelRegistry(
    session=session, database_name=db, schema_name=schema, create_if_not_exists=True
)

reg_df = registry.list_models()
model_version = model_version_update(reg_df, model_name)

registry.log_model(
    model_name=model_name,
    model_version=model_version,
    model=optimal_model,
    sample_input_data=X,
    options={
        "embed_local_ml_library": True,  # This option is enabled to pull latest dev code changes.
        "relax": True,
    },  # relax dependencies
)

# Add evaluation metric
registry.set_metric(
    model_name=model_name,
    model_version=model_version,
    metric_name="accuracy",
    metric_value=optimal_accuracy,
)



In [152]:
# Let's confirm it was added
reg_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CREATION_CONTEXT"  |"CREATION_ENVIRONMENT_SPEC"  |"CREATION_ROLE"  |"CREATION_TIME"                   |"ID"                              |"INPUT_SPEC"  |"NAME"   |"OUTPUT_SPEC"  |"RUNTIME_ENVIRONMENT_SPEC"  |"TYPE"   |"URI"                                               |"VERSION"  |"ARTIFACT_IDS"  |"DESCRIPTION"  |"METRICS"                         |"TAGS"  |"REGISTRATION_TIMESTAMP"          |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

If you have multiple versions of the model, we want the UDF to be deployed as the version with the highest accuracy

In [153]:
best_model = (
    reg_df.flatten(reg_df["METRICS"])
    .filter(col("KEY") == "accuracy")
    .select("name", "VERSION", col("value").as_("ACCURACY"))
)
best_model.show()

--------------------------------------------
|"NAME"   |"VERSION"  |"ACCURACY"          |
--------------------------------------------
|titanic  |3          |0.8190133607399794  |
|titanic  |2          |0.8093414387031409  |
|titanic  |4          |0.8280992597226566  |
|titanic  |1          |0.8093414387031409  |
--------------------------------------------



Get the best model and version

In [154]:
deployed_version = best_model.sort(col('ACCURACY'), ascending=False).limit(1).collect()[0][1]

In [155]:
# We can always get a reference to our registry and model using this function call
model_ref = model_registry.ModelReference(
    registry=registry, model_name=model_name, model_version=deployed_version
)

In [156]:
model_deployment_name = "survival_pred"

model_ref.deploy(
    deployment_name="survival_pred",
    target_method="predict",  # the name of the model's method, usually predict
    permanent=True,
        options={
        "replace_udf": "True",
    }
)

{'name': 'SNOWPARK.TITANIC.survival_pred',
 'platform': <TargetPlatform.WAREHOUSE: 'warehouse'>,
 'target_method': 'predict',
 'signature': ModelSignature(
                     inputs=[
                         FeatureSpec(dtype=DataType.DOUBLE, name='CLASS_SECOND'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='CLASS_THIRD'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='WHO_MAN'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='WHO_WOMAN'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='EMBARK_TOWN_QUEENSTOWN'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='EMBARK_TOWN_SOUTHAMPTON'),
 		FeatureSpec(dtype=DataType.INT8, name='SIBSP'),
 		FeatureSpec(dtype=DataType.INT8, name='PARCH'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='FARE')
                     ],
                     outputs=[
                         FeatureSpec(dtype=DataType.INT64, name='output_feature_0')
                     ]
                 ),
 'options': {'replace_udf': 'True',
  'permanent_udf_stage_location': '@SNOWPARK.TITA

In [157]:
# Let's confirm it was added
registry.list_deployments(model_name, model_version).to_pandas()

Unnamed: 0,MODEL_NAME,MODEL_VERSION,DEPLOYMENT_NAME,CREATION_TIME,TARGET_METHOD,TARGET_PLATFORM,SIGNATURE,OPTIONS,STAGE_PATH,ROLE
0,titanic,4,survival_pred,2024-01-09 09:43:16.194000-08:00,predict,warehouse,"{\n ""inputs"": [\n {\n ""name"": ""CLASS_...","{\n ""permanent_udf_stage_location"": ""@SNOWPAR...",@SNOWPARK.TITANIC._SYSTEM_REGISTRY_DEPLOYMENTS...,"""SYSADMIN"""


In [158]:
# We can then use the deployed model to perform inference
result_sdf = model_ref.predict(deployment_name="survival_pred", data=test_df)
# result_sdf.rename(F.col('"output_feature_0"'),"PREDICTED_PRICE").show()
result_sdf.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CLASS_SECOND"  |"CLASS_THIRD"  |"WHO_MAN"  |"WHO_WOMAN"  |"EMBARK_TOWN_QUEENSTOWN"  |"EMBARK_TOWN_SOUTHAMPTON"  |"SURVIVED"  |"SIBSP"  |"PARCH"  |"FARE"   |"output_feature_0"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0.0             |0.0            |1.0        |0.0          |0.0                       |1.0                        |0           |0        |0        |51.8625  |0                   |
|0.0             |1.0            |0.0        |0.0          |1.0                       |0.0                        |0           |4        |1        |29.125   |0                   |
|1.0             |0.0            |1.0        |0.0          |0.0                       |1.0          

In [159]:
model_ref.predict("survival_pred", test_df).show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CLASS_SECOND"  |"CLASS_THIRD"  |"WHO_MAN"  |"WHO_WOMAN"  |"EMBARK_TOWN_QUEENSTOWN"  |"EMBARK_TOWN_SOUTHAMPTON"  |"SURVIVED"  |"SIBSP"  |"PARCH"  |"FARE"   |"output_feature_0"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0.0             |0.0            |1.0        |0.0          |0.0                       |1.0                        |0           |0        |0        |51.8625  |0                   |
|0.0             |1.0            |0.0        |0.0          |1.0                       |0.0                        |0           |4        |1        |29.125   |0                   |
|1.0             |0.0            |1.0        |0.0          |0.0                       |1.0          

In [160]:
test_df.write.mode("overwrite").save_as_table("TEST_DATA")

In [161]:
model_deployment_name = "survival_pred_proba"

model_ref.deploy(
    deployment_name="survival_pred_proba",
    target_method="predict_proba",  # the name of the model's method, usually predict
    permanent=True,
    options={
        "replace_udf": "True",
    }
)

{'name': 'SNOWPARK.TITANIC.survival_pred_proba',
 'platform': <TargetPlatform.WAREHOUSE: 'warehouse'>,
 'target_method': 'predict_proba',
 'signature': ModelSignature(
                     inputs=[
                         FeatureSpec(dtype=DataType.DOUBLE, name='CLASS_SECOND'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='CLASS_THIRD'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='WHO_MAN'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='WHO_WOMAN'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='EMBARK_TOWN_QUEENSTOWN'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='EMBARK_TOWN_SOUTHAMPTON'),
 		FeatureSpec(dtype=DataType.INT8, name='SIBSP'),
 		FeatureSpec(dtype=DataType.INT8, name='PARCH'),
 		FeatureSpec(dtype=DataType.DOUBLE, name='FARE')
                     ],
                     outputs=[
                         FeatureSpec(dtype=DataType.FLOAT, name='output_feature_0'),
 		FeatureSpec(dtype=DataType.FLOAT, name='output_feature_1')
                     ]
                 ),
 'options'

In [162]:
model_ref.predict("survival_pred_proba", test_df).drop("output_feature_0").show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CLASS_SECOND"  |"CLASS_THIRD"  |"WHO_MAN"  |"WHO_WOMAN"  |"EMBARK_TOWN_QUEENSTOWN"  |"EMBARK_TOWN_SOUTHAMPTON"  |"SURVIVED"  |"SIBSP"  |"PARCH"  |"FARE"   |"output_feature_0"   |"output_feature_1"   |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0.0             |0.0            |1.0        |0.0          |0.0                       |1.0                        |0           |0        |0        |51.8625  |0.7811152338981628   |0.21888476610183716  |
|0.0             |1.0            |0.0        |0.0          |1.0                       |0.0                        |0           |4        |1        |29.125   |0.9667628407478333   |0.033237

## Add images to stage for Streamlit App


In [163]:
session.file.put("streamlit_images/*", "@ML_DATA")

[PutResult(source='floating.webp', target='floating.webp.gz', source_size=205540, target_size=0, source_compression='NONE', target_compression='GZIP', status='SKIPPED', message=''),
 PutResult(source='flying.webp', target='flying.webp.gz', source_size=77798, target_size=0, source_compression='NONE', target_compression='GZIP', status='SKIPPED', message=''),
 PutResult(source='sinking.webp', target='sinking.webp.gz', source_size=148802, target_size=0, source_compression='NONE', target_compression='GZIP', status='SKIPPED', message='')]