In [None]:
import warnings

import pandas as pd
from snowflake.ml._internal.utils import identifier
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.metrics import accuracy_score, precision_score, recall_score
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.preprocessing import OneHotEncoder
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.registry import model_registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import types as T
from snowflake.snowpark.functions import col

# Silence UserWarning messages for the rest of the notebook to keep cell output readable
warnings.simplefilter(action="ignore", category=UserWarning)

In [None]:
# Get (or create) a Snowpark session from the connection options returned by SnowflakeLoginOptions()
session = Session.builder.configs(SnowflakeLoginOptions()).getOrCreate()

In [None]:
# Reference the "titanic" table as a Snowpark DataFrame
titanic_df = session.table("titanic")

In [None]:
# Preview the raw table
titanic_df.show()

In [None]:
# Count the rows with a NULL value in each column.
# Each column triggers its own count query against the table.
null_counts = []
for col_name in titanic_df.columns:
    n_missing = titanic_df.where(col(col_name).isNull()).count()
    null_counts.append((col_name, n_missing))
null_counts

In [None]:
# Columns excluded from modeling
dropped_cols = [
    "AGE",
    "DECK",
    "ALIVE",
    "ADULT_MALE",
    "EMBARKED",
    "SEX",
    "PCLASS",
    "ALONE",
]
titanic_df = titanic_df.drop(dropped_cols)

In [None]:
# Replace FARE with the same values cast to a float type
fare_as_float = titanic_df["FARE"].astype(T.FloatType())
titanic_df = titanic_df.withColumn("FARE", fare_as_float)

titanic_df.show()

In [None]:
# Categorical feature columns: imputed and one-hot encoded in the cells below
cat_cols = ["CLASS", "WHO", "EMBARK_TOWN"]
# Numeric feature columns
# NOTE(review): num_cols is not referenced by any other cell visible in this notebook
num_cols = ["SIBSP", "PARCH", "FARE"]

In [None]:
# Fill missing categorical values using the "most_frequent" strategy,
# writing the imputed values back over the same column names
impute_cat = SimpleImputer(
    strategy="most_frequent",
    input_cols=cat_cols,
    output_cols=cat_cols,
    drop_input_cols=True,
)

# Fit on the full table, then apply the fitted imputer to it
impute_cat.fit(titanic_df)
titanic_df = impute_cat.transform(titanic_df)
titanic_df.show()

In [None]:
# One-hot encode the categorical columns; the original columns are dropped
# and the encoder is configured with drop="first" and handle_unknown="ignore"
OHE = OneHotEncoder(
    input_cols=cat_cols,
    output_cols=cat_cols,
    drop="first",
    handle_unknown="ignore",
    drop_input_cols=True,
)

# Fit on the full table, then apply the fitted encoder to it
OHE.fit(titanic_df)
titanic_df = OHE.transform(titanic_df)
titanic_df.show()

In [None]:
# Split into train/test DataFrames with 0.8/0.2 weights, using a fixed seed
train_df, test_df = titanic_df.random_split(weights=[0.8, 0.2], seed=25)

In [None]:
# Hyperparameter grid for the search: 5 * 5 * 3 * 5 = 375 combinations
parameters = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 2, 3, 4, 5],
)

In [None]:
# Display the grid for review
parameters

In [None]:
# Scale the current warehouse up before running the grid search.
# The identifier is used exactly as the session reports it: slicing off the
# first and last characters (as this cell previously did) assumes the name is
# wrapped in double quotes and breaks names that actually need the quotes
# (case-sensitive or containing special characters).
current_warehouse = session.get_current_warehouse()
session.sql(
    f"ALTER WAREHOUSE {current_warehouse} SET WAREHOUSE_SIZE=LARGE;"
).collect()

Data scientists may not have permission to change the warehouse size. They usually have access to a larger warehouse, though, and can switch to it with `session.use_warehouse('bigger_warehouse')`.

In [None]:
# Every column except the label is used as a model input
feature_cols = train_df.drop("SURVIVED").columns

# Grid search over `parameters` for an XGBoost classifier, scored on accuracy
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=parameters,
    scoring="accuracy",
    n_jobs=-1,
    input_cols=feature_cols,
    label_cols="SURVIVED",
    output_cols="PRED_SURVIVED",
)

# Train
grid_search.fit(train_df)

In [None]:
# Scale the current warehouse back down now that training has finished.
# The identifier is used exactly as the session reports it: slicing off the
# first and last characters (as this cell previously did) assumes the name is
# wrapped in double quotes and breaks names that actually need the quotes
# (case-sensitive or containing special characters).
current_warehouse = session.get_current_warehouse()
session.sql(
    f"ALTER WAREHOUSE {current_warehouse} SET WAREHOUSE_SIZE=XSMALL;"
).collect()

In [None]:
# Score the held-out test split with the fitted grid search
result = grid_search.predict(test_df)

In [None]:
# Accuracy on the test split: compare the SURVIVED label with the PRED_SURVIVED prediction column
accuracy = accuracy_score(
    df=result, y_true_col_names="SURVIVED", y_pred_col_names="PRED_SURVIVED"
)

print(f"Accuracy: {accuracy}")

In [None]:
# Tabulate every hyperparameter combination alongside its mean cross-validation score
results = grid_search.to_sklearn().cv_results_
n_candidates = len(results["params"])
data = {"accuracy": results["mean_test_score"]}
for row, candidate in enumerate(results["params"]):
    for hp_name, hp_value in candidate.items():
        # Create a None-filled column the first time a parameter name is seen
        column = data.setdefault(hp_name, [None] * n_candidates)
        column[row] = hp_value

# Best-scoring combinations first
hp_df = pd.DataFrame(data).sort_values(by="accuracy", ascending=False)
hp_df.head()

# Model Registry


In [None]:
# Best estimator found by the grid search and the hyperparameters it settled on
optimal_model = grid_search.to_sklearn().best_estimator_
optimal_n_estimators = optimal_model.n_estimators
optimal_learning_rate = optimal_model.learning_rate
# hp_df is sorted by accuracy (descending) but keeps its original index labels,
# so the best score must be taken by position. A label lookup with [0] would
# return the score of the first grid candidate instead of the best one.
optimal_accuracy = hp_df["accuracy"].iloc[0]

In [None]:
def model_version_update(df, name):
    """Return the next version number to use for the model called ``name``.

    Args:
        df: DataFrame of registered models with ``NAME`` and ``VERSION``
            columns (here, the result of ``registry.list_models()``).
        name: Model name to look up.

    Returns:
        1 when no integer version is registered under ``name``; otherwise the
        highest registered version plus one.
    """
    versions = df.filter(col("NAME") == name)
    # VERSION is cast to int so that the maximum is numeric rather than textual
    versions = versions.withColumn("VERSION", versions["VERSION"].cast("int"))
    max_version = versions.agg({"VERSION": "max"}).collect()[0][0]
    # The aggregate is None when there are no matching rows (or only NULL
    # versions); checking it here also avoids a separate count() query.
    if max_version is None:
        return 1
    return max_version + 1

In [None]:
# Define model name; the registry lives in the session's current database/schema
model_name = "titanic"
db = identifier._get_unescaped_name(session.get_current_database())
schema = identifier._get_unescaped_name(session.get_current_schema())

# Open the registry, creating it if it does not exist yet
registry = model_registry.ModelRegistry(
    session=session,
    database_name=db,
    schema_name=schema,
    create_if_not_exists=True,
)

# Work out the next free version number for this model name
reg_df = registry.list_models()
model_version = model_version_update(reg_df, model_name)

# Sample input data (features only) to pass into the registry logging function
X = train_df.drop("SURVIVED").limit(100)

log_options = {
    "embed_local_ml_library": True,  # This option is enabled to pull latest dev code changes.
    "relax": True,  # relax dependencies
}

# Log the model
registry.log_model(
    model_name=model_name,
    model_version=model_version,
    model=optimal_model,
    sample_input_data=X,
    options=log_options,
)

# Add evaluation metric
registry.set_metric(
    model_name=model_name,
    model_version=model_version,
    metric_name="accuracy",
    metric_value=optimal_accuracy,
)

In [None]:
# Let's confirm it was added
# NOTE(review): reg_df was created before log_model ran; this relies on the
# DataFrame being re-evaluated when show() is called — confirm
reg_df.show()

If the registry holds multiple versions of the model, we want to deploy the UDF from the version with the highest accuracy.


In [None]:
# Flatten the METRICS column into one row per (model version, metric) and keep
# the accuracy metric. The rows are restricted to `model_name` so that a
# different model stored in the same registry can never be picked as the
# version to deploy.
best_model = (
    reg_df.flatten(reg_df["METRICS"])
    .filter(col("KEY") == "accuracy")
    .filter(col("NAME") == model_name)
    .select("name", "VERSION", col("value").as_("ACCURACY"))
)
best_model.show()

Get the best model and version


In [None]:
# Rank the versions by accuracy (highest first) and fetch only the top row
ranked_models = best_model.sort(col("ACCURACY"), ascending=False)
top_row = ranked_models.limit(1).collect()[0]
# Position 1 is the VERSION column of the best_model selection
deployed_version = top_row[1]

In [None]:
# A ModelReference gives us a handle to a specific model version in the registry;
# here it points at the version selected for deployment
model_ref = model_registry.ModelReference(
    registry=registry, model_name=model_name, model_version=deployed_version
)

In [None]:
# Deploy the selected model version as a permanent UDF exposing `predict`
model_deployment_name = "survival_pred"

model_ref.deploy(
    deployment_name=model_deployment_name,  # reuse the name defined above instead of repeating the literal
    target_method="predict",  # the name of the model's method to expose
    permanent=True,
    options={
        "replace_udf": True,  # boolean flag: overwrite an existing UDF of the same name
    },
)

In [None]:
# Let's confirm it was added
# The deployment was created through model_ref, i.e. for deployed_version,
# which is not necessarily the version logged in this run (model_version)
registry.list_deployments(model_name, deployed_version).to_pandas()

In [None]:
# We can then use the deployed model to perform inference on the test split
result_sdf = model_ref.predict(deployment_name="survival_pred", data=test_df)
result_sdf.show()

In [None]:
# Same inference call as above (same deployment and data), written with positional arguments
model_ref.predict("survival_pred", test_df).show()

In [None]:
# Persist the test split as table TEST_DATA, written in "overwrite" mode
test_df.write.mode("overwrite").save_as_table("TEST_DATA")

In [None]:
# Deploy the same model version a second time, exposing `predict_proba`
model_deployment_name = "survival_pred_proba"

model_ref.deploy(
    deployment_name=model_deployment_name,  # reuse the name defined above instead of repeating the literal
    target_method="predict_proba",  # the name of the model's method to expose
    permanent=True,
    options={
        "replace_udf": True,  # boolean flag: overwrite an existing UDF of the same name
    },
)

In [None]:
# Run the predict_proba deployment on the test split and hide the first output column
# NOTE(review): presumably output_feature_0 is the probability of the negative class — confirm
model_ref.predict("survival_pred_proba", test_df).drop("output_feature_0").show()

## Add images to stage for Streamlit App


In [None]:
# Upload every file matching ../streamlit_images/* to the @ML_DATA stage
# NOTE(review): assumes the ML_DATA stage already exists — confirm
session.file.put("../streamlit_images/*", "@ML_DATA")