In [None]:
import warnings

import pandas as pd
from snowflake.ml._internal.utils import identifier
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.metrics import accuracy_score, precision_score, recall_score
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.preprocessing import OneHotEncoder
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.registry import Registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
from snowflake.snowpark import types as T
from snowflake.snowpark.functions import col
import ast
import json

# Suppress UserWarning messages for the rest of the notebook.
warnings.simplefilter(action="ignore", category=UserWarning)

In [None]:
# Open a Snowpark session using the connection parameters returned by SnowflakeLoginOptions().
session = Session.builder.configs(SnowflakeLoginOptions()).getOrCreate()

In [None]:
# Reference the "titanic" table through the session; later cells rebind
# titanic_df as columns are dropped and transformed.
titanic_df = session.table("titanic")

In [None]:
# Preview the table contents.
titanic_df.show()

In [None]:
# Count the NULLs in every column and keep only the columns that have at least one.
null_counts = {}
for col_name in titanic_df.columns:
    n_missing = titanic_df.where(col(col_name).is_null()).count()
    if n_missing > 0:
        null_counts[col_name] = n_missing

null_counts

In [None]:
# Columns removed from the dataset before preprocessing and training.
dropped_cols = [
    "AGE",
    "DECK",
    "ALIVE",
    "ADULT_MALE",
    "EMBARKED",
    "SEX",
    "PCLASS",
    "ALONE",
]
titanic_df = titanic_df.drop(dropped_cols)

In [None]:
# Replace FARE with a float-typed version of itself, then preview the result.
fare_as_float = titanic_df["FARE"].astype(T.FloatType())
titanic_df = titanic_df.withColumn("FARE", fare_as_float)

titanic_df.show()

In [None]:
# Columns treated as categorical: imputed and one-hot encoded in the cells below.
cat_cols = ["CLASS", "WHO", "EMBARK_TOWN"]
# Columns treated as numeric.
num_cols = ["SIBSP", "PARCH", "FARE"]

In [None]:
# Fill missing categorical values with each column's most frequent value,
# writing the result back to the same column names.
impute_cat = SimpleImputer(
    strategy="most_frequent",
    input_cols=cat_cols,
    output_cols=cat_cols,
    drop_input_cols=True,
)
impute_cat.fit(titanic_df)

titanic_df = impute_cat.transform(titanic_df)
titanic_df.show()

In [None]:
# One-hot encode the categorical columns. The first category of each column
# is dropped, and categories unseen during fit are ignored at transform time.
OHE = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    input_cols=cat_cols,
    output_cols=cat_cols,
    drop_input_cols=True,
)
OHE.fit(titanic_df)

titanic_df = OHE.transform(titanic_df)
titanic_df.show()

In [None]:
# 80/20 train/test split; a fixed seed is passed so the split can be reproduced.
train_df, test_df = titanic_df.random_split(weights=[0.8, 0.2], seed=8)

In [None]:
# Hyperparameter grid explored by the grid search (5 * 5 * 3 * 5 combinations).
parameters = dict(
    n_estimators=[100 * step for step in range(1, 6)],
    learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 2, 3, 4, 5],
)

In [None]:
# Display the search grid defined above.
parameters

In [None]:
# Scale the current warehouse up before the grid search.
# The [1:-1] slice removes the first and last character of the returned name
# (the surrounding double quotes).
warehouse_name = session.get_current_warehouse()[1:-1]
resize_statement = f"ALTER WAREHOUSE {warehouse_name} SET WAREHOUSE_SIZE=LARGE;"
session.sql(resize_statement).collect()

Data scientists may not have permission to change the warehouse size. In that case they usually have access to a larger warehouse and can switch to it with `session.use_warehouse('bigger_warehouse')`.

In [None]:
# Every column except the label is used as a model input.
feature_cols = train_df.drop("SURVIVED").columns

# Exhaustive search over `parameters`, scored on accuracy, with an XGBoost classifier.
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=parameters,
    scoring="accuracy",
    n_jobs=-1,
    input_cols=feature_cols,
    label_cols="SURVIVED",
    output_cols="PRED_SURVIVED",
)

# Fit on the training split.
grid_search.fit(train_df)

In [None]:
# Scale the current warehouse back down now that training has finished.
# The [1:-1] slice removes the first and last character of the returned name
# (the surrounding double quotes).
warehouse_name = session.get_current_warehouse()[1:-1]
resize_statement = f"ALTER WAREHOUSE {warehouse_name} SET WAREHOUSE_SIZE=XSMALL;"
session.sql(resize_statement).collect()

In [None]:
# Score the held-out test split with the fitted grid search.
result = grid_search.predict(test_df)

In [None]:
# Compare the true label column with the predicted column on the scored test split.
score_columns = {
    "y_true_col_names": "SURVIVED",
    "y_pred_col_names": "PRED_SURVIVED",
}
accuracy = accuracy_score(df=result, **score_columns)

print(f"Accuracy: {accuracy}")

In [None]:
# Tabulate every hyperparameter combination next to its mean cross-validation accuracy.
results = grid_search.to_sklearn().cv_results_
param_sets = results["params"]

data = {"accuracy": results["mean_test_score"]}
for row, param_set in enumerate(param_sets):
    for name, setting in param_set.items():
        # One column per hyperparameter, pre-filled with None for every row.
        data.setdefault(name, [None] * len(param_sets))[row] = setting

# Best-scoring combinations first.
hp_df = pd.DataFrame(data).sort_values(by="accuracy", ascending=False)
hp_df.head()

# Model Registry


In [None]:
# Pull the best estimator found by the grid search and record its tuned hyperparameters.
optimal_model = grid_search.to_sklearn().best_estimator_
optimal_n_estimators = optimal_model.n_estimators
optimal_learning_rate = optimal_model.learning_rate
optimal_max_depth = optimal_model.max_depth
optimal_min_child_weight = optimal_model.min_child_weight

# hp_df is sorted by accuracy (best first) but keeps its original index labels,
# so take the first row by position. Label-based [0] would return the score of
# the first parameter combination in the grid, not the best one.
optimal_accuracy = hp_df["accuracy"].iloc[0]

In [None]:
# Work out the next version name for a model, based on what the registry already holds.

def check_and_update(df, model_name):
    """Return the next version name to use when logging ``model_name``.

    Args:
        df: pandas DataFrame listing registered models, with a ``name`` column
            and a ``versions`` column holding each model's version names,
            either as a string-encoded list (e.g. ``'["V_1", "V_2"]'``) or as
            an actual list.
        model_name: Name of the model as it appears in the ``name`` column.

    Returns:
        ``'V_1'`` when the model is not listed or has no numbered versions yet;
        otherwise the highest-numbered existing version with its numeric
        suffix incremented by one (``'V_10'`` -> ``'V_11'``).
    """
    if df.empty:
        return 'V_1'

    model_rows = df[df['name'] == model_name]
    if model_rows.empty:
        return 'V_1'

    # Read the versions of the requested model, not of whichever model
    # happens to sit in the first row of the listing.
    raw_versions = model_rows['versions'].iloc[0]
    if isinstance(raw_versions, str):
        versions = ast.literal_eval(raw_versions)
    else:
        versions = list(raw_versions)

    # Compare the numeric suffixes as integers; a plain string sort would
    # rank "V_10" below "V_9" and hand back a version name that already exists.
    numbered = []
    for version in versions:
        prefix, sep, num = str(version).rpartition('_')
        if sep and num.isdecimal():
            numbered.append((int(num), prefix))

    if not numbered:
        return 'V_1'

    highest_num, prefix = max(numbered)
    return f"{prefix}_{highest_num + 1}"

In [None]:
# Name under which the model is registered (kept uppercase).
model_name = "TITANIC"

# Open the model registry. No database/schema is passed here, so the session
# context is used; pass them explicitly to target a different location.
reg = Registry(session=session)

# Work out the next version name from the models already registered.
reg_df = reg.show_models()
model_version = check_and_update(reg_df, model_name)

# Sample of the feature columns, handed to the registry as example input.
X = train_df.drop("SURVIVED").limit(100)

# Log the best estimator as a new version of the model.
titanic_model = reg.log_model(
    model=optimal_model,
    model_name=model_name,
    version_name=model_version,
    sample_input_data=X,
)

# Attach the evaluation metric to the logged version.
titanic_model.set_metric(metric_name="accuracy", value=optimal_accuracy)

In [None]:
# List the registered models to confirm the new version was logged.
reg.show_models()

In [None]:
# Store the tuned hyperparameters on the logged version as a second metric.
hyperparameters = dict(
    optimal_n_estimators=optimal_n_estimators,
    optimal_learning_rate=optimal_learning_rate,
    optimal_max_depth=optimal_max_depth,
    optimal_min_child_weight=optimal_min_child_weight,
)

titanic_model.set_metric(value=hyperparameters, metric_name="hyperparameters")

In [None]:
# Widen pandas' column display so long cell values are not truncated.
pd.set_option("display.max_colwidth", 500)

registered_model = reg.get_model(model_name)
registered_model.show_versions()

If the model has multiple versions, we want the deployed UDF to use the version with the highest accuracy.


In [None]:
# Rank the model's versions by the accuracy metric stored in their metadata.
reg_df = reg.get_model(model_name).show_versions()

# A version logged without an "accuracy" metric gets NaN instead of raising
# KeyError, so a single incomplete version cannot block best-model selection.
reg_df["accuracy"] = reg_df["metadata"].apply(
    lambda raw: (json.loads(raw).get("metrics") or {}).get("accuracy", float("nan"))
)

# Highest accuracy first; versions without a score sort to the end.
best_model = reg_df.sort_values(by="accuracy", ascending=False, na_position="last")

In [None]:
# best_model is sorted by accuracy in descending order, so its first row is the
# most accurate version; take that version's name.
deployed_version = best_model['name'].iloc[0]
deployed_version

Set the default version to the deployed version (best model)

In [None]:
# Make the best-scoring version the model's default, then read the default back
# and display its version name to confirm the change.
m = reg.get_model(model_name)
m.default = deployed_version
mv = m.default
mv.version_name

In [None]:
# Call the default version's predict_proba function on the test split and preview the output.
remote_prediction = mv.run(test_df, function_name="predict_proba")
remote_prediction.show()

In [None]:
# Write the test split to table TEST_DATA (replacing any existing table of that
# name) so the model can also be tested from SQL.

test_df.write.mode("overwrite").save_as_table("TEST_DATA")

## Add images to stage for Streamlit App


In [None]:
# Upload every file matching ../streamlit_images/* to the ML_DATA stage.
# NOTE(review): no compression/overwrite options are passed, so the library
# defaults apply — confirm the Streamlit app reads the files in the form they are staged.
session.file.put("../streamlit_images/*", "@ML_DATA")

# Calling model from a new notebook

In [None]:
# Connect to the registry.
reg = Registry(session=session)

# Fetch the model's default version (the most accurate one in our case).
mv = reg.get_model("titanic").default

# Predict class probabilities, then keep only the second probability column,
# renamed to pred_survived.
remote_prediction = mv.run(test_df, function_name="predict_proba")
survival_probability = remote_prediction.drop('"output_feature_0"').with_column_renamed(
    '"output_feature_1"', 'pred_survived'
)
survival_probability.show()

## To delete your model and all of its versions

In [None]:
# Destructive clean-up: removes the registered model named TITANIC. Run only when finished.
reg.delete_model("TITANIC")