In [1]:
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.metrics import accuracy_score, precision_score, recall_score
from snowflake.ml.modeling.preprocessing import OneHotEncoder
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session, types as T
from snowflake.snowpark.functions import col
import pandas as pd

import warnings

# Suppress every UserWarning for the rest of the notebook so cell outputs stay readable.
# NOTE(review): this is process-wide and also hides UserWarnings from other libraries.
warnings.simplefilter(action="ignore", category=UserWarning)

In [2]:
# Build the connection settings first, then open (or reuse) a Snowpark session with them.
# No credentials are hard-coded here; they come from whatever SnowflakeLoginOptions() resolves.
login_options = SnowflakeLoginOptions()
session = Session.builder.configs(login_options).getOrCreate()

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


In [3]:
# Point `titanic_df` at the "titanic" table reachable from the current session.
# Every later cell rebinds this same name as preprocessing steps are applied.
titanic_df = session.table("titanic")

In [4]:
# Preview the raw table: label column SURVIVED plus passenger attributes.
titanic_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SURVIVED"  |"PCLASS"  |"SEX"   |"AGE"  |"SIBSP"  |"PARCH"  |"FARE"   |"EMBARKED"  |"CLASS"  |"WHO"  |"ADULT_MALE"  |"DECK"  |"EMBARK_TOWN"  |"ALIVE"  |"ALONE"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------
|0           |3         |male    |22.00  |1        |0        |7.2500   |S           |Third    |man    |True          |NULL    |Southampton    |False    |False    |
|1           |1         |female  |38.00  |1        |0        |71.2833  |C           |First    |woman  |False         |C       |Cherbourg      |True     |False    |
|1           |3         |female  |26.00  |0        |0        |7.9250   |S           |Third    |woman  |False         |NULL    |Southampton    |True     |True     |
|1           |1 

In [5]:
# Columns with null values and their respective counts.
#
# COUNT(<column>) skips NULLs, so "total rows - COUNT(column)" is the number
# of NULLs in that column. Aggregating every column in a single `agg` call
# needs two queries in total instead of one filtered COUNT query per column.
total_rows = titanic_df.count()
non_null_counts = titanic_df.agg(
    [(col_name, "count") for col_name in titanic_df.columns]
).collect()[0]

# Same shape as before: a list of (column name, null count) tuples in table order.
null_counts = [
    (col_name, total_rows - non_null)
    for col_name, non_null in zip(titanic_df.columns, non_null_counts)
]
null_counts

[('SURVIVED', 0),
 ('PCLASS', 0),
 ('SEX', 0),
 ('AGE', 177),
 ('SIBSP', 0),
 ('PARCH', 0),
 ('FARE', 0),
 ('EMBARKED', 2),
 ('CLASS', 0),
 ('WHO', 0),
 ('ADULT_MALE', 0),
 ('DECK', 688),
 ('EMBARK_TOWN', 2),
 ('ALIVE', 0),
 ('ALONE', 0)]

In [6]:
# Remove columns that will not be used as features:
#   AGE, DECK - 177 and 688 missing values respectively in the null-count output above.
#   ALIVE     - presumably a textual duplicate of the SURVIVED label (would leak the target);
#               TODO confirm against the data dictionary.
titanic_df = titanic_df.drop("AGE", "DECK", "ALIVE")

In [7]:
# Replace FARE with a FloatType version of itself.
# As the output below shows, the replaced column ends up last in the column order.
fare_as_float = titanic_df.col("FARE").cast(T.FloatType())
titanic_df = titanic_df.with_column("FARE", fare_as_float)

titanic_df.show()

----------------------------------------------------------------------------------------------------------------------------------------
|"SURVIVED"  |"PCLASS"  |"SEX"   |"SIBSP"  |"PARCH"  |"EMBARKED"  |"CLASS"  |"WHO"  |"ADULT_MALE"  |"EMBARK_TOWN"  |"ALONE"  |"FARE"   |
----------------------------------------------------------------------------------------------------------------------------------------
|0           |3         |male    |1        |0        |S           |Third    |man    |True          |Southampton    |False    |7.25     |
|1           |1         |female  |1        |0        |C           |First    |woman  |False         |Cherbourg      |False    |71.2833  |
|1           |3         |female  |0        |0        |S           |Third    |woman  |False         |Southampton    |True     |7.925    |
|1           |1         |female  |1        |0        |S           |First    |woman  |False         |Southampton    |False    |53.1     |
|0           |3         |male    |0      

In [8]:
# String-valued columns: imputed and one-hot encoded by the cells that follow.
cat_cols = [
    "SEX",
    "EMBARKED",
    "CLASS",
    "WHO",
    "EMBARK_TOWN",
]

# Numeric columns. NOTE(review): not referenced by any later cell visible in this notebook.
num_cols = [
    "PCLASS",
    "SIBSP",
    "PARCH",
    "FARE",
]

In [9]:
# Fill missing categorical values with each column's most frequent value.
# Output columns reuse the input names, so the imputed values replace the originals.
# NOTE(review): the imputer is fit on the full table, before the train/test split
# performed in a later cell, so held-out rows contribute to the fitted statistics.
impute_cat = SimpleImputer(
    strategy="most_frequent",
    input_cols=cat_cols,
    output_cols=cat_cols,
    drop_input_cols=True,
)

impute_cat.fit(titanic_df)
titanic_df = impute_cat.transform(titanic_df)
titanic_df.show()

----------------------------------------------------------------------------------------------------------------------------------------
|"SEX"   |"EMBARKED"  |"CLASS"  |"WHO"  |"EMBARK_TOWN"  |"SURVIVED"  |"PCLASS"  |"SIBSP"  |"PARCH"  |"ADULT_MALE"  |"ALONE"  |"FARE"   |
----------------------------------------------------------------------------------------------------------------------------------------
|male    |S           |Third    |man    |Southampton    |0           |3         |1        |0        |True          |False    |7.25     |
|female  |C           |First    |woman  |Cherbourg      |1           |1         |1        |0        |False         |False    |71.2833  |
|female  |S           |Third    |woman  |Southampton    |1           |3         |0        |0        |False         |True     |7.925    |
|female  |S           |First    |woman  |Southampton    |1           |1         |1        |0        |False         |False    |53.1     |
|male    |S           |Third    |man    |

In [10]:
# One-hot encode the categorical columns. With drop="first" the first category of
# each column is omitted, which is why the output below has e.g. SEX_male but no
# SEX_female column.
# NOTE(review): like the imputer, the encoder is fit on the full table before the
# train/test split performed in a later cell.
OHE = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    input_cols=cat_cols,
    output_cols=cat_cols,
    drop_input_cols=True,
)

OHE.fit(titanic_df)
titanic_df = OHE.transform(titanic_df)
titanic_df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SEX_male"  |"EMBARKED_Q"  |"EMBARKED_S"  |"CLASS_Second"  |"CLASS_Third"  |"WHO_man"  |"WHO_woman"  |"EMBARK_TOWN_Queenstown"  |"EMBARK_TOWN_Southampton"  |"SURVIVED"  |"PCLASS"  |"SIBSP"  |"PARCH"  |"ADULT_MALE"  |"ALONE"  |"FARE"   |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|1.0         |0.0           |1.0           |0.0             |1.0            |1.0        |0.0          |0.0                       |1.0                        |0           |3         |1        |0        |True          |False    |7.25     |
|0.0         |0.0           |0.0           |0.0 

In [11]:
# Randomly split the preprocessed table with weights 0.8 / 0.2 into train and test sets.
# The seed is fixed (8) - presumably so the split is repeatable across runs.
train_df, test_df = titanic_df.random_split(weights=[0.8, 0.2], seed=8)

In [12]:
# Hyperparameter grid for the search below: 5 values x 5 values = 25 candidates.
parameters = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
)

In [13]:
# Resize the COMPUTE_WH warehouse to LARGE ahead of the grid search; a later cell
# resizes it back to XSMALL once training has finished.
# NOTE(review): the warehouse name is hard-coded - presumably it is the warehouse this
# session runs on; confirm. If the training cell fails, the scale-down cell must still
# be run manually.
session.sql("ALTER WAREHOUSE COMPUTE_WH SET WAREHOUSE_SIZE=LARGE;").collect()

[Row(status='Statement executed successfully.')]

In [14]:
# Grid search over `parameters` for an XGBoost classifier, scored by accuracy.
# Features are every column of the training split except the SURVIVED label;
# predictions are written to PRED_SURVIVED.
feature_cols = train_df.drop("SURVIVED").columns

grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=parameters,
    scoring="accuracy",
    n_jobs=-1,
    input_cols=feature_cols,
    label_cols="SURVIVED",
    output_cols="PRED_SURVIVED",
)

# Train
grid_search.fit(train_df)

<snowflake.ml.modeling.model_selection.grid_search_cv.GridSearchCV at 0x7f91886032b0>

In [15]:
# Training is done: resize the COMPUTE_WH warehouse back down to XSMALL
# (it was resized to LARGE just before the grid search).
session.sql("ALTER WAREHOUSE COMPUTE_WH SET WAREHOUSE_SIZE=XSMALL;").collect()

[Row(status='Statement executed successfully.')]

In [16]:
# Predict on the held-out split with the fitted grid search. The predictions land in
# the PRED_SURVIVED column configured via `output_cols` when the search was built.
result = grid_search.predict(test_df)

In [17]:
# Compare the true label (SURVIVED) with the prediction (PRED_SURVIVED) on the test split.
accuracy = accuracy_score(
    df=result,
    y_true_col_names="SURVIVED",
    y_pred_col_names="PRED_SURVIVED",
)

print(f"Accuracy: {accuracy}")

Accuracy: 0.829787


In [18]:
# Print each combination of hyperparameters with their accuracy
results = grid_search.to_sklearn().cv_results_
data = {'Accuracy': results['mean_test_score']}
for i, param in enumerate(results['params']):
    for key, value in param.items():
        if key not in data:
            data[key] = [None] * len(results['params'])
        data[key][i] = value

# Create DataFrame
hp_df = pd.DataFrame(data)
hp_df.head()

Unnamed: 0,Accuracy,learning_rate,n_estimators
0,0.809341,0.1,100
1,0.812209,0.1,200
2,0.810811,0.1,300
3,0.807964,0.1,400
4,0.807964,0.1,500


# Model Registry