In [None]:
#Snowpark
from snowflake.snowpark import Session
from snowflake.snowpark import types as T
from snowflake.snowpark.functions import col, count, lit

# SnowflakeML
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.ml.modeling.preprocessing import OneHotEncoder, StandardScaler
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Resolve the connection parameters first, then reuse an existing Snowpark
# session or open a new one with them.
connection_params = SnowflakeLoginOptions()
session = Session.builder.configs(connection_params).getOrCreate()

In [None]:
# Reference the 'titanic' table through the session as a Snowpark DataFrame.
titanic_df = session.table('titanic')

In [None]:
# Preview the first rows of the raw table.
titanic_df.show()

In [None]:
# Columns with null values and their respective counts.
#
# All counts are computed in ONE aggregate query instead of one COUNT query per
# column: COUNT(<column>) skips NULLs while COUNT(1) counts every row, so their
# difference is the number of NULLs in that column.
null_count_row = titanic_df.select(
    [
        (count(lit(1)) - count(col(col_name))).alias(col_name)
        for col_name in titanic_df.columns
    ]
).collect()[0]

# Same shape as before: a list of (column_name, null_count) tuples.
null_counts = list(zip(titanic_df.columns, null_count_row))
null_counts

In [None]:
# Remove the columns that are not used for modelling.
# NOTE(review): presumably AGE/DECK are dropped for their nulls and ALIVE because
# it mirrors the SURVIVED label -- confirm against the null counts / table schema.
titanic_df = titanic_df.drop('AGE', 'DECK', 'ALIVE')

In [None]:
# Re-type FARE as a float column, replacing the existing FARE column in place.
fare_as_float = titanic_df["FARE"].cast(T.FloatType())
titanic_df = titanic_df.with_column("FARE", fare_as_float)

titanic_df.show()

In [None]:
# Categorical feature columns: imputed and then one-hot encoded further down.
cat_cols = [
    "SEX",
    "EMBARKED",
    "CLASS",
    "WHO",
    "EMBARK_TOWN",
]

# Numeric feature columns (not referenced by the other cells visible here).
num_cols = [
    "PCLASS",
    "SIBSP",
    "PARCH",
    "FARE",
]

In [None]:
# Fill missing categorical values with each column's most frequent value,
# writing the result back to the same column names.
# NOTE(review): the imputer is fitted on the full table, i.e. before the
# train/test split performed later in the notebook -- confirm this is intended.
impute_cat = SimpleImputer(
    input_cols=cat_cols, output_cols=cat_cols, strategy="most_frequent", drop_input_cols=True
)

impute_cat.fit(titanic_df)
titanic_df = impute_cat.transform(titanic_df)
titanic_df.show()

In [None]:
# One-hot encode the categorical columns: the first category of each column is
# dropped, unknown categories are ignored, and the input columns are dropped
# from the output.
# NOTE(review): the encoder is fitted on the full table, i.e. before the
# train/test split performed later in the notebook -- confirm this is intended.
OHE = OneHotEncoder(
    input_cols=cat_cols, output_cols=cat_cols, drop_input_cols=True, drop="first", handle_unknown="ignore"
)

OHE.fit(titanic_df)
titanic_df = OHE.transform(titanic_df)
titanic_df.show()

In [None]:
# 80/20 train/test split; the fixed seed keeps the split repeatable.
split_weights = [0.8, 0.2]
train_df, test_df = titanic_df.random_split(weights=split_weights, seed=8)

In [None]:
# Every column except the SURVIVED label is used as a model feature.
feature_cols = train_df.drop('SURVIVED').columns

xgb = XGBClassifier(
    input_cols=feature_cols,
    label_cols='SURVIVED',
    output_cols='PRED_SURVIVED',
)

# Train
xgb.fit(train_df)

In [None]:
# Score the held-out test split; predictions go to the classifier's configured
# output column (PRED_SURVIVED).
result = xgb.predict(test_df)

In [None]:
# All three metrics compare the same label / prediction columns of `result`,
# so the shared keyword arguments are declared once.
score_args = {
    "df": result,
    "y_true_col_names": "SURVIVED",
    "y_pred_col_names": "PRED_SURVIVED",
}

accuracy = accuracy_score(**score_args)
precision = precision_score(**score_args)
recall = recall_score(**score_args)

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}")