In [84]:
import warnings
import pandas as pd
import os
import pickle
import yaml
import numpy as np
from snowflake.ml.modeling.preprocessing import OneHotEncoder, StandardScaler
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.compose import ColumnTransformer
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions


In [114]:
session = Session.builder.configs(SnowflakeLoginOptions()).getOrCreate()

In [139]:
titanic_df = pd.read_csv('../data/raw/titanic.csv')
titanic_df['FARE'] = titanic_df['FARE'].astype(float)
titanic_df.drop(['ALIVE', 'DECK'], axis=1, inplace=True)
titanic_df.dropna(subset="EMBARKED",inplace=True)
cat_cols:list = titanic_df.select_dtypes(include=['object', 'bool']).columns
num_cols = titanic_df.drop('SURVIVED', axis=1).select_dtypes(include=['int64', 'float64']).columns
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SURVIVED     889 non-null    int64  
 1   PCLASS       889 non-null    int64  
 2   SEX          889 non-null    object 
 3   AGE          712 non-null    float64
 4   SIBSP        889 non-null    int64  
 5   PARCH        889 non-null    int64  
 6   FARE         889 non-null    float64
 7   EMBARKED     889 non-null    object 
 8   CLASS        889 non-null    object 
 9   WHO          889 non-null    object 
 10  ADULT_MALE   889 non-null    bool   
 11  EMBARK_TOWN  889 non-null    object 
 12  ALONE        889 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(5)
memory usage: 85.1+ KB


In [144]:
num_cols

Index(['PCLASS', 'AGE', 'SIBSP', 'PARCH', 'FARE'], dtype='object')

In [147]:
transformers = [
    (
        'categorical_imputer',
        SimpleImputer( 
            strategy='most_frequent',
            output_cols=cat_cols
        ),
        cat_cols
    ),
    (
        "OneHotEncoder",
        OneHotEncoder(
            drop="first",
            handle_unknown="ignore",
            output_cols=cat_cols
        ),
        cat_cols
    ),
    (
         "numeric_imputer",
            SimpleImputer(
                strategy='mean',
                output_cols=num_cols
        ),
        num_cols
    ),
    (
         "StandardScaler",
             StandardScaler(
                output_cols=num_cols
        ),
        num_cols
    )
]
    
preprocessor = ColumnTransformer(transformers=transformers, remainder="passthrough", output_cols=[*cat_cols, *num_cols])

In [148]:
preprocessor.fit(titanic_df)
transformed_df = preprocessor.transform(titanic_df)

In [149]:
transformed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 42 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SURVIVED     889 non-null    int64  
 1   PCLASS       889 non-null    int64  
 2   SEX          889 non-null    object 
 3   AGE          712 non-null    float64
 4   SIBSP        889 non-null    int64  
 5   PARCH        889 non-null    int64  
 6   FARE         889 non-null    float64
 7   EMBARKED     889 non-null    object 
 8   CLASS        889 non-null    object 
 9   WHO          889 non-null    object 
 10  ADULT_MALE   889 non-null    bool   
 11  EMBARK_TOWN  889 non-null    object 
 12  ALONE        889 non-null    bool   
 13  SEX_0        889 non-null    object 
 14  SEX_1        889 non-null    object 
 15  SEX_2        889 non-null    object 
 16  SEX_3        889 non-null    object 
 17  SEX_4        889 non-null    object 
 18  SEX_5        889 non-null    object 
 19  SEX_6        

In [131]:
preprocessor.get_input_cols()

['SURVIVED',
 'PCLASS',
 'SEX',
 'AGE',
 'SIBSP',
 'PARCH',
 'FARE',
 'EMBARKED',
 'CLASS',
 'WHO',
 'ADULT_MALE',
 'EMBARK_TOWN',
 'ALONE']

In [109]:
transformed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SURVIVED     889 non-null    int64  
 1   PCLASS       889 non-null    int64  
 2   SEX          889 non-null    object 
 3   AGE          712 non-null    float64
 4   SIBSP        889 non-null    int64  
 5   PARCH        889 non-null    int64  
 6   FARE         889 non-null    float64
 7   EMBARKED     889 non-null    object 
 8   CLASS        889 non-null    object 
 9   WHO          889 non-null    object 
 10  ADULT_MALE   889 non-null    bool   
 11  EMBARK_TOWN  889 non-null    object 
 12  ALONE        889 non-null    bool   
 13  AGE_0        889 non-null    float64
 14  AGE_1        889 non-null    float64
 15  AGE_2        889 non-null    float64
 16  AGE_3        889 non-null    float64
 17  AGE_4        889 non-null    float64
 18  AGE_5        889 non-null    float64
 19  AGE_6        

In [8]:
metrics = {
        "Accuracy": accuracy_score(
            df=result_df,
            y_true_col_names="SURVIVED",
            y_pred_col_names="OUTPUT_SURVIVED",
        ),
        "Precision": precision_score(
            df=result_df,
            y_true_col_names="SURVIVED",
            y_pred_col_names="OUTPUT_SURVIVED",
        ),
        "Recall": recall_score(
            df=result_df,
            y_true_col_names="SURVIVED",
            y_pred_col_names="OUTPUT_SURVIVED",
        ),
        "F1 Score": f1_score(
            df=result_df,
            y_true_col_names="SURVIVED",
            y_pred_col_names="OUTPUT_SURVIVED",
        ),
        "Confusion Matrix": confusion_matrix(
            df=result_df, y_true_col_name="SURVIVED", y_pred_col_name="OUTPUT_SURVIVED"
        ).tolist(),
    }

DataFrame.flatten() is deprecated since 0.7.0. Use `DataFrame.join_table_function()` instead.


In [13]:
metrics

{'Accuracy': 0.887931,
 'Precision': 0.8961038961038961,
 'Recall': 0.7931034482758621,
 'F1 Score': 0.8414634146341463,
 'Confusion Matrix': [[411.0, 24.0], [54.0, 207.0]]}

In [14]:
X = train_df.drop("SURVIVED").limit(100)

reg = Registry(session=session)

titanic_model = reg.log_model(
    model_name="TITANIC_PIPE",
    version_name=get_next_version(reg, "TITANIC_PIPE"),
    model=pipeline,
    metrics=metrics,
)

  return next(self.gen)


In [15]:
m = reg.get_model("TITANIC_PIPE")
m.default_version = get_version_with_highest_accuracy(reg, "TITANIC_PIPE")

## Call pipeline from SQL

Show the data is not cleaned before performming inference

In [17]:
test_df = session.table('TEST')
test_df.show()

-------------------------------------------------------------------------------------------------------------
|"SURVIVED"  |"PCLASS"  |"SIBSP"  |"PARCH"  |"FARE"    |"ALONE"  |"SEX"   |"CLASS"  |"WHO"  |"EMBARK_TOWN"  |
-------------------------------------------------------------------------------------------------------------
|1           |3         |0        |0        |7.9250    |True     |FEMALE  |THIRD    |WOMAN  |SOUTHAMPTON    |
|0           |3         |0        |0        |8.4583    |True     |MALE    |THIRD    |MAN    |QUEENSTOWN     |
|0           |1         |0        |0        |51.8625   |True     |MALE    |FIRST    |MAN    |SOUTHAMPTON    |
|0           |3         |0        |0        |8.0500    |True     |MALE    |THIRD    |MAN    |SOUTHAMPTON    |
|1           |3         |0        |0        |8.0292    |True     |FEMALE  |THIRD    |CHILD  |QUEENSTOWN     |
|0           |3         |0        |0        |7.8958    |True     |MALE    |THIRD    |MAN    |SOUTHAMPTON    |
|1        

Run the pipeline

In [20]:
#Copy this code in a snowflake worksheet or run via session.sql
inference_df = session.sql('''
select *, TITANIC_PIPE!predict_proba(*):PREDICT_PROBA_1
as surv_pred
from (
select * exclude survived
from test)
            ''')
inference_df.show()

-----------------------------------------------------------------------------------------------------------------------
|"PCLASS"  |"SIBSP"  |"PARCH"  |"FARE"    |"ALONE"  |"SEX"   |"CLASS"  |"WHO"  |"EMBARK_TOWN"  |"SURV_PRED"           |
-----------------------------------------------------------------------------------------------------------------------
|3         |0        |0        |7.9250    |True     |FEMALE  |THIRD    |WOMAN  |SOUTHAMPTON    |0.5756063461303711    |
|3         |0        |0        |8.4583    |True     |MALE    |THIRD    |MAN    |QUEENSTOWN     |0.06476970762014389   |
|1         |0        |0        |51.8625   |True     |MALE    |FIRST    |MAN    |SOUTHAMPTON    |0.07613715529441833   |
|3         |0        |0        |8.0500    |True     |MALE    |THIRD    |MAN    |SOUTHAMPTON    |0.1316869705915451    |
|3         |0        |0        |8.0292    |True     |FEMALE  |THIRD    |CHILD  |QUEENSTOWN     |0.5572702884674072    |
|3         |0        |0        |7.8958  

In [22]:
#Copy this code in a snowflake worksheet or run via session.sql
inference_df = session.sql('''
select *, TITANIC_PIPE!predict(*):OUTPUT_SURVIVED
as surv_pred
from (
select * exclude survived
from test)
            ''')
inference_df.show()

--------------------------------------------------------------------------------------------------------------
|"PCLASS"  |"SIBSP"  |"PARCH"  |"FARE"    |"ALONE"  |"SEX"   |"CLASS"  |"WHO"  |"EMBARK_TOWN"  |"SURV_PRED"  |
--------------------------------------------------------------------------------------------------------------
|3         |0        |0        |7.9250    |True     |FEMALE  |THIRD    |WOMAN  |SOUTHAMPTON    |1            |
|3         |0        |0        |8.4583    |True     |MALE    |THIRD    |MAN    |QUEENSTOWN     |0            |
|1         |0        |0        |51.8625   |True     |MALE    |FIRST    |MAN    |SOUTHAMPTON    |0            |
|3         |0        |0        |8.0500    |True     |MALE    |THIRD    |MAN    |SOUTHAMPTON    |0            |
|3         |0        |0        |8.0292    |True     |FEMALE  |THIRD    |CHILD  |QUEENSTOWN     |1            |
|3         |0        |0        |7.8958    |True     |MALE    |THIRD    |MAN    |SOUTHAMPTON    |0            |
|