Connect to Snowflake and get 10% sample data from HOUSING table

In [8]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
session = Session.builder.configs(SnowflakeLoginOptions("test_conn")).create()

df = session.table("HOUSING").sample(frac=0.10).to_pandas()
df.shape

X = df.loc[:, df.columns != 'MEDIAN_HOUSE_VALUE']
y = df['MEDIAN_HOUSE_VALUE']
X, y

(      LONGITUDE  LATITUDE  HOUSING_MEDIAN_AGE  TOTAL_ROOMS  TOTAL_BEDROOMS  \
 0       -122.24     37.85                52.0       1467.0           190.0   
 1       -122.26     37.85                52.0       2202.0           434.0   
 2       -122.27     37.84                52.0       1688.0           337.0   
 3       -122.27     37.83                51.0       2665.0           574.0   
 4       -122.28     37.82                52.0       1170.0           235.0   
 ...         ...       ...                 ...          ...             ...   
 2036    -121.79     38.67                30.0       2602.0           401.0   
 2037    -121.57     39.16                33.0       2033.0           375.0   
 2038    -121.56     39.10                28.0       2130.0           484.0   
 2039    -121.54     39.08                26.0       2276.0           460.0   
 2040    -121.44     39.00                20.0        755.0           147.0   
 
       POPULATION  HOUSEHOLDS  MEDIAN_INCOME OCEAN

Training function (reused from local test + deployed as @sproc)

In [9]:
def fit_pipeline(X, y, cat_attribs, num_attribs):
    model = Pipeline([
        ('preprocessor', ColumnTransformer([
            ( "num", Pipeline([
                ('imputer', SimpleImputer(strategy="median")),
                ('std_scaler', StandardScaler())]), num_attribs),
            ( "cat", Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))]), cat_attribs)])),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))])
    model.fit(X, y)
    return model

pipe = fit_pipeline(X, y,
       ['OCEAN_PROXIMITY'], ['LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
       'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME'])
print(pipe.predict(X)[0])
pipe

365718.09


Save model in internal named stage

In [10]:
def save_model(session, model, stage_name, stage_path, model_file):
    import io, joblib

    input_stream = io.BytesIO()
    input_stream.name = model_file
    joblib.dump(model, input_stream)

    model_path = f'{stage_name}/{stage_path}/{model_file}'
    session.file.put_stream(input_stream, model_path, overwrite=True)
    return model_path

In [11]:
# Stored Procedure function
def train_model(
    session: Session,           # A stored procedure will receive a session object when executed in snowflake with the authentification done
    training_table: str,        # Table name that has the data to be used for training and test
    target_col: str,            # name of the target column
    save_stage: str) -> dict:   # name of the stage to save the fitted pipline object
    
    from datetime import datetime
    import numpy as np

    now = datetime.now() # Get the date and time when this is strated
    
    # Get the training table and split into a training and test Snowpark DataFrames
    snowdf_train, snowdf_test = session.table(training_table).random_split([0.8, 0.2], seed=82) # use seed to make the split repeatable

    # Get the categorical and numeric column names
    cat_attribs = [c.name for c in snowdf_train.schema.fields
        if (type(c.datatype) == T.StringType) & (c.name != target_col)]
    numeric_types = [T.DecimalType, T.LongType, T.DoubleType, T.FloatType, T.IntegerType]
    num_attribs = [c.name for c in snowdf_train.schema.fields
        if (type(c.datatype) in numeric_types) & (c.name != target_col)]

    # save the train and test sets as time stamped tables in Snowflake 
    #table_suffix = now.strftime("%Y%m%d%H%M%S")
    train_table_name = training_table + '_TRAIN'
    snowdf_train.write.mode("overwrite").save_as_table(train_table_name)
    test_table_name = training_table + '_TEST'
    snowdf_test.write.mode("overwrite").save_as_table(test_table_name)

    pd_train = snowdf_train.to_pandas()
    
    X_train = pd_train.loc[:, pd_train.columns != target_col]
    y_train = pd_train[target_col]
    
    # Fit the model (pipeline)
    full_pipeline = fit_pipeline(X_train, y_train, cat_attribs, num_attribs)

    # save the full pipeline including the model
    # Save the model to stage
    save_path = now.strftime("%Y-%m-%d-%H%M%S")
    object_saved_path = save_model(session, full_pipeline,
        f"@{save_stage}/models", save_path, 'housing_fores_reg.joblib')

    # predict on the test set and return the root mean squared error (RMSE)
    pd_test = snowdf_test.to_pandas()
    
    X_test = pd_test.loc[:, pd_train.columns != target_col]
    y_test = pd_test[target_col]
    
    housing_predictions = full_pipeline.predict(X_test)
    lin_mse = mean_squared_error(y_test, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)

    # Create a dict to return with test metrics and the path to the saved model pipeline
    ret_dict = {
        "MSE": lin_mse,
        "RMSE": lin_rmse,
        "model_path": object_saved_path,
        "train_table": train_table_name,
        "test_table": test_table_name }
    return ret_dict

In [7]:
import json
from snowflake.snowpark import types as T

session.clear_packages()
session.clear_imports()

session.add_packages('snowflake-snowpark-python', 'scikit-learn',
    'pandas', 'numpy', 'joblib', 'cachetools')
train_model_sp = F.sproc(func=train_model, name="train_house_sp" ,
    replace=True, is_permanent=True,
    stage_location="int_stage/sp/", session=session)

return_dict = json.loads(train_model_sp(session, "HOUSING_DATA", "MEDIAN_HOUSE_VALUE", stage_name))
return_dict

session.sql("ls @int_stage").show(max_width=150)



NameError: name 'F' is not defined