Connect to Snowflake and load a 10% sample of the HOUSING table

In [1]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
# Open a Snowpark session from the "test_conn" login options.
# NOTE: SnowflakeLoginOptions emits a private-preview warning when called; do not
# rely on it for production connections.
session = Session.builder.configs(SnowflakeLoginOptions("test_conn")).create()

# Pull a 10% row sample of HOUSING into a local pandas DataFrame.
# The sample is seeded so that re-running the notebook against unchanged table
# data returns the same rows (and therefore the same shape and downstream
# predictions) instead of a different random subset on every run.
# NOTE(review): Snowflake accepts a sampling seed only on base tables, not on
# views or subqueries; confirm HOUSING is a table.
SAMPLE_SEED = 42
df = session.table("HOUSING").sample(frac=0.10, seed=SAMPLE_SEED).to_pandas()
df.shape

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


(2074, 10)

Training function (reused for the local test and deployed as a stored procedure via @sproc)

In [2]:
def fit_pipeline(X, y, cat_attribs, num_attribs):
    """Fit a preprocessing + random-forest regression pipeline on (X, y).

    Args:
        X: Feature table containing the columns listed in ``num_attribs`` and
            ``cat_attribs`` (the local test passes a pandas DataFrame).
        y: Regression target passed to ``fit`` together with ``X``.
        cat_attribs: Column names sent through the categorical branch
            (most-frequent imputation, then one-hot encoding).
        num_attribs: Column names sent through the numeric branch
            (median imputation, then standard scaling).

    Returns:
        The fitted ``Pipeline`` whose steps are ``'preprocessor'`` and ``'model'``.
    """
    # Numeric branch: fill gaps with the column median, then standardize.
    numeric_branch = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

    # Categorical branch: fill gaps with the most frequent value, then one-hot
    # encode; categories unseen during fit are ignored at predict time.
    categorical_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Route each column group to its branch.
    preprocessor = ColumnTransformer([
        ("num", numeric_branch, num_attribs),
        ("cat", categorical_branch, cat_attribs),
    ])

    # Fixed random_state keeps the forest deterministic for a given input.
    forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

    # Full pipeline: preprocessing followed by the regressor.
    full_model = Pipeline([
        ('preprocessor', preprocessor),
        ('model', forest),
    ])
    full_model.fit(X, y)
    return full_model

Test the function and get the first prediction

In [11]:
# Column roles used for the local smoke test.
target_col = 'MEDIAN_HOUSE_VALUE'
cat_attribs = ['OCEAN_PROXIMITY']
num_attribs = [
    'LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME',
]

# Every column except the target is offered as a feature; the target is the label.
X = df.drop(columns=[target_col])
y = df[target_col]

# Fit locally, print the prediction for the first training row, and display the
# fitted pipeline as the cell output.
model = fit_pipeline(X, y, cat_attribs, num_attribs)
print(model.predict(X)[0])
model

145205.0
