Connect to Snowflake

In [1]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb
import os, configparser
parser = configparser.ConfigParser()
parser.read(os.path.join(os.path.expanduser('~'), ".snowsql/config"))
section = "connections.test_conn"
pars = {
    "account": parser.get(section, "accountname"),
    "user": parser.get(section, "username"),
    "password": parser.get(section, "password"),
    "database": parser.get(section, "database"),
    "schema": parser.get(section, "schema")}

from snowflake.snowpark import Session
session = Session.builder.configs(pars).create()
print(session.sql('select current_warehouse(), current_database(), current_schema()').collect())

[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='TEST', CURRENT_SCHEMA()='PUBLIC')]


 Get 10% sample data from HOUSING table

In [2]:
df = session.table("HOUSING").sample(frac=0.10).to_pandas()
df.shape

X = df.loc[:, df.columns != 'MEDIAN_HOUSE_VALUE']
y = df['MEDIAN_HOUSE_VALUE']
X

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,OCEAN_PROXIMITY
0,-122.27,37.84,48.0,1922.0,409.0,1026.0,335.0,1.7969,NEAR BAY
1,-122.27,37.83,49.0,1215.0,282.0,570.0,264.0,1.4861,NEAR BAY
2,-122.26,37.82,40.0,624.0,195.0,423.0,160.0,0.9506,NEAR BAY
3,-122.27,37.82,41.0,3221.0,853.0,1959.0,720.0,1.1108,NEAR BAY
4,-122.30,37.81,52.0,1224.0,237.0,521.0,159.0,1.1910,NEAR BAY
...,...,...,...,...,...,...,...,...,...
2012,-122.03,38.69,23.0,1796.0,380.0,939.0,330.0,2.7955,INLAND
2013,-121.57,39.16,21.0,1872.0,302.0,870.0,301.0,3.7250,INLAND
2014,-121.57,39.16,33.0,2033.0,375.0,914.0,330.0,2.6964,INLAND
2015,-121.57,39.13,30.0,442.0,103.0,413.0,88.0,1.5694,INLAND


Preprocess data + train w/ RandomForestRegressor

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

COL_NAMES = ['OCEAN_PROXIMITY',
    'LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME']

# data preprocessing pipeline
pipeline = ColumnTransformer([(
    "num",
    Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler())]),
    COL_NAMES[1:]
), (
    "cat",
    Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))]),
    COL_NAMES[0:0]
)])

# model training full pipeline
from sklearn.ensemble import RandomForestRegressor
model = Pipeline([
    ('preprocessor', pipeline),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))])

model.fit(X, y)
model

Show first prediction + MSE/RMSE

In [4]:
preds = model.predict(X)
print(f'First Prediction: {preds[0]}')

import numpy as np
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y, preds)
print(f'MSE: {mse}')
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

First Prediction: 115031.0
MSE: 515929381.1651715
RMSE: 22714.078919585787


Save model locally + predict first 5 from loaded model

In [5]:
import pickle
with open('models/model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('models/model.pkl', 'rb') as f:
    model2 = pickle.load(f)
model2.predict(X[0:5])

array([115031.  , 106422.  , 145515.  , 137617.04,  87628.  ])