 Get 10% sample data from the housing.csv file

In [13]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb

import random
import pandas as pd

df = pd.read_csv(
    "..\..\.spool\datasets\housing.csv",
    skiprows=lambda i: i > 0 and random.random() > 0.10)
df.columns = [c.upper() for c in df.columns]

X = df.loc[:, df.columns != 'MEDIAN_HOUSE_VALUE']
y = df['MEDIAN_HOUSE_VALUE']
X

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,OCEAN_PROXIMITY
0,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,NEAR BAY
1,-122.26,37.85,52.0,2491.0,474.0,1098.0,468.0,3.0750,NEAR BAY
2,-122.28,37.84,50.0,2082.0,492.0,1131.0,473.0,1.6424,NEAR BAY
3,-122.28,37.84,49.0,1916.0,447.0,863.0,378.0,1.9274,NEAR BAY
4,-122.27,37.83,49.0,1215.0,282.0,570.0,264.0,1.4861,NEAR BAY
...,...,...,...,...,...,...,...,...,...
2036,-121.94,38.89,15.0,1462.0,314.0,774.0,271.0,2.5478,INLAND
2037,-122.04,38.68,26.0,1113.0,222.0,689.0,234.0,3.0486,INLAND
2038,-121.57,39.16,18.0,1632.0,367.0,769.0,330.0,3.1029,INLAND
2039,-121.52,39.12,37.0,102.0,17.0,29.0,14.0,4.1250,INLAND


Preprocess data + train w/ RandomForestRegressor

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

COL_NAMES = ['OCEAN_PROXIMITY',
    'LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME']

model = Pipeline([
    ('preprocessor', ColumnTransformer([
        ("num", Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler())]), COL_NAMES[1:]),
        ("cat", Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))]), COL_NAMES[0:0])
    ])),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

model.fit(X, y)
model

Show first prediction + MSE/RMSE

In [16]:
import numpy as np
from sklearn.metrics import mean_squared_error

preds = model.predict(X)
print(f'First Prediction: {preds[0]}')

mse = mean_squared_error(y, preds)
print(f'MSE: {mse}')

rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

First Prediction: 309210.04
MSE: 507381936.15115213
RMSE: 22525.1400917098


Save model locally + predict first 5 from loaded model

In [17]:
import pickle

with open('../../.spool/models/modelRFG.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('../../.spool/models/modelRFG.pkl', 'rb') as f:
    model = pickle.load(f)
    model.predict(X[0:5])

array([309210.04, 238740.02, 108492.  , 117678.  ,  99358.  ])