 Get 10% sample data from the housing.csv file

In [2]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb

import random
import pandas as pd

df = pd.read_csv("..\..\.spool\housing.csv",
    skiprows=lambda i: i > 0 and random.random() > 0.10)
df.columns = [c.upper() for c in df.columns]

X = df.loc[:, df.columns != 'MEDIAN_HOUSE_VALUE']
y = df['MEDIAN_HOUSE_VALUE']
X, y

(      LONGITUDE  LATITUDE  HOUSING_MEDIAN_AGE  TOTAL_ROOMS  TOTAL_BEDROOMS  \
 0       -122.25     37.85                52.0        919.0           213.0   
 1       -122.25     37.84                52.0       3104.0           687.0   
 2       -122.26     37.85                52.0       3503.0           752.0   
 3       -122.27     37.85                40.0        751.0           184.0   
 4       -122.26     37.83                52.0       1665.0           419.0   
 ...         ...       ...                 ...          ...             ...   
 2081    -121.59     39.15                48.0       1783.0           399.0   
 2082    -121.58     39.15                38.0       1756.0           396.0   
 2083    -121.57     39.12                30.0       2601.0           534.0   
 2084    -121.56     39.10                28.0       2130.0           484.0   
 2085    -121.56     39.08                26.0       1377.0           289.0   
 
       POPULATION  HOUSEHOLDS  MEDIAN_INCOME OCEAN

Preprocess data + train w/ RandomForestRegressor

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

COL_NAMES = ['OCEAN_PROXIMITY',
    'LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME']

model = Pipeline([
    ('preprocessor', ColumnTransformer([
        ("num", Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler())]), COL_NAMES[1:]),
        ("cat", Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))]), COL_NAMES[0:0])
    ])),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

model.fit(X, y)
model

Show first prediction + MSE/RMSE

In [4]:
import numpy as np
from sklearn.metrics import mean_squared_error

preds = model.predict(X)
print(f'First Prediction: {preds[0]}')

mse = mean_squared_error(y, preds)
print(f'MSE: {mse}')

rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

First Prediction: 255593.02
MSE: 493790286.66802645
RMSE: 22221.392545653533


Save model locally + predict first 5 from loaded model

In [6]:
import pickle

with open('../../.spool/modelRFG.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('../../.spool/modelRFG.pkl', 'rb') as f:
    model = pickle.load(f)
    print(model.predict(X[0:5]))

[255593.02 257314.01 258017.   137527.   148032.  ]
