Connect to Snowflake with a Snowpark Python session, using connection data from the SnowSQL config file

In [7]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb
import os, configparser

from snowflake.snowpark import Session

# Connection settings come from the SnowSQL config file in the user's home
# directory, so no credentials are hardcoded in the notebook.
config_path = os.path.join(os.path.expanduser('~'), ".snowsql/config")
parser = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail loudly here instead of hitting a less
# obvious NoSectionError on the first parser.get() below.
if not parser.read(config_path):
    raise FileNotFoundError(
        f"SnowSQL config file not found or unreadable: {config_path}")

# Named connection section inside the SnowSQL config file.
section = "connections.test_conn"
# Map the SnowSQL option names onto the keys passed to the Snowpark session builder.
pars = {
    "account": parser.get(section, "accountname"),
    "user": parser.get(section, "username"),
    "password": parser.get(section, "password"),
    "database": parser.get(section, "database"),
    "schema": parser.get(section, "schema")}

session = Session.builder.configs(pars).create()
# Sanity check: show which warehouse/database/schema the session is using.
print(session.sql('select current_warehouse(), current_database(), current_schema()').collect())

[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='TEST', CURRENT_SCHEMA()='PUBLIC')]


Get a 10% sample of the HOUSING table and separate out the MEDIAN_HOUSE_VALUE column

In [8]:
# Pull a 10% sample of the HOUSING table into a local pandas DataFrame.
# No seed is passed to sample(), so the sampled rows may differ between runs.
df = session.table("HOUSING").sample(frac=0.10).to_pandas()

# Feature matrix: every column except the target.
# drop() raises KeyError if MEDIAN_HOUSE_VALUE is missing, whereas a boolean
# column mask would silently keep every column (target included) in that case.
X = df.drop(columns='MEDIAN_HOUSE_VALUE')
X

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,OCEAN_PROXIMITY
0,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
1,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.1200,NEAR BAY
2,-122.27,37.85,52.0,1228.0,293.0,648.0,303.0,2.1202,NEAR BAY
3,-122.27,37.84,52.0,2436.0,541.0,1015.0,478.0,1.7250,NEAR BAY
4,-122.28,37.84,50.0,2082.0,492.0,1131.0,473.0,1.6424,NEAR BAY
...,...,...,...,...,...,...,...,...,...
2095,-121.54,39.12,17.0,4251.0,899.0,3265.0,934.0,2.3496,INLAND
2096,-121.57,39.10,28.0,1442.0,333.0,832.0,286.0,1.8413,INLAND
2097,-121.59,39.10,24.0,1107.0,261.0,768.0,205.0,1.7167,INLAND
2098,-121.55,39.09,31.0,1728.0,365.0,1167.0,384.0,1.4958,INLAND


Perform data preprocessing in a transformation pipeline

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature names; the single categorical column comes first so the numeric
# columns can be selected with COL_NAMES[1:].
COL_NAMES = [
    'OCEAN_PROXIMITY',
    'LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME',
]

# Numeric branch: fill missing values with the column median, then standardize.
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode (categories unseen during fit are ignored at transform time).
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# NOTE(review): COL_NAMES[0:0] is an empty slice, so the "cat" branch is given
# no columns and OCEAN_PROXIMITY is not encoded. COL_NAMES[0:1] would select
# it, but the result would then gain one-hot columns, and the later cell that
# labels the result with COL_NAMES[1:] would need updating as well.
pipeline = ColumnTransformer([
    ("num", numeric_steps, COL_NAMES[1:]),
    ("cat", categorical_steps, COL_NAMES[0:0]),
])

# Despite its name, `model` holds the transformed feature array, not an estimator.
model = pipeline.fit_transform(X)
model

array([[-1.31769216,  1.01181681,  1.79886382, ..., -0.80849339,
        -0.82150127,  1.84873006],
       [-1.32261317,  1.00724771,  1.79886382, ..., -0.19149591,
         0.44729505, -0.39253292],
       [-1.3324552 ,  1.01181681,  1.79886382, ..., -0.66661197,
        -0.48135587, -0.93413265],
       ...,
       [-0.99782643,  1.58295335, -0.39210067, ..., -0.55460033,
        -0.7459134 , -1.15271185],
       [-0.97814238,  1.57838426,  0.15564045, ..., -0.18216161,
        -0.26269097, -1.27237517],
       [-0.92893227,  1.65605883, -1.09633926, ..., -0.29417325,
        -0.25999141, -0.38982438]])

Check changes on numeric features

In [13]:
import pandas as pd

# Wrap the transformed array in a DataFrame for inspection, reusing the row
# index of the sampled frame. The column labels are the numeric feature names
# (COL_NAMES[1:]), the same list given to the "num" branch of the transformer.
numeric_cols = COL_NAMES[1:]
df_out = pd.DataFrame(data=model, columns=numeric_cols, index=df.index)
df_out.head(10)

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME
0,-1.317692,1.011817,1.798864,-0.484724,-0.804421,-0.808493,-0.821501,1.84873
1,-1.322613,1.007248,1.798864,0.246963,0.409874,-0.191496,0.447295,-0.392533
2,-1.332455,1.011817,1.798864,-0.59155,-0.552767,-0.666612,-0.481356,-0.934133
3,-1.332455,1.007248,1.798864,-0.051612,0.053159,-0.324043,-0.008932,-1.148216
4,-1.337376,1.007248,1.642366,-0.209839,-0.06656,-0.215765,-0.02243,-1.192961
5,-1.337376,1.007248,1.564118,-0.284035,-0.176506,-0.465924,-0.278888,-1.038574
6,-1.342297,0.99354,-0.235603,-0.797155,-0.897265,-0.90557,-0.95648,-1.12281
7,-1.337376,0.99354,0.546884,0.162039,0.104468,-0.117755,0.074755,-0.757427
8,-1.337376,0.99354,-0.939842,-0.727428,-0.56254,-0.703016,-0.51915,-1.2701
9,-1.337376,0.99354,1.798864,-0.988458,-1.031643,-1.084789,-1.064462,-1.258832
