Connect to Snowflake w/ the Python Connector and SNOWSQL config file data

In [1]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb
import os, configparser
parser = configparser.ConfigParser()
parser.read(os.path.join(os.path.expanduser('~'), ".snowsql/config"))
section = "connections.test_conn"
pars = {
    "account": parser.get(section, "accountname"),
    "user": parser.get(section, "username"),
    "password": parser.get(section, "password"),
    "database": parser.get(section, "database"),
    "schema": parser.get(section, "schema")}

from snowflake.snowpark import Session
session = Session.builder.configs(pars).create()
print(session.sql('select current_warehouse(), current_database(), current_schema()').collect())

[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='TEST', CURRENT_SCHEMA()='PUBLIC')]


Get 10% sample data from the HOUSING table, and separate MEDIAN_HOWSE_VALUE

In [36]:
df = session.table("HOUSING").sample(frac=0.10).to_pandas()
df.shape

X = df.loc[:, df.columns != 'MEDIAN_HOUSE_VALUE']
X

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,OCEAN_PROXIMITY
0,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,NEAR BAY
1,-122.28,37.84,52.0,2153.0,481.0,1168.0,441.0,1.9615,NEAR BAY
2,-122.29,37.81,50.0,760.0,190.0,377.0,122.0,0.9011,NEAR BAY
3,-122.29,37.81,46.0,935.0,297.0,582.0,277.0,0.7286,NEAR BAY
4,-122.29,37.81,46.0,12.0,4.0,18.0,7.0,0.4999,NEAR BAY
...,...,...,...,...,...,...,...,...,...
2035,-121.76,38.68,38.0,674.0,178.0,701.0,189.0,1.3942,INLAND
2036,-121.78,38.66,18.0,4224.0,632.0,1907.0,641.0,4.8226,INLAND
2037,-121.58,39.16,36.0,1206.0,197.0,537.0,204.0,3.3611,INLAND
2038,-121.48,39.10,19.0,2043.0,421.0,1018.0,390.0,2.5952,INLAND


Perform data preprocessing in a transformation pipeline

In [37]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

COL_NAMES = ['LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME']

imputer_num = SimpleImputer(strategy="median")
X_imputer_num = imputer_num.fit_transform(X[COL_NAMES])
X_imputer_num.shape

(2040, 8)

In [38]:
scaler_num = StandardScaler()
X_scaler_num = scaler_num.fit_transform(X_imputer_num)
X_scaler_num.shape
X_scaler_num

array([[-1.33422451,  1.05178717,  1.82765258, ..., -0.93925166,
        -0.82368334,  0.13942165],
       [-1.34899017,  1.04712781,  1.82765258, ..., -0.22762737,
        -0.145585  , -0.98853856],
       [-1.35391206,  1.03314972,  1.67002305, ..., -0.97318341,
        -1.01781633, -1.56488367],
       ...,
       [-1.00445811,  1.66216368,  0.56661634, ..., -0.82237561,
        -0.7936064 , -0.22783258],
       [-0.95523925,  1.6342075 , -0.77323466, ..., -0.36900968,
        -0.28503264, -0.64411203],
       [-0.83711397,  1.76001029, -1.00967895, ..., -0.02120919,
         0.09776481, -0.75640258]])

In [39]:
imputer_cat = SimpleImputer(strategy='most_frequent')
X_imputer_cat = imputer_cat.fit_transform(X[['OCEAN_PROXIMITY']])
X_imputer_cat.shape
X_imputer_cat

array([['NEAR BAY'],
       ['NEAR BAY'],
       ['NEAR BAY'],
       ...,
       ['INLAND'],
       ['INLAND'],
       ['INLAND']], dtype=object)

In [44]:
onehot_cat = OneHotEncoder(handle_unknown='ignore')
X_onehot_cat = onehot_cat.fit_transform(X_imputer_cat)
X_onehot_cat.shape

(2040, 5)

In [43]:
import numpy as np
model = np.concatenate((X_scaler_num, X_onehot_cat), axis=0)
model

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 0 dimension(s)

Check changes on numeric features

In [13]:
import pandas as pd
df_out = pd.DataFrame(model, index=df.index, columns=COL_NAMES[1:])
df_out.head(10)

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME
0,-1.317692,1.011817,1.798864,-0.484724,-0.804421,-0.808493,-0.821501,1.84873
1,-1.322613,1.007248,1.798864,0.246963,0.409874,-0.191496,0.447295,-0.392533
2,-1.332455,1.011817,1.798864,-0.59155,-0.552767,-0.666612,-0.481356,-0.934133
3,-1.332455,1.007248,1.798864,-0.051612,0.053159,-0.324043,-0.008932,-1.148216
4,-1.337376,1.007248,1.642366,-0.209839,-0.06656,-0.215765,-0.02243,-1.192961
5,-1.337376,1.007248,1.564118,-0.284035,-0.176506,-0.465924,-0.278888,-1.038574
6,-1.342297,0.99354,-0.235603,-0.797155,-0.897265,-0.90557,-0.95648,-1.12281
7,-1.337376,0.99354,0.546884,0.162039,0.104468,-0.117755,0.074755,-0.757427
8,-1.337376,0.99354,-0.939842,-0.727428,-0.56254,-0.703016,-0.51915,-1.2701
9,-1.337376,0.99354,1.798864,-0.988458,-1.031643,-1.084789,-1.064462,-1.258832
