Connect to Snowflake w/ the Python Connector and SNOWSQL config file data

In [9]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb
import os, configparser
parser = configparser.ConfigParser()
parser.read(os.path.join(os.path.expanduser('~'), ".snowsql/config"))
section = "connections.test_conn"
pars = {
    "account": parser.get(section, "accountname"),
    "user": parser.get(section, "username"),
    "password": parser.get(section, "password"),
    "database": parser.get(section, "database"),
    "schema": parser.get(section, "schema")}

from snowflake.snowpark import Session
session = Session.builder.configs(pars).create()
print(session.sql('select current_warehouse(), current_database(), current_schema()').collect())

[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='TEST', CURRENT_SCHEMA()='PUBLIC')]


Get 10% sample data from the HOUSING table, and separate MEDIAN_HOWSE_VALUE

In [10]:
df = session.table("HOUSING").sample(frac=0.10).to_pandas()
df.shape

X = df.loc[:, df.columns != 'MEDIAN_HOUSE_VALUE']
y = df['MEDIAN_HOUSE_VALUE']
X

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,OCEAN_PROXIMITY
0,-122.26,37.85,52.0,2491.0,474.0,1098.0,468.0,3.0750,NEAR BAY
1,-122.28,37.85,49.0,1130.0,244.0,607.0,239.0,2.4597,NEAR BAY
2,-122.26,37.83,51.0,936.0,311.0,517.0,249.0,1.2852,NEAR BAY
3,-122.26,37.84,49.0,713.0,202.0,462.0,189.0,1.0250,NEAR BAY
4,-122.29,37.82,49.0,135.0,29.0,86.0,23.0,6.1183,NEAR BAY
...,...,...,...,...,...,...,...,...,...
2077,-121.56,39.16,35.0,2157.0,441.0,1009.0,409.0,1.5827,INLAND
2078,-121.57,39.13,30.0,442.0,103.0,413.0,88.0,1.5694,INLAND
2079,-121.56,39.10,28.0,2130.0,484.0,1195.0,439.0,1.3631,INLAND
2080,-121.48,39.10,19.0,2043.0,421.0,1018.0,390.0,2.5952,INLAND


Perform data preprocessing in a transformation pipeline

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

COL_NAMES = ['OCEAN_PROXIMITY',
    'LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME']
pipeline = ColumnTransformer([(
        "num",
        Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler())]),
        COL_NAMES[1:]
    ), (
        "cat",
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))]),
        COL_NAMES[0:0]
    )])
ret = pipeline.fit_transform(X, y)

Check changes on numeric features

In [12]:
import pandas as pd
df2 = pd.DataFrame(df, index=df.index, columns=COL_NAMES[1:])
df2.head(10)

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME
0,-122.26,37.85,52.0,2491.0,474.0,1098.0,468.0,3.075
1,-122.28,37.85,49.0,1130.0,244.0,607.0,239.0,2.4597
2,-122.26,37.83,51.0,936.0,311.0,517.0,249.0,1.2852
3,-122.26,37.84,49.0,713.0,202.0,462.0,189.0,1.025
4,-122.29,37.82,49.0,135.0,29.0,86.0,23.0,6.1183
5,-122.3,37.81,52.0,572.0,109.0,274.0,82.0,1.8516
6,-122.28,37.81,36.0,2914.0,562.0,1236.0,509.0,2.4464
7,-122.29,37.81,23.0,1745.0,374.0,1054.0,325.0,0.8026
8,-122.25,37.82,28.0,3337.0,855.0,1520.0,802.0,3.9063
9,-122.25,37.82,52.0,1424.0,289.0,550.0,253.0,5.0917
