Load a random sample of roughly 10% of the rows in housing.csv, and separate out the MEDIAN_HOUSE_VALUE column so that X holds only the remaining feature columns

In [2]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb

import random
import pandas as pd

# Sampling configuration. A dedicated, seeded generator makes the random row
# sample (and therefore every output derived from it) identical on each
# Restart & Run All, without touching the global `random` state.
SAMPLE_SEED = 42
SAMPLE_FRACTION = 0.10
sampler = random.Random(SAMPLE_SEED)

df = pd.read_csv(
    # Forward slashes resolve on Windows as well as POSIX, and avoid the
    # invalid escape sequences (`\.`, `\d`, `\h`) that the backslash form
    # produces inside a regular (non-raw) string literal.
    "../../.spool/datasets/housing.csv",
    # Row 0 is the header and is always kept; every other row is skipped
    # unless its random draw falls within SAMPLE_FRACTION.
    skiprows=lambda i: i > 0 and sampler.random() > SAMPLE_FRACTION,
).rename(columns=str.upper)  # upper-case the column names without mutating in place

# Feature frame: every column except MEDIAN_HOUSE_VALUE.
X = df.loc[:, df.columns != 'MEDIAN_HOUSE_VALUE']
X

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,OCEAN_PROXIMITY
0,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY
1,-122.26,37.85,52.0,2202.0,434.0,910.0,402.0,3.2031,NEAR BAY
2,-122.26,37.84,52.0,696.0,191.0,345.0,174.0,2.6736,NEAR BAY
3,-122.27,37.84,52.0,2436.0,541.0,1015.0,478.0,1.7250,NEAR BAY
4,-122.27,37.83,49.0,1215.0,282.0,570.0,264.0,1.4861,NEAR BAY
...,...,...,...,...,...,...,...,...,...
2045,-121.52,39.12,37.0,102.0,17.0,29.0,14.0,4.1250,INLAND
2046,-121.48,39.10,19.0,2043.0,421.0,1018.0,390.0,2.5952,INLAND
2047,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND
2048,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND


Perform data preprocessing in a transformation pipeline

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column layout: the categorical feature is listed first, followed by the
# eight numeric features.
COL_NAMES = [
    'OCEAN_PROXIMITY',
    'LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME',
]

# Numeric branch: impute missing values with the column median, then apply
# StandardScaler.
numeric_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: impute with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time).
categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# NOTE(review): COL_NAMES[0:0] is an empty slice, so the "cat" branch is
# given no columns and OCEAN_PROXIMITY does not appear in the result (the
# output holds only the eight numeric columns). If one-hot encoding was
# intended, the slice should presumably be COL_NAMES[0:1] -- confirm, and
# note that this widens `model`, so any code that labels its columns with
# COL_NAMES[1:] must be adjusted at the same time.
pipeline = ColumnTransformer(transformers=[
    ("num", numeric_branch, COL_NAMES[1:]),
    ("cat", categorical_branch, COL_NAMES[0:0]),
])

# Despite its name, `model` is the transformed feature array, not an estimator.
model = pipeline.fit_transform(X)
model

array([[-1.33298072,  0.99504865,  1.81073047, ..., -0.64571011,
        -0.59844489,  0.00961655],
       [-1.33805264,  0.99504865,  1.81073047, ..., -0.38603039,
        -0.23908424, -0.34004503],
       [-1.33805264,  0.99036844,  1.81073047, ..., -0.81130298,
        -0.81205086, -0.62794086],
       ...,
       [-0.81057355,  1.73452144, -0.9221242 , ..., -0.31301899,
        -0.16118088, -1.15729947],
       [-0.86129269,  1.73452144, -0.84404264, ..., -0.51323582,
        -0.37227385, -1.06639072],
       [-0.82071738,  1.70644019, -1.00020577, ..., -0.02699494,
         0.08258124, -0.78289896]])

Inspect the transformed (imputed and scaled) numeric features

In [4]:
# Re-attach labels to the transformed array: rows keep the index of the
# sampled frame, columns take the numeric feature names (COL_NAMES minus its
# first entry).
df_out = pd.DataFrame(
    data=model,
    columns=COL_NAMES[1:],
    index=df.index,
)
df_out.head(n=10)

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME
0,-1.332981,0.995049,1.81073,-0.441944,-0.589722,-0.64571,-0.598445,0.009617
1,-1.338053,0.995049,1.81073,-0.190415,-0.234172,-0.38603,-0.239084,-0.340045
2,-1.338053,0.990368,1.81073,-0.849203,-0.795202,-0.811303,-0.812051,-0.627941
3,-1.343125,0.990368,1.81073,-0.088053,0.012866,-0.306997,-0.048095,-1.143707
4,-1.343125,0.985688,1.576486,-0.62217,-0.585105,-0.641947,-0.58588,-1.2736
5,-1.348196,0.985688,1.81073,-0.492687,-0.335758,-0.392805,-0.236571,-1.314813
6,-1.338053,0.985688,1.81073,-0.510622,-0.474284,-0.552376,-0.472794,-0.189491
7,-1.338053,0.985688,1.732649,-0.744216,-0.51815,-0.681839,-0.623575,-1.382831
8,-1.35834,0.976328,1.498404,-0.517184,-0.418873,-0.477859,-0.414995,-1.643596
9,-1.348196,0.976328,-0.922124,-0.749466,-0.568943,-0.612592,-0.523055,-1.266042
