Get 10% sample data from the HOUSING.CSV file, and separate MEDIAN_HOUSE_VALUE

In [8]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb

import random
import pandas as pd

df = pd.read_csv("..\..\.spool\housing.csv",
    skiprows=lambda i: i > 0 and random.random() > 0.10)
df.columns = [c.upper() for c in df.columns]

X = df.loc[:, df.columns != 'MEDIAN_HOUSE_VALUE']
X

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,OCEAN_PROXIMITY
0,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,NEAR BAY
1,-122.26,37.85,52.0,3503.0,752.0,1504.0,734.0,3.2705,NEAR BAY
2,-122.27,37.84,52.0,1688.0,337.0,853.0,325.0,2.1806,NEAR BAY
3,-122.26,37.84,52.0,950.0,202.0,467.0,198.0,3.9643,NEAR BAY
4,-122.27,37.82,52.0,1630.0,456.0,1162.0,400.0,1.2475,NEAR BAY
...,...,...,...,...,...,...,...,...,...
2112,-121.54,39.08,26.0,2276.0,460.0,1455.0,474.0,2.4695,INLAND
2113,-121.53,39.06,20.0,561.0,109.0,308.0,114.0,3.3021,INLAND
2114,-121.52,39.12,37.0,102.0,17.0,29.0,14.0,4.1250,INLAND
2115,-121.40,39.33,15.0,2655.0,493.0,1200.0,432.0,3.5179,INLAND


Perform data preprocessing in a transformation pipeline

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

COL_NAMES = ['OCEAN_PROXIMITY',
    'LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME']

pipeline = ColumnTransformer([
    ( "num", Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler())]), COL_NAMES[1:]),
    ( "cat", Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))]), COL_NAMES[0:0])])

model = pipeline.fit_transform(X)
df_out = pd.DataFrame(model, index=df.index, columns=COL_NAMES[0:])
df_out.head(10)

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME
0,-1.302581,1.039167,0.973913,-0.785201,-0.957842,-0.97562,-0.948948,2.379612
1,-1.317536,1.020375,1.84686,-0.865518,-0.813849,-0.955628,-0.829381,-0.635998
2,-1.317536,1.025073,1.688143,-0.680438,-0.600182,-0.649656,-0.605192,-0.928723
3,-1.327506,1.020375,1.84686,-0.229524,-0.140332,-0.240245,-0.164288,-1.015964
4,-1.322521,1.015677,1.767501,-0.006032,0.075658,-0.162014,0.072355,-0.605744
5,-1.317536,1.015677,1.84686,-0.442541,-0.284325,-0.433216,-0.278873,-0.943236
6,-1.317536,1.020375,1.84686,-0.754645,-0.788302,-0.849581,-0.769597,0.052701
7,-1.332491,1.006282,1.370707,-0.761193,-0.567667,-0.749619,-0.572809,-1.673821
8,-1.332491,1.006282,-0.692624,-0.804844,-0.883523,-1.003436,-0.931511,-0.737699
9,-1.327506,1.001584,0.735836,-0.870756,-0.848686,-0.956497,-0.876709,-0.989338


In [9]:
import numpy as np
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer = imputer.fit(df[['TOTAL_BEDROOMS']])
df['TOTAL_BEDROOMS'] = imputer.transform(df[['TOTAL_BEDROOMS']])
df

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,MEDIAN_HOUSE_VALUE,OCEAN_PROXIMITY
0,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY
1,-122.26,37.85,52.0,3503.0,752.0,1504.0,734.0,3.2705,241800.0,NEAR BAY
2,-122.27,37.84,52.0,1688.0,337.0,853.0,325.0,2.1806,99700.0,NEAR BAY
3,-122.26,37.84,52.0,950.0,202.0,467.0,198.0,3.9643,188800.0,NEAR BAY
4,-122.27,37.82,52.0,1630.0,456.0,1162.0,400.0,1.2475,104200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
2112,-121.54,39.08,26.0,2276.0,460.0,1455.0,474.0,2.4695,58000.0,INLAND
2113,-121.53,39.06,20.0,561.0,109.0,308.0,114.0,3.3021,70800.0,INLAND
2114,-121.52,39.12,37.0,102.0,17.0,29.0,14.0,4.1250,72000.0,INLAND
2115,-121.40,39.33,15.0,2655.0,493.0,1200.0,432.0,3.5179,107200.0,INLAND
