Get 10% sample data from the HOUSING.CSV file, and separate MEDIAN_HOUSE_VALUE

In [1]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-scikit-learn/blob/main/3_snowpark_end_to_end_ml.ipynb

import random
import pandas as pd

df = pd.read_csv("..\..\.spool\housing.csv",
    skiprows=lambda i: i > 0 and random.random() > 0.10)
df.columns = [c.upper() for c in df.columns]

X = df.loc[:, df.columns != 'MEDIAN_HOUSE_VALUE']
X

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,OCEAN_PROXIMITY
0,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY
1,-122.26,37.85,52.0,2202.0,434.0,910.0,402.0,3.2031,NEAR BAY
2,-122.26,37.85,50.0,1120.0,283.0,697.0,264.0,2.1250,NEAR BAY
3,-122.27,37.84,52.0,2436.0,541.0,1015.0,478.0,1.7250,NEAR BAY
4,-122.27,37.84,52.0,2224.0,437.0,1006.0,422.0,2.6000,NEAR BAY
...,...,...,...,...,...,...,...,...,...
2048,-121.58,39.14,52.0,662.0,160.0,520.0,149.0,0.8928,INLAND
2049,-121.58,39.12,26.0,2796.0,629.0,2017.0,632.0,1.8355,INLAND
2050,-121.59,39.10,24.0,1107.0,261.0,768.0,205.0,1.7167,INLAND
2051,-121.56,39.01,22.0,1891.0,340.0,1023.0,296.0,2.7303,INLAND


Perform data preprocessing in a transformation pipeline

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

COL_NAMES = ['OCEAN_PROXIMITY',
    'LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS',
    'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME']

pipeline = ColumnTransformer([
    ( "num", Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler())]), COL_NAMES[1:]),
    ( "cat", Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))]), COL_NAMES[0:0])])

model = pipeline.fit_transform(X)
df_out = pd.DataFrame(model, index=df.index, columns=COL_NAMES[1:])
df_out.head(10)

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME
0,-1.334635,1.050287,1.884643,-0.469371,-0.611252,-0.78189,-0.628958,-0.046083
1,-1.339634,1.050287,1.884643,-0.201077,-0.239738,-0.466872,-0.253627,-0.377187
2,-1.339634,1.050287,1.722648,-0.705936,-0.604015,-0.661361,-0.615834,-0.932252
3,-1.344632,1.045575,1.884643,-0.091893,0.018392,-0.370997,-0.054151,-1.138195
4,-1.344632,1.045575,1.884643,-0.190812,-0.2325,-0.379215,-0.201133,-0.687696
5,-1.349631,1.045575,1.722648,-0.257069,-0.099817,-0.265078,-0.067274,-1.180722
6,-1.359629,1.031438,1.560652,-0.842182,-0.84767,-0.939856,-0.959669,-0.690888
7,-1.35463,1.026726,1.884643,-0.749329,-0.698099,-0.848546,-0.922923,-0.682856
8,-1.334635,1.031438,0.831675,-0.830051,-0.702924,-0.942595,-0.710323,-0.417397
9,-1.334635,1.031438,1.884643,-0.408246,-0.177015,-0.671405,-0.201133,-0.394692


In [4]:
import numpy as np
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer = imputer.fit(df[['TOTAL_BEDROOMS']])
df['TOTAL_BEDROOMS'] = imputer.transform(df[['TOTAL_BEDROOMS']])
df

Unnamed: 0,LONGITUDE,LATITUDE,HOUSING_MEDIAN_AGE,TOTAL_ROOMS,TOTAL_BEDROOMS,POPULATION,HOUSEHOLDS,MEDIAN_INCOME,MEDIAN_HOUSE_VALUE,OCEAN_PROXIMITY
0,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
1,-122.26,37.85,52.0,2202.0,434.0,910.0,402.0,3.2031,281500.0,NEAR BAY
2,-122.26,37.85,50.0,1120.0,283.0,697.0,264.0,2.1250,140000.0,NEAR BAY
3,-122.27,37.84,52.0,2436.0,541.0,1015.0,478.0,1.7250,113900.0,NEAR BAY
4,-122.27,37.84,52.0,2224.0,437.0,1006.0,422.0,2.6000,132600.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
2048,-121.58,39.14,52.0,662.0,160.0,520.0,149.0,0.8928,55000.0,INLAND
2049,-121.58,39.12,26.0,2796.0,629.0,2017.0,632.0,1.8355,61200.0,INLAND
2050,-121.59,39.10,24.0,1107.0,261.0,768.0,205.0,1.7167,48800.0,INLAND
2051,-121.56,39.01,22.0,1891.0,340.0,1023.0,296.0,2.7303,99100.0,INLAND
