In [1]:
import california_housing_data
import pandas as pd
from sklearn.impute import SimpleImputer

housing = california_housing_data.load_data()
strat_train_set, strat_test_set = california_housing_data.split_train_test(housing, test_ratio=0.2)

# Separate the target column from the predictors.
train_labels = strat_train_set['median_house_value'].copy()
train_data = strat_train_set.drop(columns='median_house_value')

# The median strategy needs numeric input, so the text column is left out.
train_numeric_features = train_data.drop(columns='ocean_proximity')

# Learn one median per numeric column; missing entries are filled with these later.
imputer = SimpleImputer(strategy='median')
imputer.fit(train_numeric_features)

SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)

In [3]:
# Sanity check: the medians stored by the imputer (first line) should equal the
# per-column medians computed directly with pandas (second line).
print(imputer.statistics_, train_numeric_features.median().values, sep='\n')

[-118.51     34.26     29.     2119.5     433.     1164.      408.
    3.5409]
[-118.51     34.26     29.     2119.5     433.     1164.      408.
    3.5409]


In [8]:
# Use the fitted imputer to replace every missing value with the median of its
# column, as computed on the training set. The result X is a plain numpy array
# (the column labels and row index are restored in the next step).
X = imputer.transform(train_numeric_features)
print('Filled with median =\n', X)

Filled with median =
 [[-121.89     37.29     38.     ...  710.      339.        2.7042]
 [-121.93     37.05     14.     ...  306.      113.        6.4214]
 [-117.2      32.77     31.     ...  936.      462.        2.8621]
 ...
 [-116.4      34.09      9.     ... 2098.      765.        3.2723]
 [-118.01     33.82     31.     ... 1356.      356.        4.0625]
 [-122.45     37.77     52.     ... 1269.      639.        3.575 ]]


In [9]:
# Wrap the imputed numpy array back into a DataFrame. The column labels come
# from the frame that was fed to the imputer, and the row index is reused
# as-is from the training data so rows stay aligned with train_labels.
# Passing the Index object directly (instead of list(index.values)) keeps its
# name/dtype and avoids building an intermediate Python list.
housing_tr = pd.DataFrame(X,
                          columns=train_numeric_features.columns,
                          index=train_data.index)

In [34]:
housing_cat = strat_train_set['ocean_proximity']

# factorize() assigns integer codes in order of first appearance, so the
# code-to-label mapping depends on the row order of the training split.
# housing_categories[i] is the label that was given code i.
housing_cat_encoded, housing_categories = housing_cat.factorize()

# For the split shown in the output below:
# <1H OCEAN:0, NEAR OCEAN:1, INLAND:2, NEAR BAY:3, ISLAND:4
print(housing_categories)

Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')


In [35]:
# NOTE(review): imports conventionally live in the first cell of the notebook.
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding: expand each integer category code into a vector with a single 1.
# The encoder expects 2-D input, hence the extra axis that turns the 1-D code
# array into a single column.
encoder = OneHotEncoder(categories='auto')
housing_cat_one_hot = encoder.fit_transform(housing_cat_encoded[:, None])

# Shown as the cell result; the encoder produces a sparse matrix here.
housing_cat_one_hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [53]:
# Encode the raw string labels directly, skipping the factorize step.
# NOTE(review): this refits the shared `encoder`, replacing what it learned from
# the integer codes; the column order now follows the encoder's own category
# ordering rather than the factorize codes - confirm that is intended.
attr = ['ocean_proximity']
encoder.fit_transform(strat_train_set.loc[:, attr].values).toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])