In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw housing table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="housing.csv")

In [3]:
# Bucket median_income into five labelled bins (1-5). The last bin is
# open-ended, so every income above 6.0 lands in category 5.
df['income_cat'] = pd.cut(
    x=df['median_income'],
    bins=[0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified 80/20 split that preserves the income_cat proportions in both
# subsets; random_state pins the shuffle so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(df, df['income_cat']))

# split() returns integer *positions*, so rows are selected with .iloc rather
# than the label-based .loc (the two only agree while df keeps a default
# RangeIndex). .copy() makes each subset an independent frame, so later
# column edits never touch df and never trigger SettingWithCopyWarning.
strat_train_set = df.iloc[train_index].copy()
strat_test_set = df.iloc[test_index].copy()

#### Let's remove the `income_cat` column

In [5]:
# income_cat was only needed to stratify the split, so drop it from both sets.
# Rebinding to the result of drop() (instead of inplace=True) avoids mutating
# frames that were produced by a selection, and errors="ignore" keeps the cell
# safe to re-run after the column is already gone.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [6]:
# Continue with an independent deep copy of the training split, so later edits
# to df leave strat_train_set untouched.
df = strat_train_set.copy(deep=True)

#### Let's take out `median_house_value` so that only the features remain

In [7]:
# Separate predictors from the target: every column except median_house_value
# becomes a feature, and median_house_value itself is kept as the label series.
housing_labels = strat_train_set.loc[:, "median_house_value"].copy()
housing_features = strat_train_set.drop(columns="median_house_value")

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Imputer configured to fill missing values with each column's median.
imputer = SimpleImputer(strategy="median")

In [10]:
# Keep only the numeric feature columns; a median cannot be computed for
# text columns, so they are excluded before imputation.
housing_num = housing_features.select_dtypes(include="number")

In [11]:
# Learn the per-column medians from the numeric features, then fill the gaps.
# By default the result is a plain NumPy array (no column names or index).
X = imputer.fit(housing_num).transform(housing_num)

In [12]:
# Re-wrap the imputed values in a DataFrame, restoring the column names and
# row index of the numeric features they were computed from.
housing = pd.DataFrame(
    data=X,
    index=housing_num.index,
    columns=housing_num.columns,
)

In [13]:
# Re-attach the text column that was left out of the numeric imputation;
# pandas aligns the values on the shared row index.
housing["ocean_proximity"] = df.loc[:, "ocean_proximity"]

#### We could also use ordinal encoding:
- `from sklearn.preprocessing import OrdinalEncoder`
- `ordinal_encoder = OrdinalEncoder()`
- `housing_cat = ordinal_encoder.fit_transform(housing)`
- `housing_cat = pd.DataFrame(housing_cat, columns=housing.columns, index=housing.index)`

##### But here we use one-hot encoding instead

In [14]:
# Narrow housing down to the single categorical column. The list selector
# keeps the result a one-column DataFrame rather than a Series.
housing = housing.loc[:, ['ocean_proximity']]

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
# One-hot encoder created with its default settings.
cat_encoder = OneHotEncoder()

In [17]:
# Learn the categories present in housing, then encode each row as a one-hot
# vector (one output column per learned category).
housing_cat = cat_encoder.fit(housing).transform(housing)

In [18]:
# Convert the encoder output to a dense array so the one-hot rows can be inspected.
housing_cat.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [19]:
# Wrap the one-hot matrix in a DataFrame aligned with the training rows.
# toarray must be *called*: passing the bound method itself (no parentheses)
# hands pandas a function object instead of data and the constructor fails.
# Column labels come from the fitted encoder, so they always match the order
# of the encoded columns instead of relying on a hand-typed list.
housing_cat = pd.DataFrame(
    housing_cat.toarray(),
    columns=cat_encoder.categories_[0],
    index=housing.index,
)