In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw housing table from a CSV file resolved relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer="housing.csv")

In [3]:
# Bucket `median_income` into five ordered categories (labelled 1-5) so the
# train/test split can be stratified on income. The last bin is open-ended
# (upper edge is +inf).
df["income_cat"] = pd.cut(
    x=df["median_income"],
    bins=[0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split on `income_cat`; the fixed random_state makes
# the split reproducible across runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(df, df['income_cat']))

# split() yields *positional* row indices, so rows must be selected with
# .iloc; label-based .loc only gives the same rows while `df` has a default
# RangeIndex. The explicit .copy() makes each split an independent frame so
# later modifications never refer back to `df`.
strat_train_set = df.iloc[train_index].copy()
strat_test_set = df.iloc[test_index].copy()

#### Let's remove the `income_cat` column

In [5]:
# `income_cat` was only needed to stratify the split, so remove it from both
# sets. Rebinding (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for the already-dropped column.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [6]:
# Rebind `df` to an independent (deep) copy of the training split; from here
# on `df` no longer refers to the full table read from the CSV.
df = strat_train_set.copy(deep=True)

#### Let's separate `median_house_value` (the label) from the rest so that only the features remain

In [7]:
# Separate the training split into the target column and the predictors.
# The labels are copied so they are independent of `strat_train_set`;
# drop() already returns a new frame for the features.
housing_labels = strat_train_set["median_house_value"].copy()
housing_features = strat_train_set.drop(columns=["median_house_value"])

In [8]:
# Remove the `ocean_proximity` column before the median-imputation and
# scaling pipeline is applied to this frame. errors="ignore" makes this
# self-referential cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
housing_features = housing_features.drop(columns="ocean_proximity", errors="ignore")

#### Instead of doing all of the following steps manually,

- `from sklearn.impute import SimpleImputer`
- `imputer = SimpleImputer(strategy="median")`
- `housing_num = housing_features.select_dtypes(include=[np.number])`
- `X = imputer.fit_transform(housing_num)`
- `housing = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)`
- `housing['ocean_proximity'] = df['ocean_proximity']`

--------------------------------------------------------------------------------------------------------------------------------

- `housing = housing[['ocean_proximity']]`
- `from sklearn.preprocessing import OneHotEncoder`
- `cat_encoder = OneHotEncoder()`
- `housing_cat = cat_encoder.fit_transform(housing)`
- `df_1hot = housing_cat.toarray()`
- `housing_cat = pd.DataFrame(df_1hot, columns=['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], index=housing.index)`
- `df = pd.concat([df, housing_cat], axis=1)`
- `df = df.drop("ocean_proximity", axis=1)`

--------------------------------------------------------------------------------------------------------------------------------

- `from sklearn.preprocessing import StandardScaler`
- `scaler = StandardScaler()`
- `df_scaled = scaler.fit_transform(df)`
- `df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)`



### We can use `Pipeline` to streamline this process

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [10]:
# Two-stage preprocessing chain: fill missing values with each column's
# median, then standardize the result.
my_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("standardize", StandardScaler()),
    ]
)

In [11]:
# Fit the pipeline on the training features and keep the transformed matrix
# under a name so later cells can reuse it without refitting; the bare
# expression on the last line still displays it as the cell output.
housing_prepared = my_pipeline.fit_transform(housing_features)
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.73260236,
         0.55628602, -0.8936472 ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.53361152,
         0.72131799,  1.292168  ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ..., -0.67467519,
        -0.52440722, -0.52543365],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ..., -0.86201341,
        -0.86511838, -0.36547546],
       [-1.56080303,  1.2492109 , -1.1653327 , ..., -0.18974707,
         0.01061579,  0.16826095],
       [-1.28105026,  2.02567448, -0.13148926, ..., -0.71232211,
        -0.79857323, -0.390569  ]])