In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("housing.csv")

In [3]:
df['income_cat'] = pd.cut(df['median_income'], bins=[0, 1.5, 3.0, 4.5, 6.0, np.inf], labels=[1, 2, 3, 4, 5])

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df['income_cat']):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

#### Lets remove the income_cat column

In [5]:
for sett in (strat_train_set, strat_test_set):
    sett.drop("income_cat", axis=1, inplace=True)

In [6]:
df = strat_train_set.copy()

#### Lets take out median_house_value in order to get only features

In [7]:
housing_features = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
imputer = SimpleImputer(strategy="median")

In [10]:
housing_num = housing_features.select_dtypes(include=[np.number])

In [11]:
X = imputer.fit_transform(housing_num)

In [12]:
housing = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [13]:
housing['ocean_proximity'] = df['ocean_proximity']

#### we can use ordinal_encoding also.....
- `from sklearn.preprocessing import OrdinalEncoder`
- `ordinal_encoder = OrdinalEncoder()`
- `housing_cat = ordinal_encoder.fit_transform(housing)`
- `housing_cat = pd.DataFrame(housing_cat, columns=housing.columns, index=housing.index)`

##### but here we are using onehot_encoding instead

In [14]:
housing = housing[['ocean_proximity']]

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
cat_encoder = OneHotEncoder()

In [17]:
housing_cat = cat_encoder.fit_transform(housing)

In [18]:
df_1hot = housing_cat.toarray()

In [19]:
housing_cat = pd.DataFrame(df_1hot, columns=['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], index=housing.index)

In [20]:
df = pd.concat([df, housing_cat], axis=1)

In [21]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND,0.0,1.0,0.0,0.0,0.0
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN,0.0,0.0,0.0,0.0,1.0
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND,0.0,1.0,0.0,0.0,0.0
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN,0.0,0.0,0.0,0.0,1.0
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN,1.0,0.0,0.0,0.0,0.0


In [22]:
df = df.drop("ocean_proximity", axis=1)

In [23]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,0.0,1.0,0.0,0.0,0.0
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,0.0,0.0,0.0,0.0,1.0
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,0.0,1.0,0.0,0.0,0.0
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,0.0,0.0,0.0,0.0,1.0
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,1.0,0.0,0.0,0.0,0.0


- `from sklearn.preprocessing import MinMaxScaler`
- `scaler = MinMaxScaler(feature_range=(-1, 1))`
- `df_scaled = scaler.fit_transform(df)`
- `df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)`

#### In order to avoid Outliers to affect the scale we can use StandarsScaler 

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
scaler = StandardScaler()

In [26]:
df_scaled = scaler.fit_transform(df)

In [27]:
df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)

In [28]:
df_scaled.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
12655,-0.94135,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647,-1.166015,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
15502,1.171782,-1.19244,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168,0.627451,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
2908,0.267581,-0.125972,1.22046,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434,-1.074397,-0.887683,1.46218,-0.011006,-0.354889,-0.384217
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929,-0.816829,-0.887683,-0.68391,-0.011006,-0.354889,2.602693
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.37406,0.220898,0.325752,0.270486,1.126529,-0.68391,-0.011006,-0.354889,-0.384217
