In [2]:
import pandas as pd

## Boston Housing Dataset

* **CRIM** - per capita crime rate by town
* **ZN** - proportion of residential land zoned for lots over 25,000 sq.ft.
* **INDUS** - proportion of non-retail business acres per town.
* **CHAS** - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
* **NOX** - nitric oxides concentration (parts per 10 million)
* **RM** - average number of rooms per dwelling
* **AGE** - proportion of owner-occupied units built prior to 1940
* **DIS** - weighted distances to five Boston employment centres
* **RAD** - index of accessibility to radial highways
* **TAX** - full-value property-tax rate per \$10,000
* **PTRATIO** - pupil-teacher ratio by town
* **B** - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* **LSTAT** - % lower status of the population
* **AREA** - Boston area (n: North, s: South, e: East, o: West; the codes follow the Italian *Est* / *Ovest*)
* **MEDV** - Median value of owner-occupied homes in \$1000's

In [22]:
# Read the Boston housing sample into a DataFrame and preview the first rows
housing = pd.read_csv(filepath_or_buffer='../samples/boston.csv')
housing.head(5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,area,medv
0,0.00632,18.0,2.31,0,538.0,6.575,65.2,4.09,1,296,15.3,396.9,4.98,n,24.0
1,0.02731,0.0,7.07,0,469.0,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,n,21.6
2,0.02729,0.0,7.07,0,469.0,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,s,34.7
3,0.03237,0.0,2.18,0,458.0,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,s,33.4
4,0.06905,0.0,2.18,0,458.0,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,o,36.2


## Find Missing Values

In [12]:
# Count the missing entries in every column
housing.isna().sum()

Unnamed: 0    0
crim          0
zn            0
indus         0
chas          0
nox           0
rm            0
age           8
dis           7
rad           0
tax           0
ptratio       0
black         0
lstat         0
area          3
medv          0
dtype: int64

### Use mean for AGE and DIS and most frequent for AREA

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Imputation plan, declared in the same cell that consumes it so the notebook
# runs top-to-bottom on a fresh kernel: the name `transformers` must exist
# before the ColumnTransformer is constructed.
# Each entry is (step name, imputer, target columns).
transformers = [
    ('im_age', SimpleImputer(strategy='mean'), ['age', 'dis']),
    ('im_emb', SimpleImputer(strategy='most_frequent'), ['area']),
]

# Create ColumnTransformer. With the default remainder='drop' its output holds
# only the imputed columns, in the order listed above: age, dis, area.
ct = ColumnTransformer(transformers)

In [17]:
# Define transformer list: each entry is [step name, imputer, columns to impute]
transformers = [
    [
        'im_age',
        SimpleImputer(),  # default strategy fills gaps with the column mean
        ['age', 'dis'],
    ],
    [
        'im_emb',
        SimpleImputer(strategy='most_frequent'),  # mode, for the categorical column
        ['area'],
    ],
]

In [23]:
# Apply transformation

# Column order of the ColumnTransformer output: it follows the order of the
# transformer list (age and dis from the mean imputer, then area).
imputed_cols = ['age', 'dis', 'area']

# The output mixes floats with strings, so it comes back as a single
# object-dtype array. Wrap it in a labelled frame (same index as `housing`)
# and restore the numeric dtype explicitly; assigning the raw array directly
# can leave age/dis as object columns on recent pandas versions.
imputed = pd.DataFrame(
    ct.fit_transform(housing),
    columns=imputed_cols,
    index=housing.index,
)
housing[['age', 'dis']] = imputed[['age', 'dis']].astype(float)
housing['area'] = imputed['area']

housing.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,area,medv
0,0.00632,18.0,2.31,0,538.0,6.575,65.2,4.09,1,296,15.3,396.9,4.98,n,24.0
1,0.02731,0.0,7.07,0,469.0,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,n,21.6
2,0.02729,0.0,7.07,0,469.0,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,s,34.7
3,0.03237,0.0,2.18,0,458.0,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,s,33.4
4,0.06905,0.0,2.18,0,458.0,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,o,36.2


In [21]:
# Check: per-column count of values that are still missing after imputation
housing.isna().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
black      0
lstat      0
area       0
medv       0
dtype: int64

## Categorical Data

### Apply one-hot encoding to area

In [25]:
# Frequency of each area label
housing['area'].value_counts()

n    190
s    131
e     99
o     86
Name: area, dtype: int64

In [26]:
# One-hot encoding: add one 0/1 indicator column for each unique area value.
for label in housing['area'].unique():
    matches_label = housing['area'].eq(label)
    # Multiplying the boolean mask by 1 turns True/False into 1/0
    housing['is_' + str(label)] = matches_label * 1

housing.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,area,medv,is_n,is_s,is_o,is_e
0,0.00632,18.0,2.31,0,538.0,6.575,65.2,4.09,1,296,15.3,396.9,4.98,n,24.0,1,0,0,0
1,0.02731,0.0,7.07,0,469.0,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,n,21.6,1,0,0,0
2,0.02729,0.0,7.07,0,469.0,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,s,34.7,0,1,0,0
3,0.03237,0.0,2.18,0,458.0,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,s,33.4,0,1,0,0
4,0.06905,0.0,2.18,0,458.0,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,o,36.2,0,0,1,0


In [27]:
# Drop the original area column now that the is_* indicator columns exist.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError because
# 'area' is already gone.
housing = housing.drop(columns=['area'], errors='ignore')
housing.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv,is_n,is_s,is_o,is_e
0,0.00632,18.0,2.31,0,538.0,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,1,0,0,0
1,0.02731,0.0,7.07,0,469.0,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,1,0,0,0
2,0.02729,0.0,7.07,0,469.0,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,0,1,0,0
3,0.03237,0.0,2.18,0,458.0,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,0,1,0,0
4,0.06905,0.0,2.18,0,458.0,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,0,0,1,0


## Scaling

In [28]:
# Check the values: summary statistics show how much the column ranges differ
housing.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv,is_n,is_s,is_o,is_e
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,4.673727,11.363636,11.136779,0.06917,465.069952,6.284634,68.619277,3.798712,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806,0.375494,0.258893,0.16996,0.195652
std,25.223207,23.322453,6.860353,0.253994,222.311286,0.702617,27.897045,2.092192,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104,0.484729,0.43846,0.37597,0.397094
min,0.00632,0.0,0.46,0.0,0.4,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0,0.0,0.0,0.0,0.0
25%,0.082045,0.0,5.19,0.0,429.0,5.8855,45.45,2.104425,4.0,279.0,17.4,375.3775,6.95,17.025,0.0,0.0,0.0,0.0
50%,0.25651,0.0,9.69,0.0,504.0,6.2085,76.8,3.2759,5.0,330.0,19.05,391.44,11.36,21.2,0.0,0.0,0.0,0.0
75%,3.689387,12.5,18.1,0.0,605.0,6.6235,93.9,5.118,24.0,666.0,20.2,396.225,16.955,25.0,1.0,1.0,0.0,0.0
max,537.0,100.0,27.74,1.0,871.0,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0,1.0,1.0,1.0,1.0


### Apply Min-Max Normalization to crim, nox, tax and black

In [31]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale into the [0, 1] interval
minmax_cols = ['crim', 'nox', 'tax', 'black']

scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the selected columns and write the rescaled values back in place
minmax_values = scaler.fit_transform(housing[minmax_cols])
housing[minmax_cols] = minmax_values

housing.describe()

  return self.partial_fit(X, y)


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv,is_n,is_s,is_o,is_e
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,0.008692,11.363636,11.136779,0.06917,0.533735,6.284634,68.619277,3.798712,9.549407,0.422208,18.455534,0.898568,12.653063,22.532806,0.375494,0.258893,0.16996,0.195652
std,0.046971,23.322453,6.860353,0.253994,0.255354,0.702617,27.897045,2.092192,8.707259,0.321636,2.164946,0.230205,7.141062,9.197104,0.484729,0.43846,0.37597,0.397094
min,0.0,0.0,0.46,0.0,0.0,3.561,2.9,1.1296,1.0,0.0,12.6,0.0,1.73,5.0,0.0,0.0,0.0,0.0
25%,0.000141,0.0,5.19,0.0,0.492304,5.8855,45.45,2.104425,4.0,0.175573,17.4,0.94573,6.95,17.025,0.0,0.0,0.0,0.0
50%,0.000466,0.0,9.69,0.0,0.578452,6.2085,76.8,3.2759,5.0,0.272901,19.05,0.986232,11.36,21.2,0.0,0.0,0.0,0.0
75%,0.006859,12.5,18.1,0.0,0.694464,6.6235,93.9,5.118,24.0,0.914122,20.2,0.998298,16.955,25.0,1.0,1.0,0.0,0.0
max,1.0,100.0,27.74,1.0,1.0,8.78,100.0,12.1265,24.0,1.0,22.0,1.0,37.97,50.0,1.0,1.0,1.0,1.0


### Apply Standardization to zn, indus and dis

In [33]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize
standard_cols = ['zn', 'indus', 'dis']

scaler = StandardScaler()

# Fit on the selected columns and write the standardized values back in place
standard_values = scaler.fit_transform(housing[standard_cols])
housing[standard_cols] = standard_values

housing.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv,is_n,is_s,is_o,is_e
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,0.008692,-7.459997000000001e-17,8.425408e-17,0.06917,0.533735,6.284634,68.619277,0.0,9.549407,0.422208,18.455534,0.898568,12.653063,22.532806,0.375494,0.258893,0.16996,0.195652
std,0.046971,1.00099,1.00099,0.253994,0.255354,0.702617,27.897045,1.00099,8.707259,0.321636,2.164946,0.230205,7.141062,9.197104,0.484729,0.43846,0.37597,0.397094
min,0.0,-0.4877224,-1.557842,0.0,0.0,3.561,2.9,-1.277011,1.0,0.0,12.6,0.0,1.73,5.0,0.0,0.0,0.0,0.0
25%,0.000141,-0.4877224,-0.8676906,0.0,0.492304,5.8855,45.45,-0.810615,4.0,0.175573,17.4,0.94573,6.95,17.025,0.0,0.0,0.0,0.0
50%,0.000466,-0.4877224,-0.2110985,0.0,0.578452,6.2085,76.8,-0.250134,5.0,0.272901,19.05,0.986232,11.36,21.2,0.0,0.0,0.0,0.0
75%,0.006859,0.04877224,1.015999,0.0,0.694464,6.6235,93.9,0.631201,24.0,0.914122,20.2,0.998298,16.955,25.0,1.0,1.0,0.0,0.0
max,1.0,3.804234,2.422565,1.0,1.0,8.78,100.0,3.984351,24.0,1.0,22.0,1.0,37.97,50.0,1.0,1.0,1.0,1.0
