In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

[Sklearn Preprocessing](http://scikit-learn.org/stable/modules/preprocessing.html#)

In [6]:
# Load the housing table and preview its first record as a one-row frame.
data = pd.read_csv(filepath_or_buffer="./other_housing.csv")
data.iloc[[0], :]

Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,90,RL,,7032,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,12,2006,WD,Normal,135960


In [16]:
# Partition the columns into continuous, discrete, ordinal, and nominal features.
feats = set(data.columns)
cont_feats = [
    "Lot Frontage", "Lot Area", "Mas Vnr Area", "BsmtFin SF 1", "BsmtFin SF 2", "Bsmt Unf SF",
    "Total Bsmt SF", "1st Flr SF", "2nd Flr SF", "Low Qual Fin SF", "Gr Liv Area", "Garage Area",
    "Wood Deck SF", "Open Porch SF", "Enclosed Porch", "3Ssn Porch", "Screen Porch", "Pool Area",
    "Misc Val",
]
disc_feats = [
    "Year Built", "Year Remod/Add", "Bsmt Full Bath", "Bsmt Half Bath", "Full Bath", "Half Bath",
    "Bedroom AbvGr", "Kitchen AbvGr", "Fireplaces", "Garage Yr Blt", "Garage Cars", "Yr Sold",
]
ord_feats = [
    "Lot Shape", "Utilities", "Land Slope", "Overall Qual", "Overall Cond", "Exter Qual",
    "Exter Cond", "Bsmt Qual", "Bsmt Cond", "Heating QC", "Electrical", "Kitchen Qual", "Functional",
    "Fireplace Qu", "Garage Finish", "Garage Qual", "Garage Cond", "Paved Drive", "Pool QC", "Fence",
]
# Nominal = every column not assigned above. Iterate over data.columns rather
# than a set difference so the list keeps the file's column order and is
# identical on every kernel restart; iterating a set of strings is not
# order-stable between interpreter runs.
_assigned_feats = set(cont_feats + disc_feats + ord_feats)
nom_feats = [col for col in data.columns if col not in _assigned_feats]
# NOTE(review): this catch-all also picks up "Unnamed: 0" and "SalePrice" (both
# visible in the preview row) -- confirm whether they should be dropped before
# the nominal columns are encoded.

### Example: Nominal variable

#### Turn labels into integer classes

In [57]:
# First five raw neighborhood labels, before any encoding.
data.loc[:, "Neighborhood"].head(n=5)

0      NAmes
1      NAmes
2    CollgCr
3    BrkSide
4      NAmes
Name: Neighborhood, dtype: object

In [42]:
# Distinct neighborhood labels, in order of first appearance.
pd.unique(data["Neighborhood"])

array(['NAmes', 'CollgCr', 'BrkSide', 'StoneBr', 'NridgHt', 'Crawfor',
       'IDOTRR', 'Somerst', 'Edwards', 'Timber', 'Mitchel', 'Gilbert',
       'SawyerW', 'Sawyer', 'OldTown', 'MeadowV', 'Blmngtn', 'NWAmes',
       'NoRidge', 'BrDale', 'SWISU', 'Blueste', 'ClearCr', 'NPkVill',
       'Veenker', 'GrnHill', 'Greens', 'Landmrk'], dtype=object)

In [45]:
# Fit a label encoder on the neighborhood column so that each distinct
# label is assigned an integer class.
label_enc = preprocessing.LabelEncoder().fit(data["Neighborhood"])
label_enc

LabelEncoder()

In [46]:
# Fitted label vocabulary; a label's position in this array is its integer class.
label_enc.classes_

array(['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr',
       'Crawfor', 'Edwards', 'Gilbert', 'Greens', 'GrnHill', 'IDOTRR',
       'Landmrk', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes',
       'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW',
       'Somerst', 'StoneBr', 'Timber', 'Veenker'], dtype=object)

In [61]:
# Encode the neighborhood column as integer class ids and add a trailing axis
# so the result is a single (n_samples, 1) column.
neigh_inds = label_enc.transform(data["Neighborhood"])[:, np.newaxis]

#### Convert to one-hot

In [66]:
# One-hot encode the integer class ids into a dense (non-sparse) array.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old name was removed in 1.4, so prefer the new name and fall back to the
# old one only on releases that do not know it.
try:
    neigh_enc = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only accepts `sparse`
    neigh_enc = preprocessing.OneHotEncoder(sparse=False)
neigh_onehot = neigh_enc.fit_transform(neigh_inds)

In [67]:
# One-hot row for the first record.
neigh_onehot[0, :]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

### Example: Ordinal variable

In [69]:
# Numeric ordinal scales such as this one can be centered around zero.
pd.unique(data["Overall Qual"])

array([ 5, 10,  9,  6,  7,  8,  3,  2,  4,  1])

In [71]:
# For string-coded scales the ordering is not evident from the values themselves.
pd.unique(data["Bsmt Cond"])

array(['TA', 'Fa', 'Gd', nan, 'Po', 'Ex'], dtype=object)

### Example: Discrete variable

In [73]:
# Median fireplace count across all records.
data.loc[:, "Fireplaces"].median()

1.0

In [None]:
# Center the fireplace counts on their median. The median is computed here
# rather than hard-coded as 1. so the shift stays correct if the data changes.
data["Fireplaces"] - data["Fireplaces"].median()

### Continuous variables

In [81]:
# First five lot-frontage values; note the missing entries.
data.loc[:, "Lot Frontage"].head(n=5)

0     NaN
1    63.0
2     NaN
3    52.0
4    70.0
Name: Lot Frontage, dtype: float64

#### Handling NaNs
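Lot frontage is the linear feet of street connected to the property. Here a NaN is treated as a property with no street frontage, so the most sensible fill value is 0 (ft). This is an assumption: if NaN instead means "not recorded", imputing a typical value such as the median would be more appropriate.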

In [80]:
# Fill missing lot-frontage values with zero. fillna returns a new Series,
# so the column inside `data` keeps its NaNs.
lotfr = data["Lot Frontage"].fillna(value=0)
lotfr.head(n=5)

0     0.0
1    63.0
2     0.0
3    52.0
4    70.0
Name: Lot Frontage, dtype: float64

In [82]:
# Standardize the filled lot frontage: subtract the mean and divide by the
# standard deviation (the function's defaults, spelled out here).
preprocessing.scale(lotfr, axis=0, with_mean=True, with_std=True)

array([-1.70619736,  0.16339685, -1.70619736, ..., -0.8159144 ,
        1.4691452 ,  0.31177735])

## Preprocessing pipeline