# Preprocessing Pipeline

## Missing values

In [None]:
# Option 1: drop the rows where total_bedrooms is missing.
# There is no inplace=True and no assignment, so housing itself is not modified here.
housing.dropna(subset=["total_bedrooms"])

In [None]:
# Option 2: remove the whole attribute (column) rather than individual rows.
housing.drop(columns="total_bedrooms")

In [None]:
# Option 3: set the missing values to some value (zero, mean, median, etc.)
median = housing["total_bedrooms"].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that in-place call acts on an intermediate object, raises a
# chained-assignment warning on pandas 2.x and leaves housing untouched under
# Copy-on-Write.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

In [None]:
# using sklearn - the imputer can only work on numerical attributes
# Imputer was removed from sklearn.preprocessing (scikit-learn 0.22);
# SimpleImputer in sklearn.impute is its replacement.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)  # drop text column
imputer.fit(housing_num)  # fit
X = imputer.transform(housing_num)  # transform, returns np
# can turn back into pd; reuse the original index so the rows still line up
# with housing instead of being renumbered 0..n-1
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)

## Handling Text / Categorical Attributes

In [None]:
# converting text categories to numbers
from sklearn.preprocessing import OrdinalEncoder

# The encoder expects 2-D input, hence the double brackets (a one-column
# DataFrame rather than a Series).
housing_cat = housing[["ocean_proximity"]]
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

In [None]:
# one-hot encoding

In [None]:
# sklearn
# CategoricalEncoder only existed in a scikit-learn development version; the
# released API for one-hot encoding is OneHotEncoder.
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()

In [None]:
# making dummies

## Custom Transformations

In [None]:
# write classes and implement fit(), transform(), and fit_transform()
# The right-hand sides were missing (a syntax error). These are the same two
# ratios that CombinedAttributesAdder.transform computes by column position.
# NOTE(review): the column names total_rooms / households / population are
# assumed from the California housing dataset - confirm against housing.columns.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["population_per_household"] = housing["population"] / housing["households"]

In [None]:
# here's an example for adding bedrooms / rooms
from sklearn.base import BaseEstimator, TransformerMixin

# Positional column indices read by CombinedAttributesAdder.transform.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features as extra columns on the right of a 2-D array.

    Always appended: rooms per household and population per household.
    Appended last when ``add_bedrooms_per_room`` is true: bedrooms per room.
    The source columns are located through the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Nothing is learned from the data; the transformer is stateless.
        return self

    def transform(self, X, y=None):
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extras = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extras.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks X and every 1-D ratio as columns, in this order.
        return np.c_[(X, *extras)]

# add_bedrooms_per_room=False -> only the two per-household ratios are appended
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# to turn it back into pd; keep housing's index so the rows stay aligned with
# the original frame instead of being renumbered 0..n-1
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns) + ["rooms_per_household", "population_per_household"],
    index=housing.index)

# Make Pipeline

In [None]:
# custom transformer to select a subset of df based on name
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the columns named in ``attribute_names`` and return their ``.values``."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless: there is nothing to learn from X.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values

In [None]:
housing_num = housing.drop('ocean_proximity', axis=1)
num_attribs = list(housing_num)  # iterating a DataFrame yields its column names
cat_attribs = ["ocean_proximity"]

# pipeline for numerical features

# Imputer was removed from sklearn.preprocessing (use sklearn.impute.SimpleImputer)
# and CategoricalEncoder was never part of a released scikit-learn (use
# OneHotEncoder, which the categorical pipeline below already refers to).
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# select numeric columns -> fill gaps with the median -> add ratio features -> standardize
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

# pipeline for categorical features
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        # dense output; the keyword is sparse_output on scikit-learn >= 1.2
        # (it was called `sparse` in earlier releases)
        ('cat_encoder', OneHotEncoder(sparse_output=False)),
    ])

In [None]:
# join the two together: both pipelines receive the same input and their
# outputs are concatenated side by side
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),  # the opening bracket was typed as ")" - a syntax error
])

In [None]:
# run it! fit_transform fits full_pipeline on housing and returns the
# transformed feature matrix in one call
housing_prepared = full_pipeline.fit_transform(housing)