## 2 Pipeline Building

### 2.1 Row Removal
We remove a few rows before the actual pipeline (e.g. outliers) because they would prevent a good training. We will also need to remove rows with no price from the holdout data set.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt

In [2]:
import warnings
warnings.filterwarnings('ignore') # seaborn shows a lot of ugly warnings, let's suppress this for the analysis part

In [3]:
df = pd.read_csv('data/dc_housing/DC_Properties_training.csv', index_col=0, low_memory=False)

df = df[~np.isnan(df['PRICE'])]
#df = df[df['PRICE'] >= 20000]
#df = df[np.abs(df['PRICE'] - df['PRICE'].mean()) <= (1.2*df['PRICE'].std())]

df = df[(df['HF_BATHRM'] <= 5) & 
        ((df['KITCHENS'] < 10) | (np.isnan(df['KITCHENS']))) &
        (df['FIREPLACES'] < 500) & 
        (df['BEDRM'] < 20)]

df = df[~df['SALEDATE'].isnull()]

### 2.2 Evaluation Function / Libraries

In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor

def compare_predictions(x, y, finalpipeline, mean_price):
    # generate predictions:
    # Note: finalpipeline must have already been fit.
    
    predictions = finalpipeline.predict(x)
    y = y.reset_index()
    y.drop('index', axis=1, inplace=True)
    
    # a "lazy prediction" is where we return the average value of the target for every prediction.
    lazy_predictions = np.full(predictions.shape, mean_price)
    
    final_predictions = pd.DataFrame(pd.concat([y, 
                                                pd.Series(predictions), 
                                                pd.Series(lazy_predictions)], axis=1))
    final_predictions.rename(columns={'Price': 'True values',
                                      0: 'Predicted values',
                                      1: 'Lazy Predicted values'}, inplace=True)
    
    rmse_lazy = sqrt(mean_squared_error(y, lazy_predictions))
    mae_lazy = mean_absolute_error(y, lazy_predictions)
    r2_lazy = r2_score(y, lazy_predictions)
    
    rmse = sqrt(mean_squared_error(y, predictions))
    mae = mean_absolute_error(y, predictions)
    r2 = r2_score(y, predictions)
    
    print('RMSE Lazy Predictor', rmse_lazy)
    print('MAE Lazy Predictor', mae_lazy)
    print('R^2 Lazy Predictor', r2_lazy)
    print()
    print('RMSE', rmse)
    print('MAE', mae)
    print('R^2', r2)
    print()
    print('RMSE Improvement:', rmse_lazy - rmse)
    print('MAE Inprovement:', mae_lazy - mae)
    print('R^2 Improvement:', abs(r2_lazy - r2))
    
    return predictions

### 2.3 Pipeline Preparation

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.model_selection import train_test_split

In [6]:
df.columns

Index(['BATHRM', 'HF_BATHRM', 'HEAT', 'AC', 'NUM_UNITS', 'ROOMS', 'BEDRM',
       'AYB', 'YR_RMDL', 'EYB', 'STORIES', 'SALEDATE', 'PRICE', 'QUALIFIED',
       'SALE_NUM', 'GBA', 'BLDG_NUM', 'STYLE', 'STRUCT', 'GRADE', 'CNDTN',
       'EXTWALL', 'ROOF', 'INTWALL', 'KITCHENS', 'FIREPLACES', 'USECODE',
       'LANDAREA', 'GIS_LAST_MOD_DTTM', 'SOURCE', 'CMPLX_NUM', 'LIVING_GBA',
       'FULLADDRESS', 'CITY', 'STATE', 'ZIPCODE', 'NATIONALGRID', 'LATITUDE',
       'LONGITUDE', 'ASSESSMENT_NBHD', 'ASSESSMENT_SUBNBHD', 'CENSUS_TRACT',
       'CENSUS_BLOCK', 'WARD', 'SQUARE', 'X', 'Y', 'QUADRANT'],
      dtype='object')

In [7]:
#cols_disc = ['BATHRM','HF_BATHRM','ROOMS','BEDRM','FIREPLACES','YEAR']
#cols_cont = ['EYB','GBA','LANDAREA']
cols_num = ['BATHRM','HF_BATHRM','ROOMS','BEDRM','FIREPLACES','YEAR', 'EYB','GBA','LANDAREA']
cols_ord = ['GRADE']
cols_cat = ['AC','SOURCE']

#cols_all = cols_disc + cols_cont + cols_ord + cols_cat
cols_all = cols_num + cols_ord + cols_cat

In [8]:
x = df.drop('PRICE', axis=1)
y = df.loc[:,['PRICE']]
#x.loc[:,cols_all].isnull().any()

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

In [10]:
x_train.loc[:,cols_all + ['LIVING_GBA', 'SALEDATE']].isnull().any()

BATHRM        False
HF_BATHRM     False
ROOMS         False
BEDRM         False
FIREPLACES    False
YEAR           True
EYB           False
GBA            True
LANDAREA      False
GRADE          True
AC            False
SOURCE        False
LIVING_GBA     True
SALEDATE      False
dtype: bool

All is as expected - we do have null values for ```GBA``` and ```LIVING_GBA``` (which we will merge into one column), ```GRADE``` which has null values for all condominiums (we will replace these with a standard value) and ```YEAR``` which we will populate from ```SALEDATE```.

Let's start with defining all the classes we'll need in the pipeline. We will test these right after the definition in the same order as we are using them in the pipeline (see section 2.4 to see the definition and order of the pipeline).

In [11]:
class MergeColumns(TransformerMixin):
    def __init__(self, column_one, column_two):
        self.column_one = column_one
        self.column_two = column_two
    
    def fit(self, x, y= None):
        return self
    
    def transform(self, x):
        x[self.column_one] = x[self.column_one].fillna(0) + x[self.column_two].fillna(0)
        x = x.drop(self.column_two, axis=1)
        return x

In [12]:
x_train.loc[:,['SOURCE','GBA','LIVING_GBA']].sample(3, random_state=1)

Unnamed: 0_level_0,SOURCE,GBA,LIVING_GBA
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2994,Residential,2192.0,
128836,Condominium,,1662.0
122208,Condominium,,324.0


In [13]:
merge_columns = MergeColumns('GBA', 'LIVING_GBA')
df_merge_columns = merge_columns.fit_transform(x_train)
x_train.loc[:,['SOURCE','GBA']].sample(3, random_state=1)

Unnamed: 0_level_0,SOURCE,GBA
index,Unnamed: 1_level_1,Unnamed: 2_level_1
2994,Residential,2192.0
128836,Condominium,1662.0
122208,Condominium,324.0


In [14]:
class ConvertZeroToN(TransformerMixin):
    def __init__(self, column):
        self.column = column
    
    def fit(self, x, y= None):
        return self
    
    def transform(self, x):
        x[self.column][x[self.column] == '0'] = 'N'
        
        return x

In [15]:
x_train.loc[x_train['AC'] == '0'].loc[:,['AC']].head(3)

Unnamed: 0_level_0,AC
index,Unnamed: 1_level_1
43000,0
144084,0
144076,0


In [16]:
convert_zero_to_n = ConvertZeroToN('AC')
convert_zero_to_n.fit_transform(x_train)
x_train.loc[x_train['AC'] == '0'].loc[:,['AC']].head(3)

Unnamed: 0_level_0,AC
index,Unnamed: 1_level_1


In [17]:
class ConvertStringDateToYear(TransformerMixin):
    def __init__(self, column):
        self.column = column
    
    def fit(self, x, y= None):
        return self
    
    def transform(self, x):
        x[self.column] = pd.to_datetime(x[self.column], format='%Y-%m-%d', errors='coerce')
        x['YEAR'] = x[self.column].dt.year
        
        return x       

In [18]:
convert_string_date_to_year = ConvertStringDateToYear('SALEDATE')
convert_string_date_to_year.fit_transform(x_train)
x_train.loc[:,['YEAR']].head(3)

Unnamed: 0_level_0,YEAR
index,Unnamed: 1_level_1
155050,2009
112994,1995
30395,2004


In [19]:
x_train.loc[:,cols_all].isnull().any()

BATHRM        False
HF_BATHRM     False
ROOMS         False
BEDRM         False
FIREPLACES    False
YEAR          False
EYB           False
GBA           False
LANDAREA      False
GRADE          True
AC            False
SOURCE        False
dtype: bool

The data cleaning all seems to work fine and removes our null values. In a later step we also should write unit tests to make sure they work fine.

In [20]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, x, y = None):
        return self
    
    def transform(self, x):
        return x.loc[:, self.columns]

Test scaling for numeric columns:

In [21]:
col_sel_num = ColumnSelector(cols_num)
x_train_num = col_sel_num.fit_transform(x_train)
x_train_num.head(3)

Unnamed: 0_level_0,BATHRM,HF_BATHRM,ROOMS,BEDRM,FIREPLACES,YEAR,EYB,GBA,LANDAREA
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
155050,2,1,4,2,0,2009,1900,1005.0,599
112994,2,0,4,2,0,1995,1960,1027.0,1148
30395,1,1,6,3,1,2004,1960,1322.0,3630


In [22]:
scaler = StandardScaler()
x_train_num_scaled = scaler.fit_transform(x_train_num)
pd.DataFrame(x_train_num_scaled).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.147663,0.967529,-0.728502,-0.434724,-0.426134,-0.095199,-2.394575,-0.465997,-0.546402
1,0.147663,-0.774144,-0.728502,-0.434724,-0.426134,-2.211523,-0.208826,-0.438521,-0.344289
2,-0.876478,0.967529,0.030793,0.283102,0.516963,-0.851029,-0.208826,-0.070098,0.569453


Test ordinal encoding for grades:

In [23]:
col_sel_ord = ColumnSelector(cols_ord)
x_train_ord = col_sel_ord.fit_transform(x_train)
x_train_ord.head(3)

Unnamed: 0_level_0,GRADE
index,Unnamed: 1_level_1
155050,
112994,
30395,Good Quality


In [24]:
mapping_ord = [{'col': 'GRADE','mapping': [(None, 0),
                                       ('Low Quality', 1),
                                       ('Fair Quality', 2),
                                       ('Average', 3),
                                       ('Above Average', 4),
                                       ('Good Quality', 5),
                                       ('Very Good', 6),
                                       ('Excellent', 7),
                                       ('Superior', 8),
                                       ('Exceptional-A', 9),
                                       ('Exceptional-B', 10),
                                       ('No Data', 11),
                                       ('Exceptional-D', 12),
                                       ('Exceptional-C', 13)]}]
ord_encoder = OrdinalEncoder(cols=cols_ord, mapping=mapping_ord)
x_train_ord_encoded = ord_encoder.fit_transform(x_train_ord)
x_train_ord_encoded.head(3)

Unnamed: 0_level_0,GRADE
index,Unnamed: 1_level_1
155050,0
112994,0
30395,5


Test one-hot-encoding for categories:

In [25]:
col_sel_cat = ColumnSelector(cols_cat)
x_train_cat = col_sel_cat.fit_transform(x_train)
x_train_cat.head(3)

Unnamed: 0_level_0,AC,SOURCE
index,Unnamed: 1_level_1,Unnamed: 2_level_1
155050,Y,Condominium
112994,Y,Condominium
30395,Y,Residential


In [26]:
one_hot_encoder = OneHotEncoder(drop_invariant=True)
x_train_one_hot_encoded = one_hot_encoder.fit_transform(x_train_cat)
x_train_one_hot_encoded.head(3)

Unnamed: 0_level_0,AC_1,AC_2,SOURCE_1,SOURCE_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
155050,1,0,1,0
112994,1,0,1,0
30395,1,0,0,1


### 2.4 Load Data and Define Pipeline

We need to reset ```x_train``` and ```y_train``` in order for them being used in the pipeline. Let's set up the transformation pipeline using the classes we defined above. 

In [27]:
x_train = df.drop('PRICE', axis=1)
y_train = df.loc[:,['PRICE']]

In [28]:
processing_pipeline = make_pipeline(
    
    # RowNanChecker(my_column) # to be implemented if time allows
    MergeColumns('GBA', 'LIVING_GBA'),
    ConvertZeroToN('AC'),
    ConvertStringDateToYear('SALEDATE'),
    ColumnSelector(cols_all),  # we have to select all the columns used in the union
    make_union(
        # First, we select and 'hold out' the discrete variables, as we wont do any further work to them.
        make_pipeline(ColumnSelector(cols_num),
                      StandardScaler()
        ),
        make_pipeline(ColumnSelector(cols_ord),
                      OrdinalEncoder()
        ),
        make_pipeline(ColumnSelector(cols_cat),
                      OneHotEncoder()
        )
    )
)

In [29]:
pipeline = (make_pipeline(processing_pipeline, RandomForestRegressor(random_state=1, 
                                                                          n_jobs=-1, 
                                                                          n_estimators=100)))

### 2.5 Fit Pipeline and Evaluate Accuracy

In [30]:
pipeline.fit(x_train, pd.Series.ravel(y_train))

Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('mergecolumns', <__main__.MergeColumns object at 0x000002A801A9F160>), ('convertzeroton', <__main__.ConvertZeroToN object at 0x000002A801A9F198>), ('convertstringdatetoyear', <__main__.ConvertStringDateToYear object at 0x000002A801A9F2E8>), ('co...stimators=100, n_jobs=-1,
           oob_score=False, random_state=1, verbose=0, warm_start=False))])

In [31]:
pred_train = compare_predictions(x_train, y_train, pipeline, y_train['PRICE'].mean())

RMSE Lazy Predictor 7031964.244319852
MAE Lazy Predictor 943573.40586325
R^2 Lazy Predictor 0.0

RMSE 5456647.309092143
MAE 528230.193196893
R^2 0.3978586381683459

RMSE Improvement: 1575316.9352277098
MAE Inprovement: 415343.21266635705
R^2 Improvement: 0.3978586381683459


In [32]:
df_holdout = pd.read_csv('data/dc_housing/holdout_test_data.csv', index_col=0, low_memory=False)
df_holdout = df_holdout[~np.isnan(df_holdout['PRICE'])]

In [33]:
x_test = df_holdout.drop('PRICE', axis=1)
y_test = df_holdout.loc[:,['PRICE']]

In [34]:
pred_train = compare_predictions(x_test, y_test, pipeline, y_test['PRICE'].mean())

RMSE Lazy Predictor 9550662.233815953
MAE Lazy Predictor 1661050.334916615
R^2 Lazy Predictor 0.0

RMSE 4486150.59644549
MAE 332891.06372184394
R^2 0.7793617905404602

RMSE Improvement: 5064511.637370463
MAE Inprovement: 1328159.271194771
R^2 Improvement: 0.7793617905404602
