## 2 Pipeline Building

### 2.1 Row Removal
We remove a few rows before the actual pipeline (e.g. outliers) because they would prevent a good training. We will also need to remove rows with no price from the holdout data set.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt

In [2]:
import warnings
warnings.filterwarnings('ignore') # seaborn shows a lot of ugly warnings, let's suppress this for the analysis part

In [3]:
df = pd.read_csv('data/dc_housing/DC_Properties_training.csv', index_col=0, low_memory=False)

df = df.dropna(subset=["PRICE", 'SALEDATE', 'AYB'])

# remove properties very low or high price
df = df[df['PRICE'] >= 20000]
df = df[np.abs(df['PRICE'] - df['PRICE'].mean()) <= (1.2*df['PRICE'].std())]

# remove extreme properties
df = df[(df['HF_BATHRM'] <= 5) & 
        ((df['KITCHENS'] < 10) | (np.isnan(df['KITCHENS']))) &
        (df['FIREPLACES'] < 500) & 
        (df['BEDRM'] < 20)]

df = df[~df['SALEDATE'].isnull()]

In [4]:
x = df.drop('PRICE', axis=1)
y = df.loc[:,['PRICE']]

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

### 2.2 Evaluation Function / Libraries

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor

def compare_predictions(x, y, finalpipeline, mean_price):
    # generate predictions:
    # Note: finalpipeline must have already been fit.
    
    predictions = finalpipeline.predict(x)
    y = y.reset_index()
    y.drop('index', axis=1, inplace=True)
    
    # a "lazy prediction" is where we return the average value of the target for every prediction.
    lazy_predictions = np.full(predictions.shape, mean_price)
    
    final_predictions = pd.DataFrame(pd.concat([y, 
                                                pd.Series(predictions), 
                                                pd.Series(lazy_predictions)], axis=1))
    final_predictions.rename(columns={'Price': 'True values',
                                      0: 'Predicted values',
                                      1: 'Lazy Predicted values'}, inplace=True)
    
    rmse_lazy = sqrt(mean_squared_error(y, lazy_predictions))
    mae_lazy = mean_absolute_error(y, lazy_predictions)
    r2_lazy = r2_score(y, lazy_predictions)
    
    rmse = sqrt(mean_squared_error(y, predictions))
    mae = mean_absolute_error(y, predictions)
    r2 = r2_score(y, predictions)
    
    print('RMSE Lazy Predictor', rmse_lazy)
    print('MAE Lazy Predictor', mae_lazy)
    print('R^2 Lazy Predictor', r2_lazy)
    print()
    print('RMSE', rmse)
    print('MAE', mae)
    print('R^2', r2)
    print()
    print('RMSE Improvement:', rmse_lazy - rmse)
    print('MAE Inprovement:', mae_lazy - mae)
    print('R^2 Improvement:', abs(r2_lazy - r2))
    
    return predictions

### 2.3 Pipeline Preparation

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.model_selection import train_test_split

In [8]:
df.columns

Index(['BATHRM', 'HF_BATHRM', 'HEAT', 'AC', 'NUM_UNITS', 'ROOMS', 'BEDRM',
       'AYB', 'YR_RMDL', 'EYB', 'STORIES', 'SALEDATE', 'PRICE', 'QUALIFIED',
       'SALE_NUM', 'GBA', 'BLDG_NUM', 'STYLE', 'STRUCT', 'GRADE', 'CNDTN',
       'EXTWALL', 'ROOF', 'INTWALL', 'KITCHENS', 'FIREPLACES', 'USECODE',
       'LANDAREA', 'GIS_LAST_MOD_DTTM', 'SOURCE', 'CMPLX_NUM', 'LIVING_GBA',
       'FULLADDRESS', 'CITY', 'STATE', 'ZIPCODE', 'NATIONALGRID', 'LATITUDE',
       'LONGITUDE', 'ASSESSMENT_NBHD', 'ASSESSMENT_SUBNBHD', 'CENSUS_TRACT',
       'CENSUS_BLOCK', 'WARD', 'SQUARE', 'X', 'Y', 'QUADRANT'],
      dtype='object')

In [9]:
cols_num = ['BATHRM','HF_BATHRM','ROOMS','BEDRM','FIREPLACES','YEAR', 'EYB','GBA','LANDAREA', 'AYB', 'CENSUS_TRACT']
cols_ord = ['GRADE','HEAT','ZIPCODE']
cols_cat = ['AC','SOURCE','QUALIFIED']

Optimizing: columns of team #1
```
numeric_columns = ['AYB', --> added
                   'EYB',
                   'CENSUS_TRACT',  --> added
                   'BATHRM',
                   'HF_BATHRM',
                   'ROOMS',
                   'BEDRM',
                   'SALE_NUM',  --> not added
                   'LANDAREA']

binary_columns = ['BLDG_NUM', --> not added
                  'QUALIFIED', --> added
                  'SOURCE']

one_hot_encode_column = ['AC',
                         'QUADRANT'] --> not added

label_encode_column = ['ZIPCODE', --> added
                      'HEAT', --> added
                      'USECODE'] --> not added
```

In [10]:
x = df.drop('PRICE', axis=1)
y = df.loc[:,['PRICE']]
#x.loc[:,cols_all].isnull().any()

In [11]:
x_train.loc[:,cols_all + ['LIVING_GBA', 'SALEDATE']].isnull().any()

BATHRM          False
HF_BATHRM       False
ROOMS           False
BEDRM           False
FIREPLACES      False
YEAR             True
EYB             False
GBA              True
LANDAREA        False
AYB             False
CENSUS_TRACT    False
GRADE            True
HEAT            False
ZIPCODE         False
AC              False
SOURCE          False
QUALIFIED       False
LIVING_GBA       True
SALEDATE        False
dtype: bool

All is as expected - we do have null values for ```GBA``` and ```LIVING_GBA``` (which we will merge into one column), ```GRADE``` which has null values for all condominiums (we will replace these with a standard value) and ```YEAR``` which we will populate from ```SALEDATE```.

Let's start with defining all the classes we'll need in the pipeline. We will test these right after the definition in the same order as we are using them in the pipeline (see section 2.4 to see the definition and order of the pipeline).

In [12]:
class MergeColumns(TransformerMixin):
    def __init__(self, column_one, column_two):
        self.column_one = column_one
        self.column_two = column_two
    
    def fit(self, x, y= None):
        return self
    
    def transform(self, x):
        x[self.column_one] = x[self.column_one].fillna(0) + x[self.column_two].fillna(0)
        x = x.drop(self.column_two, axis=1)
        return x

In [13]:
x_train.loc[:,['SOURCE','GBA','LIVING_GBA']].sample(3, random_state=1)

Unnamed: 0_level_0,SOURCE,GBA,LIVING_GBA
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
81833,Residential,1322.0,
57031,Residential,1200.0,
115217,Condominium,,643.0


In [14]:
merge_columns = MergeColumns('GBA', 'LIVING_GBA')
df_merge_columns = merge_columns.fit_transform(x_train)
x_train.loc[:,['SOURCE','GBA']].sample(3, random_state=1)

Unnamed: 0_level_0,SOURCE,GBA
index,Unnamed: 1_level_1,Unnamed: 2_level_1
81833,Residential,1322.0
57031,Residential,1200.0
115217,Condominium,643.0


In [15]:
class ConvertZeroToN(TransformerMixin):
    def __init__(self, column):
        self.column = column
    
    def fit(self, x, y= None):
        return self
    
    def transform(self, x):
        x[self.column][x[self.column] == '0'] = 'N'
        
        return x

In [16]:
x_train.loc[x_train['AC'] == '0'].loc[:,['AC']].head(3)

Unnamed: 0_level_0,AC
index,Unnamed: 1_level_1
138970,0
144084,0
145849,0


In [17]:
convert_zero_to_n = ConvertZeroToN('AC')
convert_zero_to_n.fit_transform(x_train)
x_train.loc[x_train['AC'] == '0'].loc[:,['AC']].head(3)

Unnamed: 0_level_0,AC
index,Unnamed: 1_level_1


In [18]:
class ConvertStringDateToYear(TransformerMixin):
    def __init__(self, column):
        self.column = column
    
    def fit(self, x, y= None):
        return self
    
    def transform(self, x):
        x[self.column] = pd.to_datetime(x[self.column], format='%Y-%m-%d', errors='coerce')
        x['YEAR'] = x[self.column].dt.year
        
        return x       

In [19]:
convert_string_date_to_year = ConvertStringDateToYear('SALEDATE')
convert_string_date_to_year.fit_transform(x_train)
x_train.loc[:,['YEAR']].head(3)

Unnamed: 0_level_0,YEAR
index,Unnamed: 1_level_1
24911,2003
30320,1997
79456,2014


In [20]:
x_train.loc[:,cols_all].isnull().any()

BATHRM          False
HF_BATHRM       False
ROOMS           False
BEDRM           False
FIREPLACES      False
YEAR            False
EYB             False
GBA             False
LANDAREA        False
AYB             False
CENSUS_TRACT    False
GRADE            True
HEAT            False
ZIPCODE         False
AC              False
SOURCE          False
QUALIFIED       False
dtype: bool

The data cleaning all seems to work fine and removes our null values. In a later step we also should write unit tests to make sure they work fine.

In [21]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, x, y = None):
        return self
    
    def transform(self, x):
        return x.loc[:, self.columns]

Test scaling for numeric columns:

In [22]:
col_sel_num = ColumnSelector(cols_num)
x_train_num = col_sel_num.fit_transform(x_train)
x_train_num.head(3)

Unnamed: 0_level_0,BATHRM,HF_BATHRM,ROOMS,BEDRM,FIREPLACES,YEAR,EYB,GBA,LANDAREA,AYB,CENSUS_TRACT
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
24911,4,0,10,3,2,2003,1958,2536.0,28562,1955.0,801.0
30320,3,0,11,5,2,1997,1957,3464.0,4425,1915.0,1100.0
79456,2,1,6,3,0,2014,2015,2208.0,1862,2014.0,9000.0


In [23]:
scaler = StandardScaler()
x_train_num_scaled = scaler.fit_transform(x_train_num)
pd.DataFrame(x_train_num_scaled).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,2.191778,-0.775505,1.546748,0.278211,1.980233,-1.00101,-0.277804,1.45487,9.976829,0.287979,-1.369089
1,1.1667,-0.775505,1.926922,1.718043,1.980233,-1.908089,-0.314205,2.623916,0.881204,-0.817887,-1.278069
2,0.141623,0.96792,0.026053,0.278211,-0.576072,0.661969,1.79705,1.041672,-0.084619,1.919131,1.126811


Test ordinal encoding for grades:

In [24]:
col_sel_ord = ColumnSelector(cols_ord)
x_train_ord = col_sel_ord.fit_transform(x_train)
x_train_ord.head(3)

Unnamed: 0_level_0,GRADE,HEAT,ZIPCODE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24911,Above Average,Warm Cool,20007.0
30320,Good Quality,Warm Cool,20015.0
79456,Good Quality,Forced Air,20018.0


In [25]:
mapping_ord = [{'col': 'GRADE','mapping': [(None, 0),
                                       ('Low Quality', 1),
                                       ('Fair Quality', 2),
                                       ('Average', 3),
                                       ('Above Average', 4),
                                       ('Good Quality', 5),
                                       ('Very Good', 6),
                                       ('Excellent', 7),
                                       ('Superior', 8),
                                       ('Exceptional-A', 9),
                                       ('Exceptional-B', 10),
                                       ('No Data', 11),
                                       ('Exceptional-D', 12),
                                       ('Exceptional-C', 13)]}]
#ord_encoder = OrdinalEncoder(cols=cols_ord, mapping=mapping_ord) # this would probably help for non RF estimators. Let's check.
ord_encoder = OrdinalEncoder()
x_train_ord_encoded = ord_encoder.fit_transform(x_train_ord)
x_train_ord_encoded.head(3)

Unnamed: 0_level_0,GRADE,HEAT,ZIPCODE
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24911,1,1,20007.0
30320,2,1,20015.0
79456,2,2,20018.0


Test one-hot-encoding for categories:

In [26]:
col_sel_cat = ColumnSelector(cols_cat)
x_train_cat = col_sel_cat.fit_transform(x_train)
x_train_cat.head(3)

Unnamed: 0_level_0,AC,SOURCE,QUALIFIED
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24911,Y,Residential,Q
30320,Y,Residential,Q
79456,Y,Residential,Q


In [27]:
one_hot_encoder = OneHotEncoder(drop_invariant=True)
x_train_one_hot_encoded = one_hot_encoder.fit_transform(x_train_cat)
x_train_one_hot_encoded.head(3)

Unnamed: 0_level_0,AC_1,AC_2,SOURCE_1,SOURCE_2,QUALIFIED_1,QUALIFIED_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
24911,1,0,1,0,1,0
30320,1,0,1,0,1,0
79456,1,0,1,0,1,0


### 2.4 Load Data and Define Pipeline

We need to reset ```x_train``` and ```y_train``` in order for them being used in the pipeline. Let's set up the transformation pipeline using the classes we defined above. 

In [28]:
x_train = df.drop('PRICE', axis=1)
y_train = df.loc[:,['PRICE']]

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

In [30]:
processing_pipeline = make_pipeline(
    
    # RowNanChecker(my_column) # to be implemented if time allows
    MergeColumns('GBA', 'LIVING_GBA'),
    ConvertZeroToN('AC'),
    ConvertStringDateToYear('SALEDATE'),
    ColumnSelector(cols_all),  # we have to select all the columns used in the union
    make_union(
        make_pipeline(ColumnSelector(cols_num),
                      StandardScaler()
        ),
        make_pipeline(ColumnSelector(cols_ord),
                      OrdinalEncoder()
        ),
        make_pipeline(ColumnSelector(cols_cat),
                      OneHotEncoder()
        )
    )
)

In [31]:
pipeline = (make_pipeline(processing_pipeline, RandomForestRegressor(random_state=1, 
                                                                          n_jobs=-1, 
                                                                          n_estimators=100)))

### 2.5 Fit Pipeline and Evaluate Accuracy

In [32]:
pipeline.fit(x_train, pd.Series.ravel(y_train))

Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('mergecolumns', <__main__.MergeColumns object at 0x0000020FB6322550>), ('convertzeroton', <__main__.ConvertZeroToN object at 0x0000020FB6322588>), ('convertstringdatetoyear', <__main__.ConvertStringDateToYear object at 0x0000020FB63225C0>), ('co...stimators=100, n_jobs=-1,
           oob_score=False, random_state=1, verbose=0, warm_start=False))])

In [33]:
pred_train = compare_predictions(x_train, y_train, pipeline, y_train['PRICE'].mean())

RMSE Lazy Predictor 494960.32152078144
MAE Lazy Predictor 302987.27212755894
R^2 Lazy Predictor 0.0

RMSE 309195.97905906546
MAE 152774.9008782249
R^2 0.6097643833561411

RMSE Improvement: 185764.34246171598
MAE Inprovement: 150212.37124933404
R^2 Improvement: 0.6097643833561411


In [34]:
pred_train = compare_predictions(x_test, y_test, pipeline, y_test['PRICE'].mean())

RMSE Lazy Predictor 494699.8282013914
MAE Lazy Predictor 302669.58503934625
R^2 Lazy Predictor 0.0

RMSE 171285.2062030734
MAE 76589.72365685426
R^2 0.8801173897063577

RMSE Improvement: 323414.621998318
MAE Inprovement: 226079.861382492
R^2 Improvement: 0.8801173897063577


MAE 76589