In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder

from category_encoders import MEstimateEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
# from category_encoders import TargetEncoder

## Preprocessing

I will begin by using the knowledge I gained in my EDA to remove outliers and drop irrelevant columns.

In [54]:
df_all = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv', index_col=0)

df_dropped = df_all.drop(['PoolQC', 'MiscFeature', 'Alley'], axis=1)
df = df_dropped[[
       # 'OverallQual', 
       # 'Neighborhood', 
       # 'GarageArea', 
       # 'GrLivArea', 
       # 'YearBuilt',
       # 'TotalBsmtSF', 
       # 'LotArea', 
       'BsmtQual', 
       'ExterQual',
       'KitchenQual', 
       # '1stFlrSF', 
       # 'MSSubClass', 
       # 'YearRemodAdd',
       # 'FullBath',
       'GarageFinish', 
       # 'GarageYrBlt', 
       # 'LotFrontage', 
       'FireplaceQu',
       # 'TotRmsAbvGrd', 
       'SalePrice'
       ]].copy()
# df = df.drop(df[(df['SalePrice']<300000) & (df['GrLivArea'] > 4000)].index)

y = df.pop('SalePrice')
log_y = np.log(y)

In [3]:
def custom_imputer(df):
    # df['FireplaceQu'] = df['FireplaceQu'].fillna('None')
    # df['LotFrontage'] = df.groupby(by='Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
    df['GarageFinish'] = df['GarageFinish'].fillna('None')
    # df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
    # df['BsmtQual'] = df['BsmtQual'].fillna('None')
    return df

def log_scaler(df):
    df['GrLivArea'] = np.log(df['GrLivArea'])
    return df


## Pipeline

In [64]:
# Custom transformers using FunctionTransformer
custom_imputer_transformer = FunctionTransformer(func=custom_imputer, validate=False)
log_transformer = FunctionTransformer(func=log_scaler, validate=False)

# Orindal encoding setup
ordinal_features = [
                    'GarageFinish',
                    'BsmtQual',
                    'ExterQual',
                    'KitchenQual', 
                    'FireplaceQu'
                    ]
five_lvls = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_lvls = ['None', 'Unf', 'RFn', 'Fin']


# Target encoding
target_enc_features = ['Neighborhood']

# Onehot encoding setup
onehot_features = ['MSSubClass']

# Features to log scale
log_features = ['GrLivArea']


pipe1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('encoder', OrdinalEncoder(categories=[garage_lvls,five_lvls,five_lvls,five_lvls,five_lvls], 
                                        handle_unknown='error',
    ))
])


# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('SimpleImputer', pipe1, ordinal_features),
        ('MedianImputer', SimpleImputer(strategy='median'), ['LotFrontage']),
        # ('log_scaler', log_transformer, log_features),
        # ('target', MEstimateEncoder( m=.06), target_enc_features),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_features),
    ])


lebosh = pd.DataFrame(preprocessor.fit_transform(df))
lebosh.loc[30:40,:]


Unnamed: 0,0,1,2,3,4
30,1.0,3.0,3.0,3.0,0.0
31,1.0,3.0,3.0,4.0,0.0
32,2.0,5.0,4.0,4.0,0.0
33,2.0,3.0,3.0,4.0,4.0
34,3.0,5.0,5.0,5.0,4.0
35,3.0,5.0,4.0,4.0,4.0
36,1.0,4.0,3.0,3.0,0.0
37,3.0,3.0,3.0,3.0,3.0
38,1.0,3.0,3.0,4.0,0.0
39,0.0,0.0,3.0,2.0,0.0


In [65]:
df.loc[31:41,ordinal_features]

Unnamed: 0_level_0,GarageFinish,BsmtQual,ExterQual,KitchenQual,FireplaceQu
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
31,Unf,TA,TA,TA,
32,Unf,TA,TA,Gd,
33,RFn,Ex,Gd,Gd,
34,RFn,TA,TA,Gd,Gd
35,Fin,Ex,Ex,Ex,Gd
36,Fin,Ex,Gd,Gd,Gd
37,Unf,Gd,TA,TA,
38,Fin,TA,TA,TA,TA
39,Unf,TA,TA,Gd,
40,,,TA,Fa,


In [104]:
# Create the final pipeline with the preprocessor and your model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', Ridge())
])

scores = -1 * cross_val_score(pipeline, df, y,
                                  cv=5,
                                  scoring='neg_mean_squared_error',
                                  error_score='raise',
                                  )
print(scores.mean())

ValueError: Found unknown categories [nan] in column 0 during fit

In [None]:
def cv(my_pipeline):
    scores = -1 * cross_val_score(my_pipeline, X, y,
                                  cv=5,
                                  scoring='neg_mean_squared_error')
    return np.sqrt(scores)