In [126]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
# from category_encoders import TargetEncoder

## Preprocessing

I will begin by using the knowledge I gained in my EDA to remove outliers and drop irrelevant columns.

In [30]:
df_all = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv', index_col=0)

In [31]:
df_dropped = df_all.drop(['PoolQC', 'MiscFeature', 'Alley'], axis=1)
df = df_dropped[['OverallQual', 'Neighborhood', 'GarageArea', 'GrLivArea', 'YearBuilt',
       'TotalBsmtSF', 'LotArea', 'BsmtQual', 'ExterQual',
       'KitchenQual', '1stFlrSF', 'MSSubClass', 'YearRemodAdd', 'FullBath',
       'GarageFinish', 'GarageYrBlt', 'LotFrontage', 'FireplaceQu',
       'TotRmsAbvGrd', 'SalePrice']].copy()
df = df.drop(df[(df['SalePrice']<300000) & (df['GrLivArea'] > 4000)].index)

In [32]:
y = df.pop('SalePrice')
log_y = np.log(y)

In [24]:
df.dtypes

OverallQual       int64
Neighborhood     object
GarageArea        int64
GrLivArea         int64
YearBuilt         int64
TotalBsmtSF       int64
LotArea           int64
BsmtQual         object
ExterQual        object
KitchenQual      object
1stFlrSF          int64
MSSubClass        int64
YearRemodAdd      int64
FullBath          int64
GarageFinish     object
GarageYrBlt     float64
LotFrontage     float64
FireplaceQu      object
TotRmsAbvGrd      int64
SalePrice         int64
dtype: object

In [21]:
df.head(10)

Unnamed: 0_level_0,OverallQual,Neighborhood,GarageArea,GrLivArea,YearBuilt,TotalBsmtSF,LotArea,BsmtQual,ExterQual,KitchenQual,1stFlrSF,MSSubClass,YearRemodAdd,FullBath,GarageFinish,GarageYrBlt,LotFrontage,FireplaceQu,TotRmsAbvGrd,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,7,CollgCr,548,1710,2003,856,8450,Gd,Gd,Gd,856,60,2003,2,RFn,2003.0,65.0,,8,208500
2,6,Veenker,460,1262,1976,1262,9600,Gd,TA,TA,1262,20,1976,2,RFn,1976.0,80.0,TA,6,181500
3,7,CollgCr,608,1786,2001,920,11250,Gd,Gd,Gd,920,60,2002,2,RFn,2001.0,68.0,TA,6,223500
4,7,Crawfor,642,1717,1915,756,9550,TA,TA,Gd,961,70,1970,1,Unf,1998.0,60.0,Gd,7,140000
5,8,NoRidge,836,2198,2000,1145,14260,Gd,Gd,Gd,1145,60,2000,2,RFn,2000.0,84.0,TA,9,250000
6,5,Mitchel,480,1362,1993,796,14115,Gd,TA,TA,796,50,1995,1,Unf,1993.0,85.0,,5,143000
7,8,Somerst,636,1694,2004,1686,10084,Ex,Gd,Gd,1694,20,2005,2,RFn,2004.0,75.0,Gd,7,307000
8,7,NWAmes,484,2090,1973,1107,10382,Gd,TA,TA,1107,60,1973,2,RFn,1973.0,,TA,7,200000
9,7,OldTown,468,1774,1931,952,6120,TA,TA,TA,1022,50,1950,2,Unf,1931.0,51.0,TA,8,129900
10,5,BrkSide,205,1077,1939,991,7420,TA,TA,TA,1077,190,1950,1,RFn,1939.0,50.0,TA,5,118000


In [132]:
def custom_imputer(df):
    df['FireplaceQu'] = df['FireplaceQu'].fillna('None')
    df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
    df['GarageFinish'] = df['GarageFinish'].fillna('None')
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
    df['BsmtQual'] = df['BsmtQual'].fillna('None')
    return df

def log_scaler(df):
    df['GrLivArea'] = np.log(df['GrLivArea'])
    return df


## Pipeline

In [133]:
# Custom transformers using FunctionTransformer
custom_imputer_transformer = FunctionTransformer(func=custom_imputer, validate=False)
log_transformer = FunctionTransformer(func=log_scaler, validate=False)

# Orindal encoding setup
ordinal_features = ['BsmtQual','ExterQual', 'KitchenQual', 'GarageFinish','FireplaceQu']
five_lvls = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_lvls = ['None', 'Unf', 'RFn', 'Fin']


# Target encoding
target_enc_features = ['Neighborhood']

# Onehot encoding setup
onehot_features = ['MSSubClass']

# Features to log scale
log_features = ['GrLivArea']


# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('custom', custom_imputer_transformer, ['FireplaceQu', 'LotFrontage', 'GarageFinish', 'GarageYrBlt', 'BsmtQual']),
        ('log_scaler', log_transformer, log_features),
        ('ordinal', OrdinalEncoder(categories=[five_lvls,five_lvls,five_lvls,garage_lvls,five_lvls]), ordinal_features),
        ('target', LabelEncoder(), target_enc_features),
        ('onehot', OneHotEncoder(), onehot_features)
    ])

# Create the final pipeline with the preprocessor and your model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', Ridge())])


In [137]:
scores = -1 * cross_val_score(pipeline, df, y,
                                  cv=5,
                                  scoring='neg_mean_squared_error',
                                  error_score='raise')

KeyError: 'Neighborhood'

In [None]:
def cv(my_pipeline):
    scores = -1 * cross_val_score(my_pipeline, X, y,
                                  cv=5,
                                  scoring='neg_mean_squared_error')
    return np.sqrt(scores)