In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder

from category_encoders import MEstimateEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score
# from category_encoders import TargetEncoder

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, Lasso, LassoCV, LassoLarsCV
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Never truncate the column list when a wide DataFrame is displayed.
pd.options.display.max_columns = None
# pd.set_option('display.max_rows', None)

## Preprocessing

I will begin by using the knowledge I gained in my EDA to remove outliers and drop irrelevant columns.

In [5]:
# Read the training data; the first CSV column is used as the row index.
df_all = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv', index_col=0)

# Remove the columns judged irrelevant during the EDA.
df_dropped = df_all.drop(columns=['PoolQC', 'MiscFeature', 'Alley'])

# Features retained for modelling, followed by the target column.
selected_columns = [
    'OverallQual', 'Neighborhood', 'GarageArea', 'GrLivArea', 'YearBuilt',
    'TotalBsmtSF', 'LotArea', 'BsmtQual', 'ExterQual', 'KitchenQual',
    '1stFlrSF', 'MSSubClass', 'YearRemodAdd', 'FullBath', 'GarageFinish',
    'GarageYrBlt', 'LotFrontage', 'FireplaceQu', 'TotRmsAbvGrd',
    'SalePrice',
]
df = df_dropped.loc[:, selected_columns].copy()

# Outliers: rows with a very large living area but a comparatively low sale price.
is_outlier = (df['GrLivArea'] > 4000) & (df['SalePrice'] < 300000)
df = df.drop(index=df.loc[is_outlier].index)

# Split the target off the feature frame and model it on the log scale.
y = df.pop('SalePrice')
log_y = np.log(y)

## Pipeline

One could argue there is little need for a pipeline with a relatively simple model; however, it is good practice because it reduces data leakage in cross-validation and allows for easy experimentation with models and preprocessing methods.

Remember that a column transformer can be used to transform different columns, but it doesn't chain the transforms together. So if you need to apply two transformations to one column, you need to put them in their own pipeline and then put that pipeline in the column transformer, as I've done here.

In [31]:
# Need to just use multivariate imputer for this I think
# def custom_imputer(df):
#     df['LotFrontage'] = df.groupby(by='Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
#     return df
# custom_imputer_transformer = FunctionTransformer(func=custom_imputer, validate=False)

def log_scaler(df):
    """Return a copy of ``df`` with ``GrLivArea`` replaced by its natural log.

    The input frame is left untouched: the logged column is written to a new
    DataFrame rather than assigned back into the caller's object. This keeps
    the function safe to call repeatedly on the same data (no accidental
    double-logging) and avoids writing into a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``GrLivArea`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, where
        ``GrLivArea`` holds ``np.log`` of the original values.

    Raises
    ------
    KeyError
        If ``df`` has no ``GrLivArea`` column.
    """
    # `assign` copies the frame before overwriting the existing column,
    # so column order is preserved and the caller's data is not mutated.
    return df.assign(GrLivArea=np.log(df['GrLivArea']))

# Expose the log scaling function as a scikit-learn compatible transformer.
log_transformer = FunctionTransformer(log_scaler, validate=False)

# Ordinal encoding setup: the columns to encode, in the order their
# category lists are handed to the encoder.
ordinal_features = ['GarageFinish', 'BsmtQual', 'ExterQual', 'KitchenQual', 'FireplaceQu']

# Category orderings used for ordinal encoding; a category's position in the
# list is the integer it is encoded as (index 0 is 'None').
five_lvls = [
    'None',
    'Po',
    'Fa',
    'TA',
    'Gd',
    'Ex',
]
garage_lvls = [
    'None',
    'Unf',
    'RFn',
    'Fin',
]

# Columns reserved for target encoding.
target_enc_features = ['Neighborhood']

# Columns to one-hot encode.
onehot_features = ['MSSubClass', 'Neighborhood']

# Columns to put on a log scale.
log_features = ['GrLivArea']


# Two chained steps for the ordinal columns: fill missing entries with the
# string 'None', then encode each category as its position in its level list.
preprocessPipe1 = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    (
        'encoder',
        OrdinalEncoder(
            # One level list per ordinal column: the garage levels first,
            # followed by the five-level scale for the other four columns.
            categories=[garage_lvls] + [five_lvls] * 4,
            handle_unknown='error',
        ),
    ),
])


# Column-wise preprocessing. Each entry is (name, transformer, columns);
# columns not listed in any entry are passed through unchanged.
preprocessor_steps = [
    ('Pipe1', preprocessPipe1, ordinal_features),
    ('MedianImputer', SimpleImputer(strategy='median'), ['LotFrontage']),
    ('imputeGaragerYrBlt', SimpleImputer(strategy='constant', fill_value=0), ['GarageYrBlt']),
    ('log_scaler', log_transformer, log_features),
    # Disabled alternative to one-hot encoding the neighbourhood:
    # ('targetEncoder', MEstimateEncoder( m=.06), target_enc_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_features),
]
preprocessor = ColumnTransformer(transformers=preprocessor_steps, remainder='passthrough')


# Full estimator: the preprocessing stage feeding an ElasticNet regressor.
# With l1_ratio=1.0 the ElasticNet penalty is purely L1.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet(l1_ratio=1.0, alpha=0.0001)),
])

# lebosh = pd.DataFrame(preprocessor.fit_transform(df))
# # lebosh.isna().any().sum()
# lebosh.head(10)


In [32]:
def cv(my_pipeline):
    """Cross-validate ``my_pipeline`` on the notebook's ``df`` and ``log_y``.

    Runs 5-fold cross-validation scored with negative mean squared error,
    converts each fold's score into a root mean squared error, and returns
    the mean of those per-fold values.
    """
    neg_mse_per_fold = cross_val_score(
        my_pipeline,
        df,
        log_y,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    # Scores are negated MSEs, so flip the sign before taking the root.
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

# Score the baseline pipeline once, then report it.
rmsle = cv(pipeline)
print(f'The RMSLE in SalePrice is : \n{rmsle}')

The RMSLE in SalePrice is : 
0.13409685261578358


### Grid search

In [26]:
# Candidate hyperparameters; the `model__` prefix routes each value to the
# pipeline step named 'model'.
params = {
    'model__alpha': [0.01, 0.001, 0.0001, 0.00001],
    'model__l1_ratio': [1.0, 0.9, 0.8, 0.7],
}

# Exhaustive search over every combination, each evaluated with 5-fold
# cross-validation on negative mean squared error.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
)
gs.fit(df, log_y)

# Show the best-scoring parameter combination as the cell output.
gs.best_params_


Fitting 5 folds for each of 16 candidates, totalling 80 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'model__alpha': 0.0001, 'model__l1_ratio': 1.0}

## Stacking models