In [1]:
# Execute the helper notebook inside this kernel. Names used below that are not
# defined in this notebook (pd, np, warnings, Pipeline, the custom transformers,
# Tester, ...) are presumably provided by it -- TODO confirm.
%run utils.ipynb

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Remove the row/column display limits so DataFrames are never truncated.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [4]:
# Columns removed right after loading.
cols_to_drop = ["Id", "PID"]

# Load the training set, drop the columns above, then keep only the rows that
# pass both outlier filters identified in part 1.
df = (
    pd.read_csv("../data/train.csv")
    .drop(columns=cols_to_drop)
    .loc[lambda frame: frame['Gr Liv Area'] <= 4500]
    .loc[lambda frame: frame['SalePrice'] > np.expm1(10)]
)

# Independent copy of the cleaned frame.
df_copy = df.copy()

In [5]:
# Column names passed as `feature_names` to create_regressor in the cells below.
# Names such as 'Foundation_PConc' look like one-hot encoded columns -- TODO confirm.
features_to_keep = [
    'Gr Liv Area',
    'Overall Qual',
    'Year Built',
    'Total Bsmt SF',
    'Fireplace Qu',
    'BsmtFin SF 1',
    'Overall Cond',
    'Garage Cars',
    'Functional',
    'Year Remod/Add',
    'Exter Qual',
    'Foundation_PConc',
    'Garage Type_Attchd',
    'Kitchen Qual',
    'Garage Type_Detchd',
    'Lot Frontage',
    'Lot Area',
]

In [6]:
# Columns handed to StandardImpute as its `none` argument.
cols_to_impute_with_none = [
    "Pool QC", "Misc Feature", "Alley", "Fence", "Fireplace Qu",
    "Garage Finish", "Garage Qual", "Garage Cond", "Garage Type",
    "Bsmt Exposure", "BsmtFin Type 2", "Bsmt Cond", "Bsmt Qual", "BsmtFin Type 1",
    "Mas Vnr Type",
]

# Columns handed to StandardImpute as its `zero` argument.
cols_to_impute_with_zero = [
    "Garage Yr Blt", "Mas Vnr Area", "Bsmt Full Bath", "Bsmt Half Bath", "Garage Area",
    "Garage Cars", "Total Bsmt SF", "Bsmt Unf SF", "BsmtFin SF 2", "BsmtFin SF 1",
]

# Columns handed to StandardImpute as its `mode` argument.
cols_to_impute_with_mode = ["Electrical"]

# Columns handed to DropCorrelated, the first step of the pipeline.
# NOTE(review): "Garage Yr Blt" and "Garage Area" are also listed in
# cols_to_impute_with_zero -- confirm StandardImpute tolerates columns that
# have already been dropped.
correlated_to_drop = ["1st Flr SF", "Garage Yr Blt", "TotRms AbvGrd", "Garage Area"]

In [7]:
# Factory that wraps any estimator in the shared preprocessing pipeline.
def create_regressor(model,**kwargs):
    '''
    Build the preprocessing + model pipeline around ``model``.

    model: estimator instance, used as the final 'model' step of the pipeline
    **kwargs: forwarded unchanged to AlignTrainPredict (e.g. feature_names=...)
    return: TransformedTargetRegressor wrapping the pipeline, with np.log1p as
            the target transform and np.expm1 as its inverse
    '''
    steps = [
        ('drop_correlated', DropCorrelated(correlated_to_drop)),
        ('ms_subclass_convert', MSSubClassConvert()),
        ('standard_impute', StandardImpute(none=cols_to_impute_with_none,
                                           zero=cols_to_impute_with_zero,
                                           mode=cols_to_impute_with_mode)),
        ('lot_frotage_impute', LotFrontageImpute()),
        ('ordinal_to_numerical', OrdinalToNumeric()),
        ('onehotencode', OneHotEncode()),
        # Aligns the train and predict DataFrames in case they differ.
        ('align_train_and_predict', AlignTrainPredict(**kwargs)),
        # Does nothing; exists only so external code can retrieve feature names.
        ('passthrough', Passthrough()),
        ('robustscalar', RobustScaler()),
    ]
    # The caller-supplied estimator always comes last.
    steps.append(('model', model))

    return TransformedTargetRegressor(
        regressor=Pipeline(steps=steps),
        func=np.log1p,
        inverse_func=np.expm1,
    )

In [8]:
# Split the cleaned frame into the target (y) and the predictors (X).
y = df["SalePrice"]
X = df.drop(columns="SalePrice")

# Independent backups so X / y can be restored later on.
X_copy, y_copy = X.copy(), y.copy()

#### Without any regularization

In [9]:
# Baseline: ordinary least squares. `feature_names` is forwarded to the
# AlignTrainPredict step (presumably restricting the model to these columns).
reg = create_regressor(LinearRegression(), feature_names=features_to_keep)

# The grid holds a single candidate, so this amounts to a 5-fold
# cross-validation of the baseline model.
grid = GridSearchCV(
    estimator=reg,
    param_grid={'regressor__model__fit_intercept': [True]},
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid.fit(X, y)

# The scorer reports negated RMSE; flip the sign to show the RMSE itself.
display(-grid.best_score_)

24207.397640972507

#### With L1 regularization

In [10]:
# Candidate L1 penalty strengths: 10 evenly spaced values from 0 to 1.
# NOTE(review): the grid includes alpha=0, which scikit-learn's Lasso
# documentation advises against (it recommends LinearRegression instead).
L1_alpha = np.linspace(start=0, stop=1, num=10)

# Lasso model; `feature_names` is forwarded to the AlignTrainPredict step.
reg = create_regressor(Lasso(), feature_names=features_to_keep)

# 5-fold cross-validated search over the penalty strength.
grid = GridSearchCV(
    estimator=reg,
    param_grid={'regressor__model__alpha': L1_alpha},
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid.fit(X, y)

# Best cross-validated RMSE (sign flipped back) and the alpha that achieved it.
display(-grid.best_score_)
display(grid.best_params_)

24207.397640972526

{'regressor__model__alpha': 0.0}

The best L1 model is obtained at $\alpha = 0$, i.e. with no regularization at all.

#### With L2 regularization

In [None]:
# Imported in this cell so it does not depend on utils.ipynb exposing Ridge.
from sklearn.linear_model import Ridge

# Candidate L2 penalty strengths: 10 evenly spaced values from 0 to 100.
L2_alpha=np.linspace(0,100,10)

# Ridge is the L2-penalised linear model, matching this section's heading
# (Lasso applies an L1 penalty and is covered in the previous section).
# `feature_names` is forwarded to the AlignTrainPredict step of the pipeline.
reg=create_regressor(Ridge(),feature_names=features_to_keep)

# 5-fold cross-validated search over the penalty strength.
grid = GridSearchCV(reg,param_grid={'regressor__model__alpha':L2_alpha},
                   cv=5,scoring='neg_root_mean_squared_error')
grid.fit(X,y)

# Best cross-validated RMSE (the scorer is negated, so flip the sign) and the
# alpha that achieved it.
display(-1*grid.best_score_)
display(grid.best_params_)

#### With Elastic Net regularization

In [None]:
# Candidate overall penalty strengths and L1/L2 mixing ratios: 10 evenly spaced
# values from 0 to 1 each (a 10 x 10 grid).
# NOTE(review): the grid includes alpha=0 and l1_ratio=0 -- confirm the
# coordinate-descent solver behaves acceptably at those corners.
EN_alpha = np.linspace(start=0, stop=1, num=10)
EN_l1_ratio = np.linspace(start=0, stop=1, num=10)

# Elastic Net model; `feature_names` is forwarded to the AlignTrainPredict step.
reg = create_regressor(ElasticNet(), feature_names=features_to_keep)

# 5-fold cross-validated search over both hyper-parameters.
grid = GridSearchCV(
    estimator=reg,
    param_grid={
        'regressor__model__alpha': EN_alpha,
        'regressor__model__l1_ratio': EN_l1_ratio,
    },
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid.fit(X, y)

# Best cross-validated RMSE (sign flipped back) and the winning parameters.
display(-grid.best_score_)
display(grid.best_params_)

It appears the best model is still the linear model with no regularization.

### Try a submission

In [None]:
# Instantiate the Tester helper class with the path of the test set.
# Tester is not defined in this notebook (presumably it comes from utils.ipynb).
tester=Tester("../data/test.csv")

In [None]:
# Obtain the test set from the Tester helper; it is passed to lr.predict below.
df_test=tester.get_test_df()

In [None]:
# Restore the predictors from the backup taken before the experiments.
X = X_copy.copy()

# Build the unregularised linear model and train it on the entire training set.
lr = create_regressor(LinearRegression(), feature_names=features_to_keep).fit(X, y)

# Display the fitted estimator as the cell output.
lr

In [None]:
# Predict on the test set. lr is a TransformedTargetRegressor built with
# inverse_func=np.expm1, so predictions are mapped back from the log1p scale.
ypred=lr.predict(df_test)

In [None]:
# Hand the predictions and the output path to the Tester helper to write the
# submission file (file format is decided by Tester.write_submission).
tester.write_submission(ypred,"../data/05_final.csv")