# House price prediction with linear regression

In [108]:
# %pip install pipenv
# %pip install scikit-learn
# %pip install seaborn
# %pip install matplotlib
# %pip install numpy
# %pip install tensorflow
# %pip install xgboost
# %pip install ipympl
%matplotlib widget

In [109]:
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


In [110]:
train_dataset = pd.read_csv('train_inliers.csv')

In [111]:
X_all = train_dataset.iloc[:,:-1]
Y_all = train_dataset.iloc[:,-1]
X_train, X_validate, Y_train, Y_validate = train_test_split(X_all, Y_all, test_size=0.2)
print(X_train.shape, Y_train.shape, X_validate.shape, Y_validate.shape)
# Y_train = np.log(Y_train)
# Y_validate = np.log(Y_validate)

(1137, 81) (1137,) (285, 81) (285,)


In [112]:
def check_model(X_train, Y_train, X_validate, Y_validate, model, preprocessor):
    print(X_train.shape, Y_train.shape)
    preprocessor.fit(X_train)
    X_train_scaled = preprocessor.transform(X_train)
    print('X_train_scaled shape', X_train_scaled.shape)
    model.fit(X_train_scaled, Y_train)
    Y_train_predict = model.predict(X_train_scaled)
    train_r2score = r2_score(Y_train, Y_train_predict)
    print(f'Train R2={train_r2score:.3f}')

    X_validate_scaled = preprocessor.transform(X_validate)
    Y_validate_predict = model.predict(X_validate_scaled)
    validate_r2score = r2_score(Y_validate, Y_validate_predict)
    print(f'Validate R2={validate_r2score:.3f}')

    return validate_r2score


In [113]:
def build_preprocessor():
    return make_pipeline(
        ColumnTransformer(transformers=[
           ('numeric1', StandardScaler(), ['LotArea', 'GrLivArea', 'OverallQual', 'YearBuilt'])
        ])
    )

def build_model():
    model = LinearRegression()
    return model

check_model(X_train, Y_train, X_validate, Y_validate, build_model(), build_preprocessor())

(1137, 81) (1137,)
X_train_scaled shape (1137, 4)
Train R2=0.788
Validate R2=0.791


0.791

In [114]:
def build_preprocessor_2():
    return make_pipeline(
        ColumnTransformer(transformers=[
           ('numeric1', StandardScaler(), 
            ['LotArea', 'GrLivArea', 'OverallQual', 'YearBuilt', 'GarageArea', 'Fireplaces']
            )
        ])
    )

check_model(X_train, Y_train, X_validate, Y_validate, build_model(), build_preprocessor_2())

(1137, 81) (1137,)
X_train_scaled shape (1137, 6)
Train R2=0.806
Validate R2=0.816


0.816

In [115]:
def build_preprocessor_3():
    return make_pipeline(
        ColumnTransformer(transformers=[
           ('numeric1', StandardScaler(), ['OverallQual', 'YearBuilt', 'Fireplaces', 'GarageArea']),
           ('numeric_log', FunctionTransformer(lambda x: np.log(x+0.001)), ['LotArea', 'GrLivArea'])
        ])
    )

check_model(X_train, Y_train, X_validate, Y_validate, build_model(), build_preprocessor_3())

(1137, 81) (1137,)
X_train_scaled shape (1137, 6)
Train R2=0.798
Validate R2=0.807


0.807

In [116]:
selected_numerical_features = [
    'OverallQual', 'YearBuilt', 'Fireplaces', 'GarageArea',
    'LotArea', 'GrLivArea',
    'YearRemodAdd', 'TotRmsAbvGrd',
    'WoodDeckSF', 
]

def build_preprocessor_3a():
    return make_pipeline(
        ColumnTransformer(transformers=[
           ('numeric1', StandardScaler(), selected_numerical_features),
        ])
    )

check_model(X_train, Y_train, X_validate, Y_validate, build_model(), build_preprocessor_3a())

(1137, 81) (1137,)
X_train_scaled shape (1137, 9)
Train R2=0.815
Validate R2=0.820


0.820

## Enhance with one hot encoded features

In [117]:
one_hot_features = [
'MSZoning',
'LotShape',
'LandContour',
'LotConfig',
'LandSlope',
'Neighborhood',
'BldgType',
'HouseStyle',
# 'RoofStyle',
# 'Exterior1st',
# 'Exterior2nd',
# 'Foundation',
# 'CentralAir',
# 'Electrical',
# 'GarageType',
'GarageFinish',
# 'SaleType',
#  'MSSubClass'
# 'SaleCondition',
# 'RoofMatl' #Missing ['ClyTile', 'Metal'] in train data
# 'BldgType'
]

def build_preprocessor_with_onehot_1():
    return make_pipeline(
        ColumnTransformer(transformers=[
           ('numeric1', StandardScaler(), selected_numerical_features),
           ('one_hot', OneHotEncoder(), one_hot_features)
        ])
    )

check_model(X_train, Y_train, X_validate, Y_validate, build_model(), build_preprocessor_with_onehot_1())

(1137, 81) (1137,)
X_train_scaled shape (1137, 72)
Train R2=0.877
Validate R2=0.877


0.877

## Enhance with ordinal features

In [118]:
ordinal_values_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_features = [
    'ExterQual',
    'ExterCond',
    # 'HeatingQC',
    # 'KitchenQual'
]
ordinal_value_per_feature = len(ordinal_features)*[ordinal_values_order]

def build_preprocessor_with_ordinal():
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='mean'),
        StandardScaler()
    )
    ordinal_encoder = make_pipeline(
        OrdinalEncoder(categories=ordinal_value_per_feature, 
                                      encoded_missing_value=ordinal_values_order.index('TA')),
        StandardScaler()
    )
    one_hot_encoded = make_pipeline(
        SimpleImputer(strategy='most_frequent'), OneHotEncoder()
    )
    return make_pipeline(
        ColumnTransformer(transformers=[
           ('numeric1', numeric_transformer, selected_numerical_features),
           ('one_hot', one_hot_encoded, one_hot_features),
           ('ordinal', ordinal_encoder, ordinal_features),
        ])
    )

check_model(X_train, Y_train, X_validate, Y_validate, build_model(), build_preprocessor_with_ordinal())

(1137, 81) (1137,)
X_train_scaled shape (1137, 73)
Train R2=0.881
Validate R2=0.878


0.878

### Test with MasVnrArea

In [119]:
def build_preprocessor_with_masvnrarea():
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='mean'),
        StandardScaler()
    )
    numeric_transformer_2 = make_pipeline(
        SimpleImputer(strategy='constant', fill_value=0),
        StandardScaler() 
    )
    ordinal_encoder = make_pipeline(
        OrdinalEncoder(categories=ordinal_value_per_feature, 
                                      encoded_missing_value=ordinal_values_order.index('TA')),
        StandardScaler()
    )
    one_hot_encoded = make_pipeline(
        SimpleImputer(strategy='most_frequent'), OneHotEncoder()
    )
    return make_pipeline(
        ColumnTransformer(transformers=[
           ('numeric1', numeric_transformer, selected_numerical_features),
           ('numeric2', numeric_transformer_2, ['MasVnrArea']),
           ('one_hot', one_hot_encoded, one_hot_features),
           ('ordinal', ordinal_encoder, ordinal_features),
        ])
    )

check_model(X_train, Y_train, X_validate, Y_validate, build_model(), build_preprocessor_with_masvnrarea())

(1137, 81) (1137,)
X_train_scaled shape (1137, 74)
Train R2=0.882
Validate R2=0.881


0.881

## Check model with polynomials

In [120]:
def build_preprocessor_4(degree):
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='mean'),
        StandardScaler(),
        PolynomialFeatures(degree=degree, include_bias=False),
        StandardScaler(with_mean=False)
    )
    numeric_transformer_2 = make_pipeline(
        SimpleImputer(strategy='constant', fill_value=0),
        StandardScaler() 
    )
    ordinal_encoder = make_pipeline(
        OrdinalEncoder(categories=ordinal_value_per_feature, 
                                      encoded_missing_value=ordinal_values_order.index('TA')),
        StandardScaler()
    )
    one_hot_encoded = make_pipeline(
        SimpleImputer(strategy='most_frequent'), OneHotEncoder()
    )
    return make_pipeline(
        ColumnTransformer(transformers=[
           ('numeric1', numeric_transformer, selected_numerical_features),
           ('numeric2', numeric_transformer_2, ['MasVnrArea']),
           ('one_hot', one_hot_encoded, one_hot_features),
           ('ordinal', ordinal_encoder, ordinal_features),
        ])
    )

def model_2(alpha):
    return Ridge(alpha=alpha)

best_params = (None, None)
best_result = None

for degree in range(2, 4):
    alpha = 3
    print(f'Degree={degree}, alpha={alpha}')
    preprocessor = build_preprocessor_4(degree)
    model = model_2(alpha)
    score = check_model(X_train, Y_train, X_validate, Y_validate, model, preprocessor)
    if not best_result or score > best_result:
        best_result = score
        best_params = (degree, alpha)
    print('---')

all_alphas = [1.0, 3.0, 10.0, 30.0, 100.0]
for alpha in all_alphas:
    degree = 2
    print(f'Degree={degree}, alpha={alpha}')
    preprocessor = build_preprocessor_4(degree)
    model = model_2(alpha)
    score = check_model(X_train, Y_train, X_validate, Y_validate, model, preprocessor)
    if score > best_result:
        best_result = score
        best_params = (degree, alpha)
    print('---')

print(f'The best score: {best_result} for degree={best_params[0]}, alpha={best_params[1]}')

Degree=2, alpha=3
(1137, 81) (1137,)
X_train_scaled shape (1137, 119)
Train R2=0.904
Validate R2=0.892
---
Degree=3, alpha=3
(1137, 81) (1137,)
X_train_scaled shape (1137, 284)
Train R2=0.929
Validate R2=0.878
---
Degree=2, alpha=1.0
(1137, 81) (1137,)
X_train_scaled shape (1137, 119)
Train R2=0.905
Validate R2=0.891
---
Degree=2, alpha=3.0
(1137, 81) (1137,)
X_train_scaled shape (1137, 119)
Train R2=0.904
Validate R2=0.892
---
Degree=2, alpha=10.0
(1137, 81) (1137,)
X_train_scaled shape (1137, 119)
Train R2=0.902
Validate R2=0.891
---
Degree=2, alpha=30.0
(1137, 81) (1137,)
X_train_scaled shape (1137, 119)
Train R2=0.896
Validate R2=0.890
---
Degree=2, alpha=100.0
(1137, 81) (1137,)
X_train_scaled shape (1137, 119)
Train R2=0.887
Validate R2=0.887
---
The best score: 0.8918143927491584 for degree=2, alpha=3


## Checkpoint with the best model

In [121]:
def build_best_preprocessor(degree=2):
    numeric_transformer = make_pipeline(
        SimpleImputer(strategy='mean'),
        StandardScaler(),
        PolynomialFeatures(degree=degree, include_bias=False),
        StandardScaler(with_mean=False)
    )
    numeric_transformer_2 = make_pipeline(
        SimpleImputer(strategy='constant', fill_value=0),
        StandardScaler() 
    )
    ordinal_encoder = make_pipeline(
        OrdinalEncoder(categories=ordinal_value_per_feature, 
                                      encoded_missing_value=ordinal_values_order.index('TA')),
        StandardScaler()
    )
    one_hot_encoded = make_pipeline(
        SimpleImputer(strategy='most_frequent'), OneHotEncoder()
    )
    return make_pipeline(
        ColumnTransformer(transformers=[
           ('numeric1', numeric_transformer, selected_numerical_features),
           ('numeric2', numeric_transformer_2, ['MasVnrArea']),
           ('one_hot', one_hot_encoded, one_hot_features),
           ('ordinal', ordinal_encoder, ordinal_features),
        ])
    )

def build_best_model(alpha=30.0):
    return Ridge(alpha=alpha)

In [139]:
best_preprocessor = build_best_preprocessor()
best_model = build_best_model()
check_model(X_train, Y_train, X_validate, Y_validate, best_model, best_preprocessor)

(1137, 81) (1137,)
X_train_scaled shape (1137, 119)
Train R2=0.896
Validate R2=0.890


0.890

In [140]:
X_train_scaled = best_preprocessor.transform(X_train)
Y_train_predic = best_model.predict(X_train_scaled)
df_prediction = pd.DataFrame(data={'y': Y_train, 'y-predict': Y_train_predic})
df_prediction.head(n=20)

Unnamed: 0,y,y-predict
508,234000,231532.167378
1282,206900,192940.542038
1056,192000,212893.388568
1275,147000,130597.883614
530,179665,203568.810975
107,136900,149710.614882
1360,137500,119772.474407
380,426000,396712.722133
529,133000,133772.605974
1421,147500,137419.144974


In [141]:
X_validate_scaled = best_preprocessor.transform(X_validate)
Y_validate_predic = best_model.predict(X_validate_scaled)
df_v_prediction = pd.DataFrame(data={'y': Y_validate, 'y-predict': Y_validate_predic})
df_v_prediction.head(n=20)

Unnamed: 0,y,y-predict
272,415298,355347.233809
1401,149700,149247.905282
260,185000,173270.612249
328,214000,209180.810144
1352,131000,130836.321201
484,91300,105274.658183
166,228000,250104.044189
250,207500,207927.071991
1380,340000,333263.418042
1284,208900,191662.313925


## F-statistic and p-values

In [142]:
%precision 3
preprocessed_columns = best_preprocessor.get_feature_names_out()
from sklearn.feature_selection import f_regression
X_train_scaled = best_preprocessor.transform(X_train)
f_stat, p_values = f_regression(X_train_scaled,Y_train.to_numpy().reshape(1,-1)[0])
df_stat = pd.DataFrame(data={'fstat': f_stat, 'pvalue': p_values}, index=preprocessed_columns)
pd.set_option('display.max_rows', 200)
df_stat

Unnamed: 0,fstat,pvalue
numeric1__OverallQual,2081.543282,5.436350999999999e-259
numeric1__YearBuilt,555.876432,2.353973e-100
numeric1__Fireplaces,298.905037,1.2538090000000001e-59
numeric1__GarageArea,737.43571,1.567939e-125
numeric1__LotArea,91.820103,5.750004e-21
numeric1__GrLivArea,1144.112296,5.031759e-174
numeric1__YearRemodAdd,504.628254,9.365245e-93
numeric1__TotRmsAbvGrd,395.829435,8.534092e-76
numeric1__WoodDeckSF,136.280377,8.115484e-30
numeric1__OverallQual^2,107.858783,3.365016e-24


## Test dataset evaluation

In [143]:
X_test = pd.read_csv('test.csv')
X_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [144]:
tmp = X_test[selected_numerical_features + ordinal_features + one_hot_features]

tmp.isnull().any()

OverallQual     False
YearBuilt       False
Fireplaces      False
GarageArea       True
LotArea         False
GrLivArea       False
YearRemodAdd    False
TotRmsAbvGrd    False
WoodDeckSF      False
ExterQual       False
ExterCond       False
MSZoning         True
LotShape        False
LandContour     False
LotConfig       False
LandSlope       False
Neighborhood    False
BldgType        False
HouseStyle      False
GarageFinish     True
dtype: bool

In [145]:
X_test_scaled = best_preprocessor.transform(X_test)
Y_test_predict = best_model.predict(X_test_scaled)

In [146]:
df_output = pd.DataFrame(data={'Id': X_test['Id'], 'SalePrice': Y_test_predict})
df_output.head()

Unnamed: 0,Id,SalePrice
0,1461,126589.909552
1,1462,146319.253683
2,1463,175726.048901
3,1464,185086.491959
4,1465,205405.442017


In [147]:
# df_output.to_csv('./A_submission.csv', index=False)