# Predicting housing prices with supervised learning - Linear Regression

# Data Loading

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
training = pd.read_csv("data//housing_train.csv", encoding="utf-8") 
testing = pd.read_csv("data//housing_test.csv", encoding="utf-8") 



# Data cleaning and preparation


In [33]:
# TODO
# Data overview
training.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [35]:
training.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [36]:
correlations = training.corr()
correlations = correlations["SalePrice"].sort_values(ascending=False)
correlations

  correlations = training.corr()


SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [37]:
training_null = pd.isnull(training).sum()
testing_null = pd.isnull(testing).sum()
null = pd.concat([training_null,testing_null],axis =1 , keys = ['Training','Testing'])

In [38]:
print(null)

               Training  Testing
Id                    0      0.0
MSSubClass            0      0.0
MSZoning              0      4.0
LotFrontage         259    227.0
LotArea               0      0.0
...                 ...      ...
MoSold                0      0.0
YrSold                0      0.0
SaleType              0      1.0
SaleCondition         0      0.0
SalePrice             0      NaN

[81 rows x 2 columns]


In [39]:
null_many = null[null.sum(axis=1)>200]
null_few = null[(null.sum(axis=1)>0) & (null.sum(axis=1)<200)]
print(null_many)

             Training  Testing
LotFrontage       259    227.0
Alley            1369   1352.0
FireplaceQu       690    730.0
PoolQC           1453   1456.0
Fence            1179   1169.0
MiscFeature      1406   1408.0


In [40]:
null_objects = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "MasVnrType","BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature"]

for i in null_objects:
    training[i].fillna('None', inplace = True)
    testing[i].fillna('None', inplace = True)


In [43]:
training_null = pd.isnull(training).sum()
testing_null = pd.isnull(testing).sum()
null = pd.concat([training_null,testing_null],axis =1 , keys = ['Training','Testing'])
print(null)

               Training  Testing
Id                    0      0.0
MSSubClass            0      0.0
MSZoning              0      4.0
LotFrontage         259    227.0
LotArea               0      0.0
...                 ...      ...
MoSold                0      0.0
YrSold                0      0.0
SaleType              0      1.0
SaleCondition         0      0.0
SalePrice             0      NaN

[81 rows x 2 columns]


In [44]:
null_many = null[null.sum(axis=1)>200]
null_few = null[(null.sum(axis=1)>0) & (null.sum(axis=1)<200)]
print(null_many)

             Training  Testing
LotFrontage       259    227.0


In [45]:
null_few

Unnamed: 0,Training,Testing
MSZoning,0,4.0
Utilities,0,2.0
Exterior1st,0,1.0
Exterior2nd,0,1.0
MasVnrArea,8,15.0
BsmtFinSF1,0,1.0
BsmtFinSF2,0,1.0
BsmtUnfSF,0,1.0
TotalBsmtSF,0,1.0
Electrical,1,0.0


In [48]:
from sklearn.impute import SimpleImputer
Imputer = SimpleImputer(strategy = 'median')
training["GarageYrBlt"].fillna(training["GarageYrBlt"].median(), inplace = True)
testing["GarageYrBlt"].fillna(testing["GarageYrBlt"].median(), inplace = True)
training["MasVnrArea"].fillna(training["MasVnrArea"].median(), inplace = True)
testing["MasVnrArea"].fillna(testing["MasVnrArea"].median(), inplace = True)
train_types = training.dtypes
test_types = testing.dtypes
num_train = train_types[(train_types==int) | (train_types==float)]
cat_train = train_types[train_types==object]
num_test = test_types[(test_types==int) | (test_types==float)]
cat_test = test_types[test_types==object]
num_trainval = list(num_train.index)
num_testval = list(num_test.index)
cat_trainval = list(cat_train.index)
cat_testval = list(cat_test.index)
fill_num = []

for i in num_trainval:
    if i in list(null_few.index):
        fill_num.append(i)

print(fill_num)

['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea']


In [50]:
training["newprice"] = np.log(training["SalePrice"])

In [53]:
for i in cat_trainval:
    feature_set = set(training[i])
    for j in feature_set:
        feature_list= list(feature_set)
        training.loc[training[i]==j,i] = feature_list.index(j)
        
for i in cat_testval:
    feature_set = set(testing[i])
    for j in feature_set:
        feature_list= list(feature_set)
        testing.loc[testing[i]==j,i] = feature_list.index(j)  

In [70]:
training.fillna(0, inplace=True)

# Modeling


In [79]:
X_tr = training.drop(['Id','SalePrice','newprice'],axis=1)
Y_tr = training['newprice'].values
X_te = testing.drop(['Id'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_tr,Y_tr,random_state=0)

In [81]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, y_train)



In [84]:
print (linreg.intercept_)
print (linreg.coef_)

3.858984670863501
[-2.09232782e-04 -2.37966754e-02  4.83671698e-06  2.56282985e-06
  2.28523833e-01  9.74725417e-04  3.90749686e-03  1.82032226e-02
  1.26154038e-01  2.45569781e-03 -4.25428764e-02 -2.13509727e-03
 -7.11378560e-03 -3.36461395e-02 -4.02193688e-03  1.27551536e-03
  6.86336762e-02  4.52991426e-02  1.50742552e-03  9.64233515e-04
 -4.33562689e-03  1.07068201e-02  3.03429029e-03 -2.20278715e-03
  4.09496029e-03  3.99740397e-06  1.48267263e-02 -1.92925262e-02
 -8.69691407e-03  1.19000086e-02 -5.42620871e-03  8.00772505e-03
 -1.01886255e-03  5.48240540e-05 -3.84011128e-03  2.84932142e-05
 -5.69438769e-06  7.76228805e-05  1.87062875e-02  9.46004026e-03
  5.20506703e-02 -2.21021456e-02  6.16580628e-05  4.01552893e-05
  5.95727514e-05  1.61386109e-04  3.42276303e-02  1.21630100e-02
  3.12591066e-02  1.90781150e-02 -7.06671644e-04 -5.85195521e-02
 -1.53904718e-02  1.12673727e-02  1.26364630e-02  4.68662513e-02
 -3.29356511e-03 -2.61936048e-03 -9.25610067e-05 -6.32915549e-03
  3.710

In [110]:
rfe = RFE(linreg, n_features_to_select=20, step=1)
rfe = rfe.fit(X_train, y_train)

# Evaluation


In [111]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[('MSSubClass', False, 48),
 ('MSZoning', False, 2),
 ('LotFrontage', False, 56),
 ('LotArea', False, 58),
 ('Street', True, 1),
 ('Alley', False, 44),
 ('LotShape', False, 25),
 ('LandContour', False, 9),
 ('Utilities', False, 24),
 ('LotConfig', False, 33),
 ('LandSlope', False, 15),
 ('Neighborhood', False, 34),
 ('Condition1', False, 27),
 ('Condition2', True, 1),
 ('BldgType', False, 39),
 ('HouseStyle', False, 45),
 ('OverallQual', True, 1),
 ('OverallCond', True, 1),
 ('YearBuilt', False, 40),
 ('YearRemodAdd', False, 42),
 ('RoofStyle', False, 29),
 ('RoofMatl', False, 21),
 ('Exterior1st', False, 37),
 ('Exterior2nd', False, 38),
 ('MasVnrType', False, 36),
 ('MasVnrArea', False, 57),
 ('ExterQual', False, 12),
 ('ExterCond', False, 6),
 ('Foundation', False, 22),
 ('BsmtQual', False, 20),
 ('BsmtCond', False, 28),
 ('BsmtExposure', False, 7),
 ('BsmtFinType1', False, 47),
 ('BsmtFinSF1', True, 1),
 ('BsmtFinType2', False, 32),
 ('BsmtFinSF2', True, 1),
 ('BsmtUnfSF', True, 1)

In [112]:
col = X_train.columns[rfe.support_]
col

Index(['Street', 'Condition2', 'OverallQual', 'OverallCond', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'CentralAir', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
       'HalfBath', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'SaleType'],
      dtype='object')

In [113]:
X_train.columns[~rfe.support_]

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Alley', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'BldgType', 'HouseStyle', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'Electrical', 'BsmtHalfBath', 'BedroomAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleCondition'],
      dtype='object')

In [114]:
X_train_rfe = X_train[col]

In [115]:
import statsmodels.api as sm  
X_train_rfe = sm.add_constant(X_train_rfe)

In [116]:
lm = sm.OLS(y_train,X_train_rfe).fit()   # Running the linear model

In [117]:
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.874
Model:                            OLS   Adj. R-squared:                  0.871
Method:                 Least Squares   F-statistic:                     413.0
Date:                Mon, 03 Apr 2023   Prob (F-statistic):               0.00
Time:                        10:37:06   Log-Likelihood:                 571.60
No. Observations:                1095   AIC:                            -1105.
Df Residuals:                    1076   BIC:                            -1010.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           10.5137      0.096    109.502   

In [118]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
X = X_train_rfe
vif['Features'] = X.columns

# compute VIF values for each variable
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# replace inf values with a high value (e.g., 100)
vif.replace([np.inf, -np.inf], 100, inplace=True)

vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif



  vif = 1. / (1. - r_squared_i)


Unnamed: 0,Features,VIF
0,const,481.25
7,BsmtUnfSF,100.0
13,GrLivArea,100.0
12,LowQualFinSF,100.0
11,2ndFlrSF,100.0
8,TotalBsmtSF,100.0
10,1stFlrSF,100.0
6,BsmtFinSF2,100.0
5,BsmtFinSF1,100.0
3,OverallQual,2.57
