In this notebook we will see the effect of overfitting while duing Linear regression on house price prediction. We will use Ridge and Lasso regression technique to reduce overfitting and improve the result on test datasets.

In [56]:
import pandas as pd

In [71]:
df = pd.read_csv('Melbourne_housing_price.csv')
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Address        34857 non-null  object 
 2   Rooms          34857 non-null  int64  
 3   Type           34857 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         34857 non-null  object 
 6   SellerG        34857 non-null  object 
 7   Date           34857 non-null  object 
 8   Distance       34856 non-null  float64
 9   Postcode       34856 non-null  float64
 10  Bedroom2       26640 non-null  float64
 11  Bathroom       26631 non-null  float64
 12  Car            26129 non-null  float64
 13  Landsize       23047 non-null  float64
 14  BuildingArea   13742 non-null  float64
 15  YearBuilt      15551 non-null  float64
 16  CouncilArea    34854 non-null  object 
 17  Lattitude      26881 non-null  float64
 18  Longti

In [73]:
#There are some features which can be dropped 
#For example, as region name is a feature, we don't need address to have in our experiment
df.drop(['Address', 'Date', 'Postcode', 'YearBuilt', 'CouncilArea', 'Lattitude', 'Longtitude', ], 
        axis=1, inplace=True)

In [74]:
# Now lets check where we have N/A value
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Regionname           3
Propertycount        3
dtype: int64

In [75]:
# Some values are N/A because there are no values to be insereted e.g. Zero
# For example, if car is N/A, it means there is no car parking available and it should be Zero

df[['Distance', 'Bedroom2', 'Bathroom', 'Car', 'Propertycount']] = df[['Distance', 'Bedroom2', 'Bathroom',
                                                                       'Car', 'Propertycount']].fillna(0)

In [76]:
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             0
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
Regionname           3
Propertycount        0
dtype: int64

In [77]:
# We will fill the Landsize and Building are with the average value of that column
# But it is not a perfect way in this kind of problem
df['Landsize'] = df['Landsize'].fillna(df['Landsize'].mean())
df['BuildingArea'] = df['BuildingArea'].fillna(df['BuildingArea'].mean())

In [78]:
df.isna().sum()

Suburb              0
Rooms               0
Type                0
Price            7610
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Regionname          3
Propertycount       0
dtype: int64

In [79]:
df.dropna(subset = ['Regionname'],inplace = True)
df.dropna(subset = ['Price'],inplace = True)

In [80]:
df.isna().sum()

Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Regionname       0
Propertycount    0
dtype: int64

In [81]:
# There are also some categorical features. 
# We need some dummy variables

df_dummies = pd.get_dummies(df.drop('Price', axis = 1), drop_first = True)
df_dummies.head()

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Suburb_Aberfeldie,Suburb_Airport West,...,SellerG_iProperty,SellerG_iSell,SellerG_iTRAK,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
1,2,2.5,2.0,1.0,1.0,202.0,160.2564,4019.0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2,2.5,2.0,1.0,0.0,156.0,79.0,4019.0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,3,2.5,3.0,2.0,0.0,134.0,150.0,4019.0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,3,2.5,3.0,2.0,1.0,94.0,160.2564,4019.0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,4,2.5,3.0,1.0,2.0,120.0,142.0,4019.0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [91]:
# Now separate the train and test sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_dummies, df['Price'], test_size = 0.35)

In [92]:
# Let's create the Linear Regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [93]:
# Now lets check how our model performed

print('Score on training data', model.score(X_train, y_train))
print('Score on test data', model.score(X_test, y_test))

Score on training data 0.6831640983370021
Score on test data 0.6651836976887342


In [98]:
# It can not be said that the model is overfitted with the training data
# But the overall performance is not good 
# We will still implemente L1 & L2 regularization
# Even though regularization helps to reduce overfitting, it will not improve the training accuracy
from sklearn.linear_model import Lasso, Ridge
lasso_reg = Lasso(alpha = 10, max_iter = 500)

In [99]:
lasso_reg.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=10, max_iter=500)

In [100]:
lasso_reg.score(X_train, y_train), lasso_reg.score(X_test, y_test)


(0.6825737430396677, 0.6664168627039068)

In [101]:
ridge_reg = Ridge(alpha = 10, max_iter = 500)
ridge_reg.fit(X_train, y_train)
ridge_reg.score(X_train, y_train), lasso_reg.score(X_test, y_test)

(0.6758057807027189, 0.6664168627039068)