In [2]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [3]:

# Suppress Warnings for clean notebook
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the housing sales table from `melb_data.csv`
# (the path is resolved relative to the current working directory).
dataset = pd.read_csv(filepath_or_buffer='melb_data.csv')

In [5]:
# Peek at the first five rows to see which columns and value types are present.
dataset.head(n=5)


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,03-12-2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,04-02-2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,04-03-2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,04-03-2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,04-06-2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [6]:

# (rows, columns) of the table as loaded, before any cleaning
dataset.shape

(13580, 21)

In [7]:
# Count the missing entries in each column (`isnull` is an alias of `isna`).
dataset.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

Handling missing values

In [8]:
# Columns where a missing entry is interpreted as "feature absent" and stored
# as 0 (for instance, no car space recorded for the property). Judging by the
# null counts printed above, `Car` is the only one of these with actual gaps,
# so the fill is a no-op for the other four columns.
cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
dataset[cols_to_fill_zero] = dataset[cols_to_fill_zero].fillna(value=0)

# The continuous size features receive their column mean: a quick imputation
# that keeps the focus on reducing overfitting with Lasso and Ridge regression.
# NOTE(review): the means are computed over the whole table, i.e. before the
# train/test split further down -- confirm this leakage is acceptable here.
dataset['Landsize'] = dataset['Landsize'].fillna(dataset['Landsize'].mean())
dataset['BuildingArea'] = dataset['BuildingArea'].fillna(dataset['BuildingArea'].mean())

In [9]:
# Discard every row that still has a missing value in any column.
# NOTE(review): judging by the shapes printed around this cell, this removes
# roughly 45% of the rows (13580 -> 7448); YearBuilt and CouncilArea are the
# columns with gaps that were not imputed beforehand.
dataset = dataset.dropna()

In [10]:
# (rows, columns) remaining after the missing-value handling
dataset.shape

(7448, 21)

In [11]:
# NOTE(review): repeats the shape check from the previous cell; nothing has
# modified `dataset` in between, so the result is the same.
dataset.shape

(7448, 21)

In [12]:
# One-hot encode the string-valued columns, dropping the first level of each
# so that the indicator columns of a feature are not perfectly collinear.
# NOTE(review): without a `columns=` argument every text column is encoded,
# including near-unique ones such as Address. That likely explains the
# training R^2 of 1.0 reported for plain linear regression further down --
# confirm this is intended for the overfitting demonstration.
dataset = pd.get_dummies(data=dataset, drop_first=True)

In [13]:

# Inspect the encoded frame: text columns are now replaced by indicator columns.
dataset.head(n=5)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,...,CouncilArea_Wyndham,CouncilArea_Yarra,CouncilArea_Yarra Ranges,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
1,2,1035000,2.5,3067,2,1,0.0,156,79.0,1900.0,...,0,1,0,0,1,0,0,0,0,0
2,3,1465000,2.5,3067,3,2,0.0,134,150.0,1900.0,...,0,1,0,0,1,0,0,0,0,0
4,4,1600000,2.5,3067,3,1,2.0,120,142.0,2014.0,...,0,1,0,0,1,0,0,0,0,0
6,3,1876000,2.5,3067,4,2,0.0,245,210.0,1910.0,...,0,1,0,0,1,0,0,0,0,0
7,2,1636000,2.5,3067,2,1,2.0,256,107.0,1890.0,...,0,1,0,0,1,0,0,0,0,0


Let's split our dataset into training and test sets.

In [14]:
# Separate the regression target (the sale price) from the predictor columns.
y = dataset['Price']
X = dataset.drop(columns=['Price'])

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [17]:
from sklearn.linear_model import LinearRegression

# Unregularised least-squares baseline that the penalised models are compared
# against. `fit` returns the estimator itself, so the chained call is kept.
reg = (
    LinearRegression()
    .fit(X=train_X, y=train_y)
)

In [18]:
# R^2 of the unregularised baseline on the held-out rows.
reg.score(X=test_X, y=test_y)

0.6723464306496185

In [19]:
# R^2 of the same baseline on the rows it was trained on, for comparison.
reg.score(X=train_X, y=train_y)

1.0

Ordinary linear regression is clearly overfitting the data (R² of 1.0 on the training set versus about 0.67 on the test set), so let's try regularized models.

# Using Lasso (L1 Regularized) Regression Model

In [42]:
from sklearn import linear_model

# L1-penalised regression. `alpha` sets the penalty strength; the small
# iteration cap and loose tolerance let the solver stop early.
# NOTE(review): no feature scaling is applied in the visible cells, so the
# penalty acts on columns with very different magnitudes -- confirm intended.
lasso_reg = linear_model.Lasso(alpha=390, tol=0.4, max_iter=100)
lasso_reg.fit(X=train_X, y=train_y)


In [43]:

# R^2 of the Lasso model on the held-out rows.
lasso_reg.score(X=test_X, y=test_y)

0.6859193734023344

In [44]:
# R^2 of the Lasso model on its training rows; the gap to the test score
# indicates how much the model overfits.
lasso_reg.score(X=train_X, y=train_y)

0.7305298981286172

# Using Ridge (L2 Regularized) Regression Model

In [45]:
from sklearn.linear_model import Ridge

# L2-penalised regression with penalty strength `alpha`.
# NOTE(review): with the default solver='auto' and a dense feature matrix,
# scikit-learn presumably selects a closed-form solver that ignores
# `max_iter`/`tol` -- verify whether these two arguments have any effect here.
ridge_reg = Ridge(alpha=50, tol=0.1, max_iter=100)
ridge_reg.fit(X=train_X, y=train_y)

In [46]:

# R^2 of the Ridge model on the held-out rows.
ridge_reg.score(X=test_X, y=test_y)

0.7055911539099998

In [47]:
# R^2 of the Ridge model on its training rows, for comparison with the test score.
ridge_reg.score(X=train_X, y=train_y)

0.7145929486565008