In [416]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np

In [417]:
housing_df = pd.read_csv('Melbourne_housing_FULL.csv')

housing_df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [418]:
housing_df.nunique()

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64

In [419]:
housing_df.shape

(34857, 21)

In [420]:
columns_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price']

housing_df = housing_df[columns_to_use]

housing_df.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0


In [421]:
housing_df.shape

(34857, 15)

In [422]:
housing_df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [423]:
cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']

In [424]:
housing_df[cols_to_fill_zero] = housing_df[cols_to_fill_zero].fillna(0)

housing_df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        0
Distance             0
CouncilArea          3
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [425]:
housing_df['BuildingArea'].fillna(housing_df['BuildingArea'].mean(), inplace=True)

In [426]:
housing_df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        0
Distance             0
CouncilArea          3
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea         0
Price             7610
dtype: int64

In [427]:
housing_df['Landsize'].fillna(housing_df['Landsize'].mean(), inplace=True)

In [428]:
housing_df.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          3
Propertycount       0
Distance            0
CouncilArea         3
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price            7610
dtype: int64

In [429]:
housing_df['Price'].fillna(housing_df['Price'].mean(), inplace=True)

In [430]:
housing_df.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       3
Propertycount    0
Distance         0
CouncilArea      3
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [431]:
housing_df.dropna(inplace=True)

housing_df.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [432]:
housing_df.head()

suburb_encoding = pd.get_dummies(housing_df['Suburb'])

suburb_encoding.head()

Unnamed: 0,Abbotsford,Aberfeldie,Airport West,Albanvale,Albert Park,Albion,Alphington,Altona,Altona Meadows,Altona North,...,Williamstown North,Windsor,Wollert,Wonga Park,Wyndham Vale,Yallambie,Yarra Glen,Yarraville,croydon,viewbank
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [433]:
housing_df = pd.concat([housing_df, suburb_encoding], axis='columns')

housing_df.drop(columns=['Suburb'], inplace=True)

housing_df.head()

Unnamed: 0,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,...,Williamstown North,Windsor,Wollert,Wonga Park,Wyndham Vale,Yallambie,Yarra Glen,Yarraville,croydon,viewbank
0,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,...,0,0,0,0,0,0,0,0,0,0
4,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,...,0,0,0,0,0,0,0,0,0,0


In [434]:
type_encoding = pd.get_dummies(housing_df['Type'], drop_first=True)
region_name_encoding = pd.get_dummies(housing_df['Regionname'], drop_first=True)
seller_encoding = pd.get_dummies(housing_df['SellerG'], drop_first = True)
council_area_encoding = pd.get_dummies(housing_df['CouncilArea'], drop_first=True)
method_encoding = pd.get_dummies(housing_df['Method'], drop_first=True)

In [435]:
housing_df = pd.concat([housing_df, type_encoding, region_name_encoding, seller_encoding, council_area_encoding, method_encoding], axis='columns')

housing_df = housing_df.drop(columns=['Type', 'Regionname', 'SellerG', 'CouncilArea', 'Method'])

housing_df.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Abbotsford,...,Yarra City Council,Yarra Ranges Shire Council,PN,S,SA,SN,SP,SS,VB,W
0,2,4019.0,2.5,2.0,1.0,1.0,126.0,160.2564,1050173.0,1,...,1,0,0,0,0,0,0,1,0,0
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,160.2564,1480000.0,1,...,1,0,0,1,0,0,0,0,0,0
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,1,...,1,0,0,1,0,0,0,0,0,0
3,3,4019.0,2.5,3.0,2.0,1.0,0.0,160.2564,1050173.0,1,...,1,0,0,0,0,0,0,0,1,0
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,1,...,1,0,0,0,0,0,1,0,0,0


In [436]:
housing_df.columns

Index(['Rooms', 'Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'Price', 'Abbotsford',
       ...
       'Yarra City Council', 'Yarra Ranges Shire Council', 'PN', 'S', 'SA',
       'SN', 'SP', 'SS', 'VB', 'W'],
      dtype='object', length=795)

In [437]:
x = housing_df.drop(columns=['Price'])
y = housing_df['Price']

In [438]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3)

In [439]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(train_x, train_y)

LinearRegression()

In [440]:
reg.score(test_x, test_y)

0.48499139745422015

In [441]:
reg.score(train_x, train_y)

0.49031334882790734

In [451]:
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=50, max_iter=100, tol=0.1)

lasso_reg.fit(train_x, train_y)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=50, max_iter=100, tol=0.1)

In [443]:
lasso_reg.score(test_x, test_y)

0.48937569485225363

In [444]:
lasso_reg.score(train_x, train_y)

0.4846683932251006

In [446]:
from sklearn.linear_model import Ridge

r_model = Ridge()

r_model.fit(train_x, train_y)

Ridge()

In [447]:
r_model.score(test_x, test_y)

0.48744154367076953

In [448]:
r_model.score(train_x, train_y)

0.48954303124039733