In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import linear_model
import time
%matplotlib inline

from general_utils import categorize_objects, dummify_all_categories, standardize_series_value

In [4]:
# Load the Melbourne house-price listing; the path is relative to the notebook's working directory.
raw_csv_path = "MELBOURNE_HOUSE_PRICES_LESS.csv"
housing = pd.read_csv(raw_csv_path)

In [5]:
# Peek at the first rows to sanity-check the parsed columns.
housing.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [6]:
# Column dtypes and non-null counts; used to spot missing values before modelling.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52964 entries, 0 to 52963
Data columns (total 12 columns):
Suburb           52964 non-null object
Address          52964 non-null object
Rooms            52964 non-null int64
Type             52964 non-null object
Price            41196 non-null float64
Method           52964 non-null object
SellerG          52964 non-null object
Date             52964 non-null object
Regionname       52964 non-null object
Propertycount    52964 non-null int64
Distance         52964 non-null float64
CouncilArea      52964 non-null object
dtypes: float64(2), int64(2), object(8)
memory usage: 4.8+ MB


In [7]:
# Columns excluded from the feature set.
# NOTE: re-running this cell on the already-trimmed frame raises KeyError,
# because the labels are gone after the first run.
unused_columns = ['SellerG', 'CouncilArea', 'Date', 'Address', 'Regionname']
housing = housing.drop(labels=unused_columns, axis=1)

In [8]:
# Check that 'Suburb' was parsed as a generic object (string) column.
# Compare against the builtin `object`: `np.object` was only an alias for it,
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError on current NumPy).
housing['Suburb'].dtype == object

True

In [9]:
# categorize_objects is a project helper imported from general_utils (not shown here);
# presumably it converts the remaining object-dtype columns to categoricals — TODO confirm.
housing = categorize_objects(housing)

In [10]:
# dummify_all_categories comes from general_utils (not shown here). 'Price' is passed
# separately and still exists as a column afterwards, so it is presumably excluded from
# one-hot encoding; feat_cols presumably lists the resulting feature columns — TODO confirm.
housing, feat_cols = dummify_all_categories(housing, 'Price')

In [11]:
# Median sale price over the rows where Price is present (pandas skips NaN by default).
housing['Price'].median()

835000.0

In [12]:
# Fill the missing sale prices with the column median.
# NOTE(review): Price is the regression target in the train/test split below, so the
# imputed rows all carry the same label; consider dropping them instead
# (housing.dropna(subset=['Price'])) — confirm this imputation is intended.
price_median = housing['Price'].median()
housing['Price'] = housing['Price'].fillna(price_median)

In [13]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52964 entries, 0 to 52963
Columns: 394 entries, Rooms to Method_UNK
dtypes: float64(2), int64(2), uint8(390)
memory usage: 21.3 MB


In [14]:
# Rescale the target and the three numeric features with the shared project helper.
# standardize_series_value lives in general_utils (not shown); presumably a z-score — TODO confirm.
# NOTE(review): this runs on the full frame before the train/test split, so whatever
# statistics the helper computes also see the held-out rows.
for numeric_col in ('Price', 'Propertycount', 'Distance', 'Rooms'):
    housing[numeric_col] = standardize_series_value(housing[numeric_col])

In [21]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    housing.drop('Price', axis=1),  # every column except the target
    housing['Price'],               # regression target
    test_size=0.25,
    random_state=42,
)

In [22]:
# Preview only the first rows of the training features rather than echoing the whole
# frame (393 columns, one row per training sample).
X_train.head(10)

Unnamed: 0,Rooms,Propertycount,Distance,Suburb_Abbotsford,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,Suburb_Albert Park,Suburb_Albion,Suburb_Alphington,...,Method_PI,Method_PN,Method_S,Method_SA,Method_SN,Method_SP,Method_SS,Method_VB,Method_W,Method_UNK
28319,-0.120226,-0.152399,-0.315076,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
41705,-0.120226,-0.010152,3.314660,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
39663,-0.120226,-0.495332,0.261488,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
16419,-0.120226,0.767449,0.209073,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
17731,0.920582,0.767449,0.209073,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
49813,0.920582,0.750914,0.903572,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
11313,-0.120226,-0.328169,0.955987,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1623,-0.120226,1.746415,0.523563,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
37457,-0.120226,-0.276978,0.274592,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
46351,-0.120226,-0.704172,-0.419906,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [23]:
# Ridge regression (L2-penalised least squares) at scikit-learn's default strength.
# alpha=1.0 is spelled out so the penalty is visible; it matches the default shown in the repr.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, Y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [24]:
# Score on the held-out split; for scikit-learn regressors .score() is the R² coefficient
# of determination. Note that Price was standardised and median-imputed earlier, so this
# is measured against those transformed labels.
ridge.score(X_test, Y_test)

0.42797005494165341

In [25]:
# Fitted weights, one per feature column, in the same order as X_train.columns.
ridge.coef_

array([  2.96151207e-01,   1.32114772e-02,  -2.61611412e-01,
         4.00477823e-02,   3.93667499e-01,  -3.35987776e-01,
        -6.21914794e-01,   1.01714020e+00,  -4.84658495e-01,
         3.67601529e-01,  -3.59480597e-02,  -5.30467267e-01,
        -4.38507849e-01,  -5.45874950e-01,   8.70251859e-01,
        -9.07491270e-02,   6.34739699e-01,   1.47400338e-01,
         3.22508113e-01,  -6.81191092e-03,  -4.45305442e-01,
        -4.39334299e-01,   3.50069783e-01,  -7.16889852e-02,
         1.50664313e-01,   9.80826738e-01,   6.70249183e-01,
         3.71719330e-02,  -1.11831622e-01,   3.23898825e-01,
         1.74788359e-01,   8.12423454e-01,  -9.79661329e-02,
        -2.38402849e-01,   0.00000000e+00,  -4.06268998e-01,
         4.78226334e-01,   2.63063588e-01,   1.79885500e-01,
         1.10904819e+00,   3.03519369e-01,   6.39060366e-02,
        -1.20866961e-02,   3.43621709e-01,   5.11369666e-02,
         0.00000000e+00,   4.23045025e-01,  -5.12652039e-01,
        -1.86960186e-01,