In [1]:
import pandas as pd

In [2]:
rental_data = pd.read_csv('data/house_rental.csv', index_col='Unnamed: 0')

In [3]:
rental_data.head()

Unnamed: 0,Sqft,Floor,TotalFloor,Bedroom,Living.Room,Bathroom,Price
1,1177.698,2,7,2,2,2,62000
2,2134.8,5,7,4,2,2,78000
3,1138.56,5,7,2,2,1,58000
4,1458.78,2,7,3,2,2,45000
5,967.776,11,14,3,2,2,45000


In [4]:
rental_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 645 entries, 1 to 648
Data columns (total 7 columns):
Sqft           645 non-null float64
Floor          645 non-null int64
TotalFloor     645 non-null int64
Bedroom        645 non-null int64
Living.Room    645 non-null int64
Bathroom       645 non-null int64
Price          645 non-null int64
dtypes: float64(1), int64(6)
memory usage: 40.3 KB


#### Feature Selection
* Removing of all features with less variance

In [5]:
from sklearn.feature_selection import VarianceThreshold

In [6]:
import numpy as np

In [7]:
for col in rental_data.columns.tolist():
    print (col,np.var(rental_data[col]))

Sqft 587969.0926318591
Floor 15.067661799170713
TotalFloor 24.923396430502994
Bedroom 1.0200108166576518
Living.Room 0.21344871101496418
Bathroom 0.4665488852833334
Price 1267890936.3284514


In [8]:
#As we see above, living room has least varience but not too low

In [9]:
from sklearn.feature_selection import SelectKBest,f_regression

In [10]:
feature_col = [ x for x in rental_data.columns.tolist() if x != 'Price' ]

In [11]:
X_data = rental_data[feature_col]
y_target = rental_data['Price']

In [12]:
X_new_data = SelectKBest(score_func=f_regression, k = 5).fit_transform(X_data, y_target)

In [13]:
X_new_data.shape

(645, 5)

In [14]:
#SelectPercentile - removes all but a configured percentage imp features

In [15]:
d = SelectKBest(k = 5).fit(X_data, y_target)

In [16]:
#returns imp of each col
d.scores_

array([ 12.98143914,   2.39966869,   2.44028919,   3.69793914,
         2.76236087,   4.73083499])

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
trainX, testX, trainY, testY = train_test_split(X_new_data, y_target)

In [19]:
from sklearn import linear_model

In [20]:
lr = linear_model.LinearRegression()

In [21]:
lr.fit(trainX,trainY)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [22]:
lr.score(testX,testY)

0.74505956576389942

In [23]:
rr = linear_model.Ridge( alpha=0.5 )

In [24]:
rr.fit(trainX,trainY)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [25]:
rr.score(testX,testY)

0.74502264770064519

In [26]:
lasreg = linear_model.Lasso(alpha = 0.1)

In [27]:
lasreg.fit(trainX,trainY)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [28]:
lasreg.score(testX,testY)

0.74505820026081415

In [29]:
enreg = linear_model.ElasticNet(alpha=0.1)

In [30]:
enreg.fit(trainX,trainY)

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [31]:
enreg.score(testX,testY)

0.74364025197358785

In [32]:
from sklearn.preprocessing import PolynomialFeatures

In [33]:
pol = PolynomialFeatures(degree=3)

In [34]:
X_tf_data = pol.fit_transform(X_new_data)

In [35]:
trainX, testX, trainY, testY = train_test_split(X_tf_data, y_target)

In [44]:
models = [ linear_model.LinearRegression(),
           linear_model.Ridge( alpha=0.5 ),
           linear_model.Lasso(alpha = 0.1),
           linear_model.ElasticNet(alpha=0.1)
         ]

In [37]:
for model in models:
    model.fit(trainX,trainY)
    print (model.score(testX,testY))

0.350238549773
0.434381175655
0.636702344649
0.640693070735




In [38]:
rental_data.corr()

Unnamed: 0,Sqft,Floor,TotalFloor,Bedroom,Living.Room,Bathroom,Price
Sqft,1.0,0.143249,0.304515,0.615608,0.45594,0.728869,0.825514
Floor,0.143249,1.0,0.564221,0.023986,-0.023563,0.099772,0.244553
TotalFloor,0.304515,0.564221,1.0,0.067298,0.029373,0.146132,0.333631
Bedroom,0.615608,0.023986,0.067298,1.0,0.526532,0.643083,0.467059
Living.Room,0.45594,-0.023563,0.029373,0.526532,1.0,0.546826,0.328788
Bathroom,0.728869,0.099772,0.146132,0.643083,0.546826,1.0,0.605542
Price,0.825514,0.244553,0.333631,0.467059,0.328788,0.605542,1.0


In [39]:
#Lets do some scaling
from sklearn import preprocessing

In [40]:
minmaxscaler = preprocessing.MinMaxScaler()

In [41]:
X_data = minmaxscaler.fit_transform(X_new_data)

In [46]:
pol = PolynomialFeatures(degree=3)
X_tf_data = pol.fit_transform(X_data)

In [47]:
trainX, testX, trainY, testY = train_test_split(X_tf_data, y_target)

In [48]:
for model in models:
    model.fit(trainX,trainY)
    print (model.score(testX,testY))

0.25780627242
0.705281725974
0.595883691661
0.545188742163


