In [1]:
import pandas as pd

In [4]:
house_data = pd.read_csv('house_rental.csv.txt', index_col='Unnamed: 0')

In [10]:
features = list(set(house_data.columns.tolist()) - set(['Price']))

In [13]:
house_data[features].head()

Unnamed: 0,Sqft,Floor,Living.Room,Bathroom,Bedroom,TotalFloor
1,1177.698,2,2,2,2,7
2,2134.8,5,2,2,4,7
3,1138.56,5,2,1,2,7
4,1458.78,2,2,2,3,7
5,967.776,11,2,2,3,14


#### Splitting data into train & test

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
trainX, testX, trainY, testY = train_test_split(house_data[features], house_data.Price)

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [20]:
for degree in range(1,5):
    pol = PolynomialFeatures(degree)
    data_tf = pol.fit_transform(trainX)
    model  = LinearRegression()
    model.fit(data_tf, trainY)
    
    data_tf_test = pol.fit_transform(testX)
    print (degree,model.score(data_tf, trainY))
    print (degree,model.score(data_tf_test, testY))

1 0.695406330272
1 0.716064335862
2 0.731416197737
2 0.65764958177
3 0.824046920714
3 0.512632486655
4 0.824138746622
4 -3134.40259291


#### The above algo - Ordinary least squares algorithm. If there are outliers, the model & coef's are completely impacted

In [21]:
from sklearn.linear_model import Ridge

In [22]:
ridge_model = Ridge(alpha= .5)

In [23]:
for degree in range(1,5):
    pol = PolynomialFeatures(degree)
    data_tf = pol.fit_transform(trainX)
    model  = Ridge(alpha= .5)
    model.fit(data_tf, trainY)
    
    data_tf_test = pol.fit_transform(testX)
    print (degree,model.score(data_tf, trainY))
    print (degree,model.score(data_tf_test, testY))
    print ('\n')

1 0.695406118975
1 0.716116821737


2 0.731209344042
2 0.662461275265


3 0.82343283319
3 0.334936545666


4 0.89356117343
4 -683.810622858




In [24]:
house_data.corr()

Unnamed: 0,Sqft,Floor,TotalFloor,Bedroom,Living.Room,Bathroom,Price
Sqft,1.0,0.143249,0.304515,0.615608,0.45594,0.728869,0.825514
Floor,0.143249,1.0,0.564221,0.023986,-0.023563,0.099772,0.244553
TotalFloor,0.304515,0.564221,1.0,0.067298,0.029373,0.146132,0.333631
Bedroom,0.615608,0.023986,0.067298,1.0,0.526532,0.643083,0.467059
Living.Room,0.45594,-0.023563,0.029373,0.526532,1.0,0.546826,0.328788
Bathroom,0.728869,0.099772,0.146132,0.643083,0.546826,1.0,0.605542
Price,0.825514,0.244553,0.333631,0.467059,0.328788,0.605542,1.0


#### Lasso
* Estimates sparse coefs
* Prefers solutions with lesser params

In [25]:
from sklearn.linear_model import Lasso

In [26]:
for degree in range(1,5):
    pol = PolynomialFeatures(degree)
    data_tf = pol.fit_transform(trainX)
    model  = Lasso(0.3)
    model.fit(data_tf, trainY)
    
    data_tf_test = pol.fit_transform(testX)
    print (degree,model.score(data_tf, trainY))
    print (degree,model.score(data_tf_test, testY))
    print ('\n')

1 0.695406328926
1 0.716066519664


2 0.731320748166
2 0.650444248061






3 0.793079460574
3 0.505793596728


4 0.817124046973
4 0.140697670363




In [28]:
model.get_params()

{'alpha': 0.3,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [29]:
model.coef_

array([  0.00000000e+00,   2.98492552e+01,   3.11966853e+03,
         2.07552948e+04,  -1.75287431e+04,  -1.73437505e+04,
         2.48831779e+03,   8.86268524e-03,  -3.58442672e-01,
        -1.71018615e+01,   2.23327394e+00,   8.13207383e+00,
        -4.53183041e-01,  -1.98702596e+02,  -1.93785448e+03,
        -1.43866908e+03,   1.87334003e+02,   1.11600533e+02,
         1.21434457e+03,   2.81027062e+03,  -2.13021416e+03,
         8.40053535e+02,   3.66911582e+03,   7.57744452e+02,
        -4.04752755e+02,   2.00312144e+02,   8.82091632e+02,
        -2.00465548e+02,  -8.93482054e-07,   1.12905379e-03,
        -3.23898844e-03,  -1.68185961e-03,   2.99773966e-03,
        -1.37792934e-04,  -4.05731569e-02,   4.37781387e-01,
        -6.29130104e-01,  -5.25481124e-01,   2.77426920e-02,
        -2.78657297e+00,  -2.93226261e-01,   1.10996318e+00,
         2.45713742e-01,  -1.05698019e+00,   4.20915930e-01,
         3.30611922e-01,  -7.73071722e-01,   8.54146983e-02,
        -8.46911489e-02,

#### Elastic Net
* This is useful whene there are multiple features which are correlated with one another
* Lasso will pick one of them randomly
* elastic-net picks them both

In [32]:
from sklearn.linear_model import ElasticNet

for degree in range(1,5):
    pol = PolynomialFeatures(degree)
    data_tf = pol.fit_transform(trainX)
    model  = ElasticNet(alpha=0.8)
    model.fit(data_tf, trainY)
    
    data_tf_test = pol.fit_transform(testX)
    print (degree,model.score(data_tf, trainY))
    print (degree,model.score(data_tf_test, testY))
    print ('\n')

1 0.693600245389
1 0.719009141162


2 0.725127436491
2 0.66331950927


3 



0.788652085783
3 0.651382867971


4 0.81568756867
4 0.365673989744


