In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model

In [None]:
# Load the full dataset plus the week-3 train / validation / test splits.
# names=True takes the field names from each file's header row and dtype=None
# lets genfromtxt choose a type per column, so every result is indexed by
# column name (e.g. data['price']).
data_all, data, d_val, d_test = (
    np.genfromtxt(csv_path, dtype=None, delimiter=',', names=True)
    for csv_path in (
        '../data/kc_house_data.csv',
        '../data/wk3_kc_house_train_data.csv',
        '../data/wk3_kc_house_valid_data.csv',
        '../data/wk3_kc_house_test_data.csv',
    )
)

# Alternative that skips the header row instead of using it for field names:
# data = np.genfromtxt('../data/kc_house_train_data.csv', delimiter=',', skip_header=1)
# d_test = np.genfromtxt('../data/kc_house_test_data.csv', delimiter=',', skip_header=1)

In [None]:
def tmp_format_in_out(data):
    """Build the regression design matrix and target vector from house records.

    Parameters
    ----------
    data : mapping of column name -> 1-D array (e.g. a numpy structured array)
        Must provide 'price', 'floors' and every column listed in
        ``raw_features`` below. 'floors' may hold text wrapped in one extra
        character on each side (such as '"1.5"', str or bytes) or plain
        numbers.

    Returns
    -------
    inp : numpy.ndarray, shape (n_samples, 17)
        Columns, in order: sqrt(sqft_living), sqrt(sqft_lot), bedrooms**2,
        floors**2, floors, followed by the 12 raw features.
    output : array
        ``data['price']`` unchanged.

    Raises
    ------
    AssertionError
        If any of the raw feature columns contains NaN.
    ValueError
        If a textual 'floors' entry cannot be parsed as a float.
    """
    sqft_living_sqrt = np.sqrt(data['sqft_living'])
    sqft_lot_sqrt = np.sqrt(data['sqft_lot'])
    bedrooms_sq = np.square(data['bedrooms'])

    raw_floors = np.asarray(data['floors'])
    if raw_floors.dtype.kind in 'SUO':
        # The values arrive as text with one wrapping character on each side
        # (e.g. '"1.5"'), so strip the first and last character before parsing.
        # Builtin float is used because the np.float alias no longer exists
        # in current NumPy releases.
        floors = np.array([float(token[1:-1]) for token in raw_floors],
                          dtype=np.float64)
    else:
        # Already numeric: no unwrapping needed.
        floors = raw_floors.astype(np.float64)
    floors_sq = np.square(floors)

    columns = [sqft_living_sqrt, sqft_lot_sqrt, bedrooms_sq, floors_sq, floors]
    raw_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
                    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
                    'sqft_basement', 'yr_built', 'yr_renovated']
    for name in raw_features:
        column = data[name]
        assert not np.any(np.isnan(column)), 'NaN found in column {}'.format(name)
        columns.append(column)

    # Stack the per-feature vectors as rows, then transpose so that each row
    # of the result is one sample.
    inp = np.array(columns).T
    output = data['price']

    return inp, output

In [None]:
# Feature matrix / target built from the full dataset. Note that `inp` and
# `output` are rebound to the training split in a later cell.
inp, output = tmp_format_in_out(data_all)

In [None]:
from sklearn import linear_model  # same import as the notebook's first cell

# Fit a Lasso on the full dataset with a fixed penalty, then show the learned
# weights and the positions of the ones that were not driven to zero.
# NOTE(review): the `normalize` keyword was removed from Lasso in
# scikit-learn 1.2 -- confirm which scikit-learn version this notebook targets.
model_all = linear_model.Lasso(alpha=500.0, normalize=True).fit(inp, output)
print(model_all.coef_)
print(np.nonzero(model_all.coef_))

In [None]:
# Run the same feature construction over the train, validation and test
# splits; each call yields an (inputs, target) pair.
(inp, output), (inp_v, output_v), (inp_t, output_t) = map(
    tmp_format_in_out, (data, d_val, d_test)
)

In [None]:
def compute_rss_lasso(y_pred, y_true, w, l1_penalty=1.0):
    """Return the residual sum of squares plus a weighted L1 norm of `w`.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and observed targets; they are subtracted elementwise.
    w : array-like
        Model weights whose 1-norm is added to the RSS.
    l1_penalty : float, optional
        Multiplier applied to the 1-norm of `w`. The default of 1.0 keeps the
        original behaviour (RSS + ||w||_1); pass 0 to obtain the plain RSS.

    Returns
    -------
    float
        sum((y_pred - y_true) ** 2) + l1_penalty * ||w||_1
    """
    residuals = np.asarray(y_pred) - np.asarray(y_true)
    rss = np.sum(np.multiply(residuals, residuals))
    return rss + l1_penalty * np.linalg.norm(w, 1)

In [None]:
# Sweep 13 log-spaced penalties from 1e1 to 1e7: fit on the training split
# and record the score that compute_rss_lasso gives on the validation split.
rss_v = []
l1_pool = np.logspace(1, 7, num=13)
for l1_penalty in l1_pool:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True).fit(inp, output)
    y_pred = model.predict(inp_v)
    rss_v += [compute_rss_lasso(y_pred, output_v, model.coef_)]

In [None]:
# Select the candidate penalty with the smallest recorded validation score:
# the first entry of the ascending sort order is the position of the minimum.
rss_v = np.array(rss_v)
argsort = rss_v.argsort()
best_l1 = l1_pool[argsort[0]]
print('The min RSS by {} l1_penalty.'.format(best_l1))

In [None]:
# Refit on the training split with the selected penalty and score the
# held-out test split.
model = linear_model.Lasso(alpha=best_l1, normalize=True).fit(inp, output)
y_pred = model.predict(inp_t)
rss = compute_rss_lasso(y_pred, output_t, model.coef_)

# The count covers the intercept as well as the feature weights.
print('Nonzero coef: {}'.format(
    np.count_nonzero(np.append(model.coef_, model.intercept_))))

# Build a model with a fixed number of non-zero features

In [None]:
# Coarse search over 20 log-spaced penalties (1e1 .. 1e4) for a bracket
# around the target sparsity:
#   l1_penalty_min -> last penalty seen that still leaves MORE than
#                     max_nonzeros non-zero terms
#   l1_penalty_max -> penalty giving FEWER than max_nonzeros non-zero terms,
#                     with the count as close to the target as observed
# Both stay 0 if no penalty in the grid satisfies the respective condition.
max_nonzeros = 7

more_nonzeros = max_nonzeros + 100  # best count seen so far above the target
less_nonzeros = 0                   # best count seen so far below the target
l1_penalty_min = l1_penalty_max = 0

for l1_penalty in np.logspace(1, 4, num=20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(inp, output)
    # Non-zero terms include the intercept as well as the feature weights.
    nz = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

    if max_nonzeros < nz <= more_nonzeros:
        # "<=" so that, on an ascending grid, a later penalty with the same
        # count replaces the earlier one.
        assert l1_penalty_min < l1_penalty
        l1_penalty_min, more_nonzeros = l1_penalty, nz
    elif less_nonzeros < nz < max_nonzeros:
        l1_penalty_max, less_nonzeros = l1_penalty, nz

In [None]:
# Show the upper end of the penalty bracket found by the coarse search.
print(l1_penalty_max)

In [None]:
# Fine search: try 20 evenly spaced penalties inside the bracket and keep only
# those whose fitted model has exactly max_nonzeros non-zero terms, recording
# each kept penalty together with its validation score.
rss_v = []    # validation scores of the kept penalties
l1_spec = []  # the kept penalties, aligned with rss_v
l1_pool = np.linspace(l1_penalty_min, l1_penalty_max, num=20)
for l1_penalty in l1_pool:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(inp, output)
    nz = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    if nz != max_nonzeros:
        continue
    l1_spec.append(l1_penalty)
    y_pred = model.predict(inp_v)
    rss_v.append(compute_rss_lasso(y_pred, output_v, model.coef_))

In [None]:
# Among the penalties that hit the target sparsity, take the one with the
# smallest validation score. Indexing position 0 raises IndexError when no
# penalty was kept (l1_spec / rss_v empty).
rss_v = np.array(rss_v)
argsort = rss_v.argsort()
best_l1 = l1_spec[argsort[0]]
print('The min RSS by {} l1_penalty.'.format(best_l1))

In [None]:
# Train a model on the training split with the best penalty found among the
# candidates that produced exactly max_nonzeros non-zero terms, show its
# weights, and score it on the test split.
model = linear_model.Lasso(alpha=best_l1, normalize=True)
model.fit(inp, output)
# print() call form: the Python 2 print statement is a SyntaxError on
# Python 3, and every other cell already uses the function form.
print(model.coef_)
y_pred = model.predict(inp_t)

rss = compute_rss_lasso(y_pred, output_t, model.coef_)

# The count covers the intercept as well as the feature weights.
print('Nonzero coef: {}'.format(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)))


In [None]:
# Positions of the feature weights that are not zero in the final model.
print(np.nonzero(model.coef_))