In [1]:
# This notebook explores linear regression with Lasso (L1) regularization:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
# 
# We use the California housing dataset.

In [2]:
# import external libraries
%matplotlib inline

import numpy as np
from sklearn import datasets, __version__
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
# import dataset
# The returned object is kept in CAL; the rest of this notebook reads its
# `data`, `target`, `feature_names` and `DESCR` entries.
# NOTE(review): presumably this downloads and caches the data on first use --
# confirm the kernel has network access on a fresh machine.
CAL = datasets.fetch_california_housing()

In [4]:
# Show the descriptive text that ships with the dataset.
print(CAL.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [None]:
# let's plot each feature vs. labelled data
# The grid is sized from the feature list instead of a hard-coded 8 so the
# cell keeps working if the feature set changes.
n_features = len(CAL.feature_names)
n_cols = 2
n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division: one panel per feature

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10), squeeze=False)
panels = axes.ravel()

for ax, feature_values, feature_name in zip(panels, CAL.data.T, CAL.feature_names):
    ax.scatter(feature_values, CAL.target, s=2, label=feature_name)
    # Label both axes so each panel can be read on its own.
    ax.set_xlabel(feature_name)
    ax.set_ylabel('median house value')
    ax.legend(loc='best')

# Hide any panel left over when the feature count does not fill the grid.
for ax in panels[n_features:]:
    ax.set_visible(False)

fig.tight_layout()

In [None]:
# split the dataset into training and testing
# A fixed random_state pins the shuffle, so the held-out 5000 rows (and every
# number computed from them) are the same on each "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    CAL.data, CAL.target, test_size=5000, shuffle=True, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# function to carry out Lasso Regression
def doLassoRegression(Xtrain, ytrain, Xval, yval, alpha=1e3):
    """Fit a Lasso model, print its parameters and validation error.

    Parameters
    ----------
    Xtrain : 2-D array-like
        Training features, passed to ``Lasso.fit``.
    ytrain : array-like
        Training targets.
    Xval : 2-D array-like
        Validation features, passed to ``Lasso.predict``.
    yval : array-like
        Validation targets the predictions are scored against.
    alpha : float, default=1e3
        L1 penalty strength handed to ``Lasso``.
        NOTE(review): 1e3 is a very strong penalty; on standardized features
        it most likely shrinks every coefficient to exactly zero -- confirm
        this default is intended, or pass a smaller value.

    Returns
    -------
    tuple
        ``(lasso_reg, mse)``: the fitted estimator and the mean squared error
        on the validation data, so callers can collect and compare results
        instead of reading them off the printed output.
    """
    # Pass alpha by keyword so the call does not depend on argument position.
    lasso_reg = Lasso(alpha=alpha)
    lasso_reg.fit(Xtrain, ytrain)
    print('coefficients:', lasso_reg.coef_)
    print('intercept:', lasso_reg.intercept_)

    predictions = lasso_reg.predict(Xval)
    mse = mean_squared_error(yval, predictions)
    print("Mean squared error: %.3f" % mse)

    return lasso_reg, mse


# do the regression for each feature
# The features are standardized below, so a coefficient survives the L1
# penalty only while alpha is smaller than |cov(feature, target)|, which
# cannot exceed the standard deviation of the target. doLassoRegression's
# default alpha=1e3 is far beyond that, which would zero every coefficient and
# make all features report the same intercept-only error. Pass a penalty small
# enough for the features to be told apart.
# NOTE(review): 0.1 is a starting point, not a tuned value.
lasso_alpha = 0.1

for i, name in enumerate(CAL.feature_names):
    print(i, name)
    # Keep the single column 2-D, as scikit-learn estimators expect.
    Xtrain = X_train[:, i].reshape(-1, 1)
    ytrain = y_train

    Xval = X_test[:, i].reshape(-1, 1)
    yval = y_test

    # Fit the scaler on the training column only, then apply the same
    # transform to the validation column.
    scaler = StandardScaler().fit(Xtrain)
    Xtrain = scaler.transform(Xtrain)
    Xval = scaler.transform(Xval)

    doLassoRegression(Xtrain, ytrain, Xval, yval, alpha=lasso_alpha)

    print('=====')
    print()

In [None]:
# Multiple features
# Regress on the first j features, for j = 1 .. number of features.
#
# The features are standardized below, so a coefficient survives the L1
# penalty only while alpha is smaller than |cov(feature, target)|, which
# cannot exceed the standard deviation of the target. doLassoRegression's
# default alpha=1e3 is far beyond that, which would zero every coefficient and
# make every feature subset report the same intercept-only error. Pass a
# penalty small enough for the subsets to be told apart.
# NOTE(review): 0.1 is a starting point, not a tuned value.
lasso_alpha = 0.1

for i in range(len(CAL.feature_names)):
    j = i + 1
    print(i, CAL.feature_names[:j])

    # A column slice is already 2-D, so no reshape is needed.
    Xtrain = X_train[:, :j]
    ytrain = y_train

    Xval = X_test[:, :j]
    yval = y_test

    # Fit the scaler on the training columns only, then apply the same
    # transform to the validation columns.
    scaler = StandardScaler().fit(Xtrain)
    Xtrain = scaler.transform(Xtrain)
    Xval = scaler.transform(Xval)

    doLassoRegression(Xtrain, ytrain, Xval, yval, alpha=lasso_alpha)

    print('=====')
    print()