In [None]:
# Uncomment to install the dependencies into the running kernel's environment:
# %pip install numpy
# %pip install pandas
# %pip install scikit-learn

In [27]:
import numpy as np
import pandas as pd
# Pattern: from sklearn(package).<module> import <function or class>
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
# Load the California housing dataset through scikit-learn's fetcher.
# The returned object exposes .data, .target and .DESCR, which later cells use.
california = fetch_california_housing()

In [29]:
# Print the dataset's built-in description.
print(california.DESCR)
# "Number of Instances"  => number of rows (samples)
# "Number of Attributes" => number of columns (variables)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [5]:
# Feature matrix (one row per sample) and the regression target.
X = california.data
y = california.target

In [6]:
# Show the container types of the features and the target.
tuple(map(type, (X, y)))

(numpy.ndarray, numpy.ndarray)

In [7]:
# Shapes of the feature matrix and the target vector.
print(f"{X.shape} {y.shape}")

(20640, 8) (20640,)


In [8]:
# Peek at the last 10 target values.
y[-10:]

array([1.12 , 1.072, 1.156, 0.983, 1.168, 0.781, 0.771, 0.923, 0.847,
       0.894])

In [9]:
# Hold out 20% of the samples as a test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# L1 regularization: LassoCV selects alpha with 5-fold cross-validation on the training split.
# NOTE(review): the features are passed in unscaled; L1/L2 penalties depend on feature
# scale, so consider standardizing (e.g. StandardScaler) before fitting -- confirm intent.
lasso_cv = LassoCV(cv=5, random_state=42).fit(X_train, y_train)
lasso_cv

In [11]:
# Report the regularization strength that LassoCV selected.
lasso_best_alpha = lasso_cv.alpha_   # alpha_: the amount of penalization chosen by cross-validation
print("The chosen L1 regularization parameter : ", lasso_best_alpha)

The chosen L1 regularization parameter :  0.03422256157349769


In [12]:
# Predict the target for the held-out test split with the fitted Lasso model.
y_pred_lasso = lasso_cv.predict(X_test)
y_pred_lasso

array([0.85277801, 1.6961987 , 2.41968462, ..., 4.35568964, 1.42232444,
       1.87803232], shape=(4128,))

In [13]:
# Test-set mean squared error of the Lasso predictions.
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_mse

0.5555752649052167

In [14]:
# Fitted Lasso coefficients, one per feature.
lasso_cv.coef_
# With Lasso a coefficient can become exactly 0 -> it can be used for feature selection.

array([ 3.84304802e-01,  1.14701603e-02,  2.12303084e-03,  0.00000000e+00,
        1.95412420e-06, -3.23117418e-03, -3.31025912e-01, -3.30715755e-01])

In [15]:
# L2 regularization: RidgeCV (5-fold Cross-Validation)
ridge_alphas = np.logspace(-3, 3, 100)  # alpha search grid: 100 log-spaced values from 1e-3 to 1e3
ridge_cv = RidgeCV(alphas=ridge_alphas, scoring='neg_mean_squared_error', cv=5)
ridge_cv.fit(X_train, y_train)

In [16]:
# Report the regularization strength that RidgeCV selected from ridge_alphas.
ridge_best_alpha = ridge_cv.alpha_    # alpha_: the amount of penalization chosen by cross-validation
print("The chosen L2 regularization parameter : ", ridge_best_alpha)

The chosen L2 regularization parameter :  8.697490026177835


In [17]:
# Predict the target for the held-out test split with the fitted Ridge model.
y_pred_ridge = ridge_cv.predict(X_test)
y_pred_ridge

array([0.72013445, 1.76345377, 2.70480946, ..., 4.4676901 , 1.19046416,
       2.00702579], shape=(4128,))

In [42]:
# Test-set mean squared error of the Ridge predictions.
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_mse

0.555147682667783

In [43]:
# Fitted Ridge coefficients, one per feature.
ridge_cv.coef_

array([ 4.47273670e-01,  9.73908619e-03, -1.20680356e-01,  7.68366381e-01,
       -1.99627707e-06, -3.52241448e-03, -4.19731555e-01, -4.33460807e-01])

In [20]:
# Elastic Net: ElasticNetCV (5-fold cross-validation)
# l1_ratio is Elastic Net's additional hyperparameter; each candidate below is tried.
l1_ratio_grid = [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]
elastic_net_cv = ElasticNetCV(l1_ratio=l1_ratio_grid, cv=5, random_state=42)
elastic_net_cv.fit(X_train, y_train)

In [21]:
# Report the regularization strength that ElasticNetCV selected.
elastic_net_best_alpha = elastic_net_cv.alpha_
print("The chosen Elastic Net regularization parameter : ", elastic_net_best_alpha)

The chosen Elastic Net regularization parameter :  0.03422256157349769


In [22]:
# The l1_ratio that ElasticNetCV selected from the candidate list.
elastic_net_best_l1_ratio = elastic_net_cv.l1_ratio_
elastic_net_best_l1_ratio # the selected l1_ratio is 1 -> i.e. the model is the same as Lasso!

np.float64(1.0)

In [23]:
# Predict the target for the held-out test split with the fitted Elastic Net model.
y_pred_elastic_net = elastic_net_cv.predict(X_test)
y_pred_elastic_net

array([0.85277801, 1.6961987 , 2.41968462, ..., 4.35568964, 1.42232444,
       1.87803232], shape=(4128,))

In [24]:
# Test-set mean squared error of the Elastic Net predictions.
elastic_net_mse = mean_squared_error(y_test, y_pred_elastic_net)
elastic_net_mse

0.5555752649052167

In [25]:
# Fitted Elastic Net coefficients, one per feature.
elastic_net_cv.coef_

array([ 3.84304802e-01,  1.14701603e-02,  2.12303084e-03,  0.00000000e+00,
        1.95412420e-06, -3.23117418e-03, -3.31025912e-01, -3.30715755e-01])