In [79]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd

california = fetch_california_housing()

model = Lasso(alpha=0.1)
X_train, X_test, y_train, y_test = train_test_split(california.data, california.target, test_size=0.3, random_state=100)
model.fit(X_train,y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [80]:
print('학습 데이터 점수: {}'.format(model.score(X_train, y_train)))
print('평가 데이터 점수: {}'.format(model.score(X_test, y_test)))
print('y = ' + str(model.intercept_) + ' ')
for i,c in enumerate(model.coef_):
  print(str(c) + ' * x' + str(i))

학습 데이터 점수: 0.5389071766593554
평가 데이터 점수: 0.5573020238926716
y = -7.622686396195066 
0.387914488971013 * x0
0.015202012631075505 * x1
-0.0 * x2
0.0 * x3
1.66971051187986e-05 * x4
-0.0036059747326681745 * x5
-0.11241351266587753 * x6
-0.09830323196349072 * x7


In [81]:
california_df = pd.DataFrame(data=california.data, columns = california.feature_names)
california_df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [82]:
scaler = MinMaxScaler()
california_scaled = scaler.fit_transform(california_df)
california_df_scaled = pd.DataFrame(data=california_scaled, columns=california.feature_names)
california_df_scaled.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,0.232464,0.541951,0.032488,0.022629,0.039869,0.001914,0.328572,0.476125
std,0.13102,0.246776,0.017539,0.014049,0.03174,0.008358,0.226988,0.199555
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.142308,0.333333,0.025482,0.019943,0.021974,0.001398,0.147715,0.253984
50%,0.209301,0.54902,0.031071,0.021209,0.032596,0.001711,0.182784,0.583665
75%,0.292641,0.705882,0.036907,0.022713,0.048264,0.002084,0.549416,0.631474
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [84]:
X_train, X_test, y_train, y_test = train_test_split(california_df_scaled, california.target, test_size=0.3, random_state=100)

model.fit(X_train,y_train)

print('학습 데이터 점수: {}'.format(model.score(X_train, y_train)))
print('평가 데이터 점수: {}'.format(model.score(X_test, y_test)))

학습 데이터 점수: 0.03179418081928609
평가 데이터 점수: 0.03200231614440907


정규화를 진행했지만 모델 성능의 영향 x

In [85]:
alpha = [0.001,0.01,0.1,1,10,100,1000]
param_grid = dict(alpha=alpha)

gs = GridSearchCV(estimator=Lasso(),param_grid = param_grid,cv=10)
result = gs.fit(california_df_scaled,california.target)

print('최적 점수 : {}'.format(result.best_score_))
print('최적 파라미터 : {}'.format(result.best_params_))
print(gs.best_estimator_)
pd.DataFrame(result.cv_results_)

최적 점수 : 0.5025759576475576
최적 파라미터 : {'alpha': 0.001}
Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.014054,0.0024,0.002141,0.000255,0.001,{'alpha': 0.001},0.534944,0.585549,0.387213,0.493246,0.53285,0.548366,0.453401,0.416193,0.470465,0.603531,0.502576,0.067412,1
1,0.011291,0.001801,0.001938,6.5e-05,0.01,{'alpha': 0.01},0.623058,0.45097,0.351363,0.515987,0.484377,0.554198,0.195601,0.31909,0.250817,0.515669,0.426113,0.132744,2
2,0.006408,0.00373,0.001755,7.9e-05,0.1,{'alpha': 0.1},0.006583,-0.339775,-0.042696,0.077949,-0.126251,0.005027,-0.879471,0.008454,-0.716938,-0.257611,-0.226473,0.313107,3
3,0.005006,0.000193,0.00185,0.000472,1.0,{'alpha': 1},-0.044958,-0.37672,-0.069812,-0.001038,-0.130402,-0.028682,-0.992364,-0.033294,-0.716938,-0.330984,-0.272519,0.321468,4
4,0.005265,0.000579,0.001749,7.4e-05,10.0,{'alpha': 10},-0.044958,-0.37672,-0.069812,-0.001038,-0.130402,-0.028682,-0.992364,-0.033294,-0.716938,-0.330984,-0.272519,0.321468,4
5,0.004976,7.9e-05,0.001715,7.5e-05,100.0,{'alpha': 100},-0.044958,-0.37672,-0.069812,-0.001038,-0.130402,-0.028682,-0.992364,-0.033294,-0.716938,-0.330984,-0.272519,0.321468,4
6,0.005002,0.000205,0.00175,9.2e-05,1000.0,{'alpha': 1000},-0.044958,-0.37672,-0.069812,-0.001038,-0.130402,-0.028682,-0.992364,-0.033294,-0.716938,-0.330984,-0.272519,0.321468,4


In [88]:
model = Lasso(alpha=0.001)

X_train, X_test, y_train, y_test = train_test_split(california_df_scaled, california.target, test_size=0.3, random_state=100)

model.fit(X_train,y_train)

print('학습 데이터 점수: {}'.format(model.score(X_train, y_train)))
print('평가 데이터 점수: {}'.format(model.score(X_test, y_test)))

학습 데이터 점수: 0.5875254870755893
평가 데이터 점수: 0.6089154932679167


최적파라미터를 대입하여 모델 성능 상승