## 회귀 및 릿지모델

### 구글 드라이브 마운트

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/house-standard-VALP.csv')
df.shape

(20495, 93)

In [3]:
df.head()

Unnamed: 0,AGEP,BDSP,ELEP,GASP,HINCP,NRC,RMSP,VALP,BATH,FESRP,...,SCHL_16,SCHL_17,SCHL_18,SCHL_19,SCHL_20,SCHL_21,SCHL_22,SCHL_23,SCHL_24,SEX_2
0,1.277527,-0.185587,-0.218278,-0.030785,-0.857357,-0.757285,-0.779457,0.401699,1.0,0,...,0,0,0,0,0,1,0,0,0,1
1,0.115555,-0.185587,1.548888,-1.029228,0.537778,-0.757285,-0.261327,6.192561,1.0,0,...,1,0,0,0,0,0,0,0,0,1
2,0.074056,-0.185587,1.548888,-1.029228,0.537778,-0.757285,-0.261327,6.192561,1.0,0,...,1,0,0,0,0,0,0,0,0,0
3,0.945535,-0.185587,-0.975635,-0.243219,-0.564886,-0.757285,-0.261327,-0.578293,1.0,0,...,0,0,1,0,0,0,0,0,0,0
4,0.987034,-0.185587,0.286627,2.093563,0.048701,-0.757285,0.256802,-0.043752,1.0,0,...,0,0,0,0,0,0,1,0,0,0


### 타겟변수를 제외한 입력변수를 data에 저장

In [4]:
data = df.drop(['VALP'], axis=1)
target = df['VALP']

### train data, test data 분리

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.5, random_state=42)

print("X_train shape : ", X_train.shape)
print("X_test shape : ", X_test.shape)
print("y_train shape : ", y_train.shape)
print("y_test shape : ", y_test.shape)

X_train shape :  (10247, 92)
X_test shape :  (10248, 92)
y_train shape :  (10247,)
y_test shape :  (10248,)


### 연속 변수가 타겟 변수일 때 Linear Regression 모델(default 모델) 사용

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

linr = LinearRegression(n_jobs = -1)
model = linr.fit(X_train, y_train)
pred = model.predict(X_test)

print("Linear Reg. Training set r2 score : {:.5f}".format(model.score(X_train, y_train)))
print("    Linear Reg. Test set r2 score : {:.5f}".format(r2_score(y_test, pred)))

Linear Reg. Training set r2 score : 0.34319
    Linear Reg. Test set r2 score : 0.33329


### 연속 변수가 타겟변수일 때 Ridge 모델(default 모델) 사용

In [7]:
from sklearn.linear_model import Ridge

ridge = Ridge()
model = ridge.fit(X_train, y_train)
pred = model.predict(X_test)

print("Ridge Reg. Training set r2 score : {:.5f}".format(model.score(X_train, y_train)))
print("    Ridge Reg. Test set r2 score : {:.5f}".format(r2_score(y_test, pred)))

Ridge Reg. Training set r2 score : 0.34405
    Ridge Reg. Test set r2 score : 0.33379


### 그리드 서치 실행

In [8]:
from sklearn.model_selection import GridSearchCV

clf_Ridge = Ridge()
params = {'alpha' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'solver' : ['auto','svc','lsqr','cholesky','spares_cg','sag','saga','lbfgs']}

grid_Ridge = GridSearchCV(clf_Ridge, param_grid=params, scoring='r2',cv=5, n_jobs=1, verbose=1)
grid_Ridge.fit(X_train, y_train)

print("GridSearchCV max score :{:.5f}".format(grid_Ridge.best_score_))
print("GridSearchCV best parameter : ", grid_Ridge.best_params_)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


105 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_ridge.py", line 1123, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterErro

GridSearchCV max score :0.33140
GridSearchCV best parameter :  {'alpha': 10, 'solver': 'sag'}


### 연속 변수가 타겟변수 일 때 Lasso 모델 사용

In [16]:
from sklearn.linear_model import Lasso
clt_lasso = Lasso()

model = clt_lasso.fit(X_train, y_train)
pred = model.predict(X_test)

print("Lasso Reg. Training set r2 score : {:.5f}".format(clt_lasso.score(X_train, y_train)))
print("    Lasso Reg. Test set r2 score : {:.5f}".format(r2_score(y_test, pred)))

Lasso Reg. Training set r2 score : 0.00000
    Lasso Reg. Test set r2 score : -0.00015
