In [1]:
#2021.06.22. TUE
#Hankyeong

#00. 패키지 호출
import pandas as pd
import numpy as np 
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error
import warnings

#00-1. warning message ignore
warnings.filterwarnings(action='ignore')

#01. Diabetes 데이터셋을 다중선형회귀 모델로 예측하기.
#(1) 데이터셋 불러오기. 
diabetes = load_diabetes()

#(2) 데이터프레임으로 변환하기. 
df_diabetes = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df_diabetes['target'] = diabetes.target
df_diabetes

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


In [2]:
#(3) 데이터셋 탐색하기. 
df_diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [3]:
df_diabetes.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-3.634285e-16,1.308343e-16,-8.045349e-16,1.281655e-16,-8.835316000000001e-17,1.327024e-16,-4.574646e-16,3.777301e-16,-3.830854e-16,-3.412882e-16,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123996,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260974,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665645,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324879,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670611,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947634,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564384,0.02835801,0.02984439,0.0293115,0.03430886,0.03243323,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320442,0.1539137,0.198788,0.1811791,0.1852344,0.133599,0.1356118,346.0


In [4]:
#(4) train, test 데이터셋으로 분할하기. 
X_train,X_test,y_train,y_test = train_test_split(
    df_diabetes.iloc[:,:-1], df_diabetes.iloc[:,-1], test_size=0.1, random_state=2021
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((397, 10), (45, 10), (397,), (45,))

In [5]:
#(5) 모델 설정하기. 
lr = LinearRegression()

#(6) 모델의 하이퍼파라미터? 확인하기. 
lr.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': False,
 'positive': False}

In [6]:
#MEMO. 하이퍼파라미터 의미 해석하기. 
#      fit_intercept = 절편(intercept 혹은 딥러닝에서의 bias) 계산 여부
#      positivebool  = 계수(coefficient 혹은 딥러닝에서의 weight)를 양수로 강제로 지정할지에 대한 여부
#      ...

In [7]:
#(7) 모델 학습하기. 
lr.fit(X_train,y_train)

LinearRegression()

In [8]:
#(8) 모델 예측 및 평가하기. 
lr_pred = lr.predict(X_test)
lr_r2 = r2_score(y_test, lr_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test,lr_pred))
f'Linear Regression`s R^2 = {lr_r2:.4f}, RMSE = {lr_rmse:.4F}'

'Linear Regression`s R^2 = 0.4558, RMSE = 48.6847'

In [32]:
#02. Diabetes 데이터셋을 SVR 모델로 예측하기.
#(1) 모델 설정하기. 
svr = SVR()

#(2) 하이퍼파라미터 설정하기. 
svr_params = {'C':[11,11.1,11.2,11.3,11.4,11.5,11.6,11.7,11.8,11.9,12,12.1,12.2,12.3,12.4,12.5,12.6,12.7,12.8,12.9,13]}

#(3) 그리드서치 모델 설정하기. 
gscv_svr = GridSearchCV(estimator=svr, param_grid=svr_params, cv=10)

#(4) 모델 학습하기. 
gscv_svr.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=SVR(),
             param_grid={'C': [11, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7,
                               11.8, 11.9, 12, 12.1, 12.2, 12.3, 12.4, 12.5,
                               12.6, 12.7, 12.8, 12.9, 13]})

In [33]:
#(5) 최적 파라미터 구하기. 
gscv_svr.best_params_

{'C': 12}

In [34]:
#(6) 최적 파라미터에 의거한 validation score 파악하기. 
gscv_svr.best_score_

0.46317328783499023

In [35]:
#(7) 모델 예측 및 평가하기. 
gscv_svr_fit = gscv_svr.best_estimator_
gscv_svr_pred = gscv_svr_fit.predict(X_test)
svr_r2 = r2_score(y_test, gscv_svr_pred)
svr_rmse = np.sqrt(mean_squared_error(y_test,gscv_svr_pred))
f'SVR`s R^2 = {svr_r2:.4f}, RMSE = {svr_rmse:.4F}'

'SVR`s R^2 = 0.4038, RMSE = 50.9588'

In [13]:
#02. Diabetes 데이터셋을 Decision Tree 모델로 예측하기.
#(1) 모델 설정하기. 
dt = DecisionTreeRegressor()

#(2) 모델의 하이퍼파라미터 확인하기. 
dt.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [19]:
#(3) 하이퍼파라미터 설정하기. 
dt_params = {
    'max_depth':[3,4,5,6,7,8],
    'min_samples_split': [140,141,142,143,144,145,146,147,148,149,150]
}

#(3) 그리드서치 모델 설정하기. 
gscv_dt = GridSearchCV(estimator=dt, param_grid=dt_params, cv=10)

#(4) 모델 학습하기. 
gscv_dt.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8],
                         'min_samples_split': [140, 141, 142, 143, 144, 145,
                                               146, 147, 148, 149, 150]})

In [21]:
#(5) 최적 파라미터 구하기. 
gscv_dt.best_params_

{'max_depth': 4, 'min_samples_split': 143}

In [16]:
#(6) 최적 파라미터에 의거한 validation score 파악하기. 
gscv_dt.best_score_

0.3068299854416371

In [17]:
#(7) 모델 예측 및 평가하기. 
gscv_dt_fit = gscv_dt.best_estimator_
gscv_dt_pred = gscv_dt_fit.predict(X_test)
dt_r2 = r2_score(y_test, gscv_dt_pred)
dt_rmse = np.sqrt(mean_squared_error(y_test,gscv_dt_pred))
f'DT`s R^2 = {dt_r2:.4f}, RMSE = {dt_rmse:.4F}'

'DT`s R^2 = 0.1477, RMSE = 60.9307'

In [18]:
#04. y_test, lr_pred, svr_pred, dt_pred를 통한 데이터프레임 만들기.
predict = pd.DataFrame({'y_test'   : y_test,
                        'lr_pred'  : lr_pred,
                        'svr_pred' : gscv_svr_pred,
                        'dt_pred'  : gscv_dt_pred
                        })
predict.head(5)

Unnamed: 0,y_test,lr_pred,svr_pred,dt_pred
50,155.0,156.031216,139.798613,104.835443
369,167.0,185.700495,172.559212,159.552632
13,185.0,162.86384,136.386753,163.037037
353,109.0,162.545441,167.900015,163.037037
34,65.0,78.518494,95.644334,104.835443
