In [76]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import random

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

# Simple Regression for Wine Quality Prediction
Solidware Assignment for 2021 Summer Internship
- assignee: Kyuhyung Choi ([choigww@gmail.com](choigww@gmail.com))

<br>

## 모델 테스트
목적 : 전처리 및 파라미터 튜닝을 최소화한 상태에서 모델 기본 성능을 검증하고, 튜닝 모델 후보를 선정한다.

<br>

### Test Summary
- Non-boosting 모델 vs. Boosting 계열의 기본 모델 성능 비교
- Test set MSE 지표를 기반으로, Non-boosting, Boosting 각각 2개의 모델을 선정
    - Non-boosting 모델
        - 성능은 boosting에 비하여 떨어지지만 연산효율성이 매우 높음
            - Linear Regression Test MSE -> 0.42
            - SVM Regressor Test MSE -> 0.39
    - Boosting 모델
        - 성능이 우수하지만 연산효율성이 떨어짐
            - RandomForest Regressor Test MSE -> 0.38
            - CatBoost Regressor Test MSE -> 0.38
- 각 모델 특성에 따라 scaler 종류 및 split stratification 여부가 성능에 영향을 미침
    - 각 모델에 적합한 split 방법 적용해야 함
- Simple Neural Net은 머신러닝 모델보다 낮은 성능
    - 데이터 샘플이 딥러닝에 적합한 수준만큼 많지 않기 때문인 것으로 추정

<br>

Model(No Tuning)|Best Setting|Best Score(Test MSE)
-|-|-
Linear Regression|(stratified)|0.419
Lasso|(stratified)|0.671
Ridge|(stratified, MinMaxScaler)|0.417
ElasticNet|(stratified)|0.617
SVMRegressor|(non-stratified, RobustScaler)|0.384
RandomForest Regressor|(stratified, StandardScaler)|0.379
GB Regressor|(non-stratified, MinMaxScaler)|0.3846
XGB Regressor|(non-stratified, RobustScaler)|0.4729
LightGBM Regressor|(non-stratified, RobustScaler)|0.4153
CatBoost Regressor|(non-stratified, RobustScaler)|0.379
Keras Simple NN|(stratified, StandardScaler)|0.4707


<br>

### Conclusion
- LinearRegression, SVMRegressor, RandomForestRegressor, CatBoostRegressor 4개 모델 튜닝 진행
- LinearRegression / SVMRegressor:
    - 연산량이 적은 편
    - 모델 자체 하이퍼 파라미터 갯수 적은 편
    - 전처리 파이프라인 적용하여 모델별 최적 전처리 기법도 함께 탐색
        - Polynomial Features, Feature Selection, Dimension Reduction 등
- RandomForestRegressor / CatBoostRegerssor:
    - 연산량이 높은 편
    - 모델 자체 하이퍼 파라미터 갯수가 많은 편
    - 스케일러 테스트만 진행하고, 모델 자체 파라미터 튜닝에 집중

<br>

In [3]:
# load data
red = pd.read_csv('./data/winequality-red.csv', sep=';')
red = red.drop_duplicates(keep='last', ignore_index=True)
red.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
1,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
2,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6


In [67]:
def get_model_performance(model, Xtrain, Xtest, ytrain, ytest, X_scaler=None):
#     ytrain = pd.DataFrame(ytrain)
    
    if X_scaler:
        xs = X_scaler
        Xtrain = xs.fit_transform(Xtrain)
        Xtest = xs.transform(Xtest)
    
    model.fit(Xtrain, ytrain)
    
    try:
        pred = model.predict(Xtest)
    except:
        pred = model.predict(np.array(Xtest))
        
    try:
        return calc_mse(ytest, pred).values[0]
    except:
        return calc_mse(ytest, pred)

In [None]:
# split data
X = red.drop('quality', axis=1)
y = red[['quality']]

# split version 1 - stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    stratify=y,
                                                    shuffle=True,
                                                    random_state=2021)

# split version 2 - NO stratification
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=2021)

In [107]:
# stratified data
# 각 target에 해당하는 데이터 샘플의 갯수(value count)가 동일한 비율로 split 되었음 (80:20 = 4:1 비율 유지)
pd.concat([y_train.value_counts(), y_test.value_counts()], axis=1).rename(columns={0:'y_train',
                                                                                  1:'y_test'})

Unnamed: 0_level_0,y_train,y_test
quality,Unnamed: 1_level_1,Unnamed: 2_level_1
5,461,116
6,428,107
7,134,33
4,42,11
8,14,3
3,8,2


In [108]:
# non-stratified data
# 각 target에 해당하는 데이터 샘플의 갯수(value count)가 동일하지 않은 비율로 split 되었음
pd.concat([y_train2.value_counts(), y_test2.value_counts()], axis=1).rename(columns={0:'y_train',
                                                                                  1:'y_test'})

Unnamed: 0_level_0,y_train,y_test
quality,Unnamed: 1_level_1,Unnamed: 2_level_1
3,10,
4,41,12.0
5,465,112.0
6,434,101.0
7,127,40.0
8,10,7.0


In [92]:
RANDOM_SEED = 2021

# scale datasets - stratified
scalers = [MinMaxScaler(), StandardScaler(), RobustScaler()]
for s in scalers:
    s.fit(X_train)

X_train_scaled_list = [scaler.transform(X_train) for scaler in scalers]
X_test_scaled_list = [scaler.transform(X_test) for scaler in scalers]

# scale datasets - non-stratified
scalers2 = [MinMaxScaler(), StandardScaler(), RobustScaler()]
for s2 in scalers2:
    s2.fit(X_train)

X_train2_scaled_list = [scaler2.transform(X_train2) for scaler2 in scalers2]
X_test2_scaled_list = [scaler2.transform(X_test2) for scaler2 in scalers2]

In [98]:
# create a function to fit and eveluate scores for default
def fit_and_evaluate_default(model, name):
    scalers_name = ['MinMaxScaler', 'StandardScaler', 'RobustScaler']
    
    print(f'------------- {name} default -------------')
    for i, scaler in enumerate(scalers):
        print(f'@@@ {scalers_name[i]} @@@')
        Xtr = X_train_scaled_list[i]
        Xte = X_test_scaled_list[i]
        
        model.fit(Xtr, y_train)
        pred = model.predict(Xte)
        mse = mean_squared_error(y_test, pred)
        
        print('MSE score (stratified data) - ', round(mse, 4))
        
        Xtr2 = X_train2_scaled_list[i]
        Xte2 = X_test2_scaled_list[i]
        model.fit(Xtr2, y_train2)
        pred2 = model.predict(Xte2)
        mse2 = mean_squared_error(y_test2, pred2)
        print('MSE score (non-stratified data) - ', round(mse2, 4))
        print('===')
    
# get some scores
# linear models
fit_and_evaluate_default(LinearRegression(), 'LinearRegression')
fit_and_evaluate_default(Lasso(random_state=RANDOM_SEED), 'Lasso')
fit_and_evaluate_default(Ridge(random_state=RANDOM_SEED), 'Ridge')
fit_and_evaluate_default(ElasticNet(random_state=RANDOM_SEED), 'ElasticNet')
# non-linear models
fit_and_evaluate_default(svm.SVR(), 'SVMRegressor')

------------- LinearRegression default -------------
@@@ MinMaxScaler @@@
MSE score (stratified data) -  0.4191
MSE score (non-stratified data) -  0.4317
===
@@@ StandardScaler @@@
MSE score (stratified data) -  0.4191
MSE score (non-stratified data) -  0.4317
===
@@@ RobustScaler @@@
MSE score (stratified data) -  0.4191
MSE score (non-stratified data) -  0.4317
===
------------- Lasso default -------------
@@@ MinMaxScaler @@@
MSE score (stratified data) -  0.671
MSE score (non-stratified data) -  0.7562
===
@@@ StandardScaler @@@
MSE score (stratified data) -  0.671
MSE score (non-stratified data) -  0.7562
===
@@@ RobustScaler @@@
MSE score (stratified data) -  0.671
MSE score (non-stratified data) -  0.7562
===
------------- Ridge default -------------
@@@ MinMaxScaler @@@
MSE score (stratified data) -  0.4173
MSE score (non-stratified data) -  0.4344
===
@@@ StandardScaler @@@
MSE score (stratified data) -  0.419
MSE score (non-stratified data) -  0.4318
===
@@@ RobustScaler @@@


In [101]:
fit_and_evaluate_default(RandomForestRegressor(random_state=RANDOM_SEED), 'RandomForestRegressor')
fit_and_evaluate_default(GradientBoostingRegressor(random_state=RANDOM_SEED), 'GradientBoostingRegressor')
fit_and_evaluate_default(XGBRegressor(random_state=RANDOM_SEED), 'XGBoostRegressor')
fit_and_evaluate_default(LGBMRegressor(random_state=RANDOM_SEED, silent=True), 'LGBMRegressor')
fit_and_evaluate_default(CatBoostRegressor(random_state=RANDOM_SEED, silent=True), 'CatBoostRegressor')

------------- RandomForestRegressor default -------------
@@@ MinMaxScaler @@@
MSE score (stratified data) -  0.3809
MSE score (non-stratified data) -  0.3821
===
@@@ StandardScaler @@@
MSE score (stratified data) -  0.379
MSE score (non-stratified data) -  0.3828
===
@@@ RobustScaler @@@
MSE score (stratified data) -  0.3809
MSE score (non-stratified data) -  0.3817
===
------------- GradientBoostingRegressor default -------------
@@@ MinMaxScaler @@@
MSE score (stratified data) -  0.4079
MSE score (non-stratified data) -  0.3846
===
@@@ StandardScaler @@@
MSE score (stratified data) -  0.4073
MSE score (non-stratified data) -  0.3854
===
@@@ RobustScaler @@@
MSE score (stratified data) -  0.4075
MSE score (non-stratified data) -  0.3852
===
------------- XGBoostRegressor default -------------
@@@ MinMaxScaler @@@
MSE score (stratified data) -  0.4514
MSE score (non-stratified data) -  0.4739
===
@@@ StandardScaler @@@
MSE score (stratified data) -  0.4567
MSE score (non-stratified da

### Keras Simple NN
Best MSE = 0.400
- RobustScaler for X & y

In [37]:
# 램덤 시드 고정
SEED = 2021
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 모델 설계 - 드랍아웃 활용
# 심층 신경망 모델
def build_model(train_data, train_target):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=train_data.shape[1]))
    model.add(Dropout(0.2))
    model.add(Dense(8, activation='relu'))
    model.add(Dropout(0.2))
#     model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    
    optimizer = tf.keras.optimizers.RMSprop(0.001)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae', 'mse'])
    
    return model

In [38]:
def keras_build_and_evaluate(X_train, X_test, y_train, y_test, X_scaler=None, y_scaler=None):
    model = build_model(X_train, y_train)
    
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train
                                                , test_size=0.2
                                                , shuffle=True
                                                , stratify=y_train
                                                , random_state=SEED)

    if X_scaler:
        print('X scaling applied.')
        xs = X_scaler
        X_tr = xs.fit_transform(X_tr)
        X_val = xs.transform(X_val)
        X_test = xs.transform(X_test)
    if y_scaler:
        print('y scaling applied.')
        ys = y_scaler
        y_tr = ys.fit_transform(y_tr)
        y_val = ys.transform(y_val)
#         y_test = ys.transform(y_test)
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)

    history = model.fit(X_tr, y_tr
                        , batch_size=64
                        , epochs=1000
                        , validation_data=(X_val, y_val)
                        , callbacks=[early_stopping]
                        ,verbose=0)
    
    if y_scaler:
        pred = ys.inverse_transform(model.predict(X_test))
        mse = np.mean(np.square(y_test-pred))
        mae = np.mean(np.abs(y_test-pred))
        print([mse.values[0], mae.values[0], mse.values[0]])
    else:
        print(model.evaluate(X_test, y_test))

In [103]:
keras_build_and_evaluate(X_train, X_test, y_train, y_test)
print('-')
for i in range(3):
    keras_build_and_evaluate(X_train_scaled_list[i], X_test_scaled_list[i], y_train, y_test)
    print('-')
for i in range(3):
    keras_build_and_evaluate(X_train2_scaled_list[i], X_test2_scaled_list[i], y_train2, y_test2)
    print('-')

[0.5276765823364258, 0.5415258407592773, 0.5276765823364258]
-
[0.6399917602539062, 0.6173838376998901, 0.6399917602539062]
-
[0.4706932604312897, 0.522005021572113, 0.4706932604312897]
-
[0.4848010241985321, 0.5286164283752441, 0.4848010241985321]
-
[0.7067611813545227, 0.6553390026092529, 0.7067611813545227]
-
[0.6731551885604858, 0.6423717141151428, 0.6731551885604858]
-
[0.48701539635658264, 0.5243487358093262, 0.48701539635658264]
-
