# 분석환경 준비

In [19]:
# 필수 라이브러리
import pandas as pd
import numpy as np
import random
import tensorflow as tf

# 랜덤 시드 고정
SEED=12
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
print("시드 고정: ", SEED)

시드 고정:  12


# 데이터 전처리

In [20]:
train = pd.read_csv("./data/wine/train.csv")#/content/train.csv
test = pd.read_csv("./data/wine/test.csv")
submission = pd.read_csv("./data/wine/sample_submission.csv")

print(train.shape, test.shape, submission.shape)

(5497, 14) (1000, 13) (1000, 2)


In [21]:
train.head(2)

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red


In [22]:
submission.head()

Unnamed: 0,index,quality
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [23]:
train['type'].value_counts()

type
white    4159
red      1338
Name: count, dtype: int64

In [24]:
train['type'] = np.where(train['type']=='white', 1, 0).astype(int)
test['type'] = np.where(test['type']=='white', 1, 0).astype(int)
train['type'].value_counts()

type
1    4159
0    1338
Name: count, dtype: int64

In [25]:
train['quality'].value_counts()

quality
6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: count, dtype: int64

In [26]:
# to_categorical()은 각 샘플의 라벨을 길이 num_classes짜리 벡터로 바꿈, 자기 클래스 위치만 1, 나머지는 0

from tensorflow.keras.utils import to_categorical
y_train = to_categorical(train.loc[:, 'quality'] - 3) #“3점”을 클래스 0, “9점”을 클래스 6으로 바꿔주는 정규화 작업
y_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(5497, 7))

In [27]:
# 피처 선택
X_train = train.loc[:, 'fixed acidity':]
X_test = test.loc[:, 'fixed acidity':]

# 피처 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

print(X_train_scaled.shape, y_train.shape)
print(X_test_scaled.shape)

(5497, 12) (5497, 7)
(1000, 12)


# 신경망 학습

In [28]:
# 심층 신경망 모델
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_model(train_data, train_target):
    model = Sequential()
    model.add(Dense(128, activation='tanh', input_dim=train_data.shape[1]))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='tanh'))
    model.add(Dense(train_target.shape[1], activation='softmax'))

    model.compile(optimizer='RMSprop', loss='categorical_crossentropy',
                metrics=['acc', 'mae'])

    return model

model = build_model(X_train_scaled, y_train)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


# 콜백 함수

In [29]:
# Early Stopping 기법
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

X_tr, X_val, y_tr, y_val = train_test_split(X_train_scaled, y_train, test_size=0.15,
                                            shuffle=True, random_state=SEED)

early_stopping = EarlyStopping(monitor='val_loss',  patience=10)
history = model.fit(X_tr, y_tr, batch_size=64, epochs=200,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping],
                    verbose=2)

Epoch 1/200
73/73 - 1s - 18ms/step - acc: 0.4640 - loss: 1.3206 - mae: 0.1955 - val_acc: 0.5164 - val_loss: 1.1726 - val_mae: 0.1819
Epoch 2/200
73/73 - 0s - 3ms/step - acc: 0.4981 - loss: 1.1841 - mae: 0.1793 - val_acc: 0.5248 - val_loss: 1.1192 - val_mae: 0.1726
Epoch 3/200
73/73 - 0s - 3ms/step - acc: 0.5131 - loss: 1.1485 - mae: 0.1743 - val_acc: 0.5358 - val_loss: 1.0982 - val_mae: 0.1689
Epoch 4/200
73/73 - 0s - 3ms/step - acc: 0.5216 - loss: 1.1285 - mae: 0.1719 - val_acc: 0.5503 - val_loss: 1.0847 - val_mae: 0.1677
Epoch 5/200
73/73 - 0s - 3ms/step - acc: 0.5257 - loss: 1.1199 - mae: 0.1716 - val_acc: 0.5321 - val_loss: 1.0743 - val_mae: 0.1673
Epoch 6/200
73/73 - 0s - 3ms/step - acc: 0.5210 - loss: 1.1095 - mae: 0.1707 - val_acc: 0.5552 - val_loss: 1.0701 - val_mae: 0.1664
Epoch 7/200
73/73 - 0s - 3ms/step - acc: 0.5283 - loss: 1.1029 - mae: 0.1699 - val_acc: 0.5612 - val_loss: 1.0681 - val_mae: 0.1659
Epoch 8/200
73/73 - 0s - 3ms/step - acc: 0.5261 - loss: 1.1023 - mae: 0.170

In [30]:
model.evaluate(X_val, y_val)

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - acc: 0.5648 - loss: 1.0231 - mae: 0.1611 


[1.023134708404541, 0.5648484826087952, 0.1610509306192398]

In [31]:
# test 데이터에 대한 예측값 정리
y_pred_proba = model.predict(X_test_scaled)
y_pred_proba[:5]

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


array([[1.9972293e-04, 5.5295150e-03, 1.8350042e-01, 5.0586295e-01,
        2.6980433e-01, 3.5021160e-02, 8.1995262e-05],
       [5.2239811e-03, 5.2661773e-02, 6.7706543e-01, 2.4830788e-01,
        1.4321783e-02, 2.4157309e-03, 3.4108600e-06],
       [2.1068903e-03, 1.4283612e-02, 6.2976182e-01, 3.4011173e-01,
        1.1041547e-02, 2.6776541e-03, 1.6713462e-05],
       [1.4349333e-03, 3.9099667e-02, 3.9881307e-01, 4.5246866e-01,
        9.6432269e-02, 1.1719931e-02, 3.1412441e-05],
       [3.3740554e-04, 2.0633296e-03, 3.4092564e-02, 2.8969878e-01,
        5.7847840e-01, 9.4987400e-02, 3.4208904e-04]], dtype=float32)

In [32]:
y_pred_label = np.argmax(y_pred_proba, axis=-1) + 3
y_pred_label[:5]

array([6, 5, 5, 6, 7])

In [33]:
# 제출양식에 맞게 정리
submission['quality'] = y_pred_label.astype(int)
submission.head()

Unnamed: 0,index,quality
0,0,6
1,1,5
2,2,5
3,3,6
4,4,7


In [34]:
# 제출파일 저장
submission.to_csv("./data/wine_dnn_001.csv", index=False)