In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from tensorflow.keras import models, layers
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import classification_report,confusion_matrix

# -----------------------------
# 1) 데이터 불러오기
# -----------------------------

# 와인 데이터셋은 반드시 sep=";" 옵션을 지정해야 올바르게 열이 분리됨
df= pd.read_csv("/content/drive/MyDrive/ml-programming-lab/week4/winequality-white.csv", sep=";")

In [17]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [18]:
# quality에 어떤 값들이 있는지 확인
print("quality 종류:", df['quality'].unique())

quality 종류: [6 5 7 8 4 3 9]


In [19]:
# -----------------------------
# 2) 결측치 확인
# -----------------------------

df.isnull().sum()

# -----------------------------
# 3) 결측치 제거
# -----------------------------

df = df.dropna()
df.isnull().sum()

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


In [20]:
# -----------------------------
# 4) 특징(X)과 타겟(y) 분리
# -----------------------------

X = df.drop(columns=['quality'])

# 타겟은 와인 품질 점수 (quality)
y = df['quality']

In [21]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8


In [22]:
y

Unnamed: 0,quality
0,6
1,6
2,6
3,6
4,6
...,...
4893,6
4894,5
4895,6
4896,7


In [23]:
# -----------------------------
# 5) 데이터 스케일링
# -----------------------------

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [24]:
# -----------------------------
# 6) 타겟 변수 원-핫 인코딩
# -----------------------------

Y = pd.get_dummies(y).values

In [25]:
# -----------------------------
# 7) 훈련 데이터와 테스트 데이터 분리
# -----------------------------

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [26]:
# -----------------------------
# 8) 딥러닝 모델 구성 (Dropout 레이어 제외)
# -----------------------------

model = models.Sequential([
    layers.Dense(64, activation="relu", input_shape=(X.shape[1],)),
    layers.Dense(32, activation="relu"),
    layers.Dense(Y.shape[1], activation="softmax")  # 출력층 노드 수는 클래스의 개수와 동일
])

# 모델 요약 정보 출력
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [27]:
# -----------------------------
# 9) 모델 컴파일
# -----------------------------

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",  # 원-핫 인코딩된 다중 분류 문제에 사용
    metrics=["accuracy"]
)

In [28]:
# -----------------------------
# 10) 모델 학습
# -----------------------------

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)

Epoch 1/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.4131 - loss: 1.5311 - val_accuracy: 0.5510 - val_loss: 1.1267
Epoch 2/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5542 - loss: 1.1105 - val_accuracy: 0.5497 - val_loss: 1.0703
Epoch 3/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5572 - loss: 1.0719 - val_accuracy: 0.5625 - val_loss: 1.0410
Epoch 4/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5841 - loss: 1.0045 - val_accuracy: 0.5446 - val_loss: 1.0441
Epoch 5/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5719 - loss: 1.0164 - val_accuracy: 0.5676 - val_loss: 1.0248
Epoch 6/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5970 - loss: 0.9687 - val_accuracy: 0.5676 - val_loss: 1.0222
Epoch 7/50
[1m196/196[0m 

In [29]:
# -----------------------------
# 11) 모델 예측
# -----------------------------

# 테스트 데이터로 예측 수행
y_pred = model.predict(X_test)

# 원-핫 인코딩된 결과를 단일 클래스 레이블로 변환
y_test_class = np.argmax(y_test, axis=1)
y_pred_class = np.argmax(y_pred, axis=1)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [30]:
# -----------------------------
# 12) 모델 평가
# -----------------------------

print("\n< Classification Report >")
print(classification_report(y_test_class,y_pred_class))

print("< Confusion Matrix (rows=true, cols=pred) >")
print(confusion_matrix(y_test_class,y_pred_class))


< Classification Report >
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.71      0.20      0.31        51
           2       0.61      0.51      0.56       295
           3       0.51      0.76      0.61       409
           4       0.53      0.31      0.39       183
           5       0.20      0.03      0.05        33

    accuracy                           0.54       980
   macro avg       0.43      0.30      0.32       980
weighted avg       0.54      0.54      0.51       980

< Confusion Matrix (rows=true, cols=pred) >
[[  0   1   4   4   0   0]
 [  0  10  24  15   2   0]
 [  0   2 151 140   2   0]
 [  1   1  61 312  33   1]
 [  0   0   8 116  56   3]
 [  0   0   0  19  13   1]]
