In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from tensorflow.keras import models, layers
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import classification_report,confusion_matrix

# -----------------------------
# 1) 데이터 불러오기 및 통합
# -----------------------------

# 와인 데이터셋은 반드시 sep=";" 옵션을 지정해야 올바르게 열이 분리됨
red_wine  = pd.read_csv("/content/drive/MyDrive/ml-programming-lab/week4/winequality-red.csv",   sep=";")
white_wine= pd.read_csv("/content/drive/MyDrive/ml-programming-lab/week4/winequality-white.csv", sep=";")

# 와인 종류를 구분하기 위해 'type' 열을 추가
red_wine["type"]  = "red"
white_wine["type"]= "white"

# 레드 와인과 화이트 와인 데이터를 하나로 통합
wine_data = pd.concat([red_wine, white_wine], ignore_index=True)

In [2]:
wine_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [3]:
# quality에 어떤 값들이 있는지 확인
print("quality 종류:", wine_data['quality'].unique())

quality 종류: [5 6 7 4 8 3 9]


In [4]:
# -----------------------------
# 2) 결측치 확인
# -----------------------------

wine_data.isnull().sum()

# -----------------------------
# 3) 결측치 제거
# -----------------------------

wine_data = wine_data.dropna()
wine_data.isnull().sum()

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


In [5]:
# -----------------------------
# 4) 특징(X)과 타겟(y) 분리
# -----------------------------

X = wine_data.drop(columns=['quality'])

# 타겟은 와인 품질 점수 (quality)
y = wine_data['quality']

In [6]:
# -----------------------------
# 5) 라벨 인코딩
# -----------------------------

# 'type'은 문자열(red/white)이므로 숫자(0/1)로 변환
le = LabelEncoder()
X['type'] = le.fit_transform(X['type'])

In [7]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,1
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,1
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,1
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1


In [8]:
y

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5
...,...
6492,6
6493,5
6494,6
6495,7


In [9]:
# -----------------------------
# 6) 데이터 스케일링
# -----------------------------

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [10]:
# -----------------------------
# 7) 타겟 변수 원-핫 인코딩
# -----------------------------

Y = pd.get_dummies(y).values

In [11]:
# -----------------------------
# 8) 훈련 데이터와 테스트 데이터 분리
# -----------------------------

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [12]:
# -----------------------------
# 9) 딥러닝 모델 구성 (Dropout 레이어 제외)
# -----------------------------

model = models.Sequential([
    layers.Dense(64, activation="relu", input_shape=(X.shape[1],)),
    layers.Dense(32, activation="relu"),
    layers.Dense(Y.shape[1], activation="softmax")  # 출력층 노드 수는 클래스의 개수와 동일
])

# 모델 요약 정보 출력
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# -----------------------------
# 10) 모델 컴파일
# -----------------------------

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",  # 원-핫 인코딩된 다중 분류 문제에 사용
    metrics=["accuracy"]
)

In [14]:
# -----------------------------
# 11) 모델 학습
# -----------------------------

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)

Epoch 1/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3852 - loss: 1.5153 - val_accuracy: 0.5452 - val_loss: 1.1224
Epoch 2/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5534 - loss: 1.0882 - val_accuracy: 0.5577 - val_loss: 1.0833
Epoch 3/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5674 - loss: 1.0465 - val_accuracy: 0.5692 - val_loss: 1.0570
Epoch 4/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5702 - loss: 1.0341 - val_accuracy: 0.5779 - val_loss: 1.0554
Epoch 5/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5878 - loss: 1.0116 - val_accuracy: 0.5596 - val_loss: 1.0574
Epoch 6/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5663 - loss: 1.0085 - val_accuracy: 0.5673 - val_loss: 1.0357
Epoch 7/50
[1m260/260[0m 

In [15]:
# -----------------------------
# 12) 모델 예측
# -----------------------------

# 테스트 데이터로 예측 수행
y_pred = model.predict(X_test)

# 원-핫 인코딩된 결과를 단일 클래스 레이블로 변환
y_test_class = np.argmax(y_test, axis=1)
y_pred_class = np.argmax(y_pred, axis=1)

[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [16]:
# -----------------------------
# 13) 모델 평가
# -----------------------------

print("\n< Classification Report >")
print(classification_report(y_test_class,y_pred_class))

print("< Confusion Matrix (rows=true, cols=pred) >")
print(confusion_matrix(y_test_class,y_pred_class))


< Classification Report >
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.14      0.02      0.04        48
           2       0.62      0.59      0.61       413
           3       0.56      0.75      0.65       579
           4       0.54      0.30      0.39       211
           5       0.50      0.07      0.13        41
           6       0.00      0.00      0.00         1

    accuracy                           0.58      1300
   macro avg       0.34      0.25      0.26      1300
weighted avg       0.56      0.58      0.55      1300

< Confusion Matrix (rows=true, cols=pred) >
[[  0   2   5   0   0   0   0]
 [  0   1  31  16   0   0   0]
 [  0   3 245 161   4   0   0]
 [  0   1 106 437  34   1   0]
 [  0   0   8 138  63   2   0]
 [  0   0   0  23  15   3   0]
 [  0   0   0   1   0   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
