In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------
# 1) 데이터 불러오기 및 통합
# -----------------------------

# 와인 데이터셋은 반드시 sep=";" 옵션을 지정해야 올바르게 열이 분리됨
red_wine = pd.read_csv("/content/drive/MyDrive/ml-programming-lab/week4/winequality-red.csv", sep=';')
white_wine = pd.read_csv("/content/drive/MyDrive/ml-programming-lab/week4/winequality-white.csv", sep=';')

# 와인 종류를 구분하기 위해 'type' 열을 추가
red_wine['type'] = 'red'
white_wine['type'] = 'white'

# 레드 와인과 화이트 와인 데이터를 하나로 통합
wine_data = pd.concat([red_wine, white_wine], ignore_index=True)

In [2]:
wine_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [3]:
# quality에 어떤 값들이 있는지 확인
print("quality 종류:", wine_data['quality'].unique())

quality 종류: [5 6 7 4 8 3 9]


In [4]:
# -----------------------------
# 2) 결측치 확인
# -----------------------------

wine_data.isnull().sum()

# -----------------------------
# 3) 결측치 제거
# -----------------------------

wine_data = wine_data.dropna()
wine_data.isnull().sum()

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


In [5]:
# -----------------------------
# 4) 특징(X)과 타겟(y) 분리
# -----------------------------

X = wine_data.drop(columns=['quality'])

# 타겟은 와인 품질 점수 (quality)
y = wine_data['quality']

In [6]:
# -----------------------------
# 5) 라벨 인코딩
# -----------------------------

# 'type'은 문자열(red/white)이므로 숫자(0/1)로 변환
le = LabelEncoder()
X['type'] = le.fit_transform(X['type'])

In [7]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,1
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,1
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,1
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1


In [8]:
y

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5
...,...
6492,6
6493,5
6494,6
6495,7


In [9]:
# -----------------------------
# 6) 훈련 데이터와 테스트 데이터 분리
# -----------------------------

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# -----------------------------
# 7) 데이터 스케일링
# -----------------------------

# 각 특성의 단위를 통일시켜 모델의 성능을 향상
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# -----------------------------
# 8) 모델 구성 / 학습 / 예측 / 평가
# -----------------------------

# Logistic Regression
lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_test_scaled)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# -----------------------------
# 9) 모델 정확도 출력
# -----------------------------

print("--- 모델 정확도 ---")
print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")
print(f"Decision Tree Accuracy: {accuracy_dt:.4f}")
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print(f"KNN Accuracy: {accuracy_knn:.4f}")

--- 모델 정확도 ---
Logistic Regression Accuracy: 0.5423
Decision Tree Accuracy: 0.6115
Random Forest Accuracy: 0.6646
KNN Accuracy: 0.5454


In [13]:
# -----------------------------
# 10) 로지스틱 회귀 기울기 및 절편 추출
# -----------------------------

print("--- 로지스틱 회귀 기울기 및 절편 ---")

# 클래스(quatliy) 별 기울기 & 절편 출력
for cls_idx, cls in enumerate(lr.classes_):
    print(f"\n< quality {cls} >")
    print(f"\n| 특성 별 기울기 (coef) |")
    for feature, coef in zip(X.columns, lr.coef_[cls_idx]):
        print(f"{feature}: {coef:.4f}")
    print(f"\n| 절편 (intercept) |\n{lr.intercept_[cls_idx]:.4f}")

--- 로지스틱 회귀 기울기 및 절편 ---

< quality 3 >

| 특성 별 기울기 (coef) |
fixed acidity: 0.5911
volatile acidity: 0.9219
citric acid: -0.0434
residual sugar: -0.1257
chlorides: 0.5816
free sulfur dioxide: 0.6151
total sulfur dioxide: -0.4774
density: -0.1314
pH: 0.4008
sulphates: -0.4332
alcohol: -0.6694
type: 0.8715

| 절편 (intercept) |
-2.1945

< quality 4 >

| 특성 별 기울기 (coef) |
fixed acidity: -0.3435
volatile acidity: 0.7516
citric acid: -0.0561
residual sugar: -0.8319
chlorides: 0.2568
free sulfur dioxide: -1.0418
total sulfur dioxide: 0.1881
density: 0.7588
pH: -0.3320
sulphates: -0.1383
alcohol: -0.5311
type: 1.0290

| 절편 (intercept) |
-0.1892

< quality 5 >

| 특성 별 기울기 (coef) |
fixed acidity: -0.5636
volatile acidity: 0.2333
citric acid: 0.0608
residual sugar: -0.5487
chlorides: 0.1608
free sulfur dioxide: -0.2415
total sulfur dioxide: 0.4741
density: 0.5453
pH: -0.4303
sulphates: -0.1535
alcohol: -0.8195
type: -0.4687

| 절편 (intercept) |
2.5387

< quality 6 >

| 특성 별 기울기 (coef) |
fixed acidi