In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------
# 1) 데이터 불러오기
# -----------------------------

# 와인 데이터셋은 반드시 sep=";" 옵션을 지정해야 올바르게 열이 분리됨
df = pd.read_csv("/content/drive/MyDrive/ml-programming-lab/week4/winequality-white.csv", sep=';')

In [20]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [21]:
# quality에 어떤 값들이 있는지 확인
print("quality 종류:", df['quality'].unique())

quality 종류: [6 5 7 8 4 3 9]


In [22]:
# -----------------------------
# 2) 결측치 확인
# -----------------------------

df.isnull().sum()

# -----------------------------
# 3) 결측치 제거
# -----------------------------

df = df.dropna()
df.isnull().sum()

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


In [23]:
# -----------------------------
# 4) 특징(X)과 타겟(y) 분리
# -----------------------------

X = df.drop(columns=['quality'])

# 타겟은 와인 품질 점수 (quality)
y = df['quality']

In [24]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8


In [25]:
y

Unnamed: 0,quality
0,6
1,6
2,6
3,6
4,6
...,...
4893,6
4894,5
4895,6
4896,7


In [26]:
# -----------------------------
# 5) 훈련 데이터와 테스트 데이터 분리
# -----------------------------

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# -----------------------------
# 6) 데이터 스케일링
# -----------------------------

# 각 특성의 단위를 통일시켜 모델의 성능을 향상
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
# -----------------------------
# 7) 모델 구성 / 학습 / 예측 / 평가
# -----------------------------

# Logistic Regression
lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_test_scaled)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# -----------------------------
# 8) 모델 정확도 출력
# -----------------------------

print("--- 모델 정확도 ---")
print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")
print(f"Decision Tree Accuracy: {accuracy_dt:.4f}")
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print(f"KNN Accuracy: {accuracy_knn:.4f}")

--- 모델 정확도 ---
Logistic Regression Accuracy: 0.5306
Decision Tree Accuracy: 0.6102
Random Forest Accuracy: 0.6908
KNN Accuracy: 0.5429


In [30]:
# -----------------------------
# 9) 로지스틱 회귀 기울기 및 절편 추출
# -----------------------------

print("--- 로지스틱 회귀 기울기 및 절편 ---")

# 클래스(quatliy) 별 기울기 & 절편 출력
for cls_idx, cls in enumerate(lr.classes_):
    print(f"\n< quality {cls} >")
    print(f"\n| 특성 별 기울기 (coef) |")
    for feature, coef in zip(X.columns, lr.coef_[cls_idx]):
        print(f"{feature}: {coef:.4f}")
    print(f"\n| 절편 (intercept) |\n{lr.intercept_[cls_idx]:.4f}")

--- 로지스틱 회귀 기울기 및 절편 ---

< quality 3 >

| 특성 별 기울기 (coef) |
fixed acidity: 0.7954
volatile acidity: 0.5314
citric acid: -0.0066
residual sugar: -0.3489
chlorides: 0.5075
free sulfur dioxide: 0.4685
total sulfur dioxide: -0.2202
density: -0.0978
pH: 0.2737
sulphates: -0.3352
alcohol: -0.4338

| 절편 (intercept) |
-2.4353

< quality 4 >

| 특성 별 기울기 (coef) |
fixed acidity: -0.3563
volatile acidity: 0.6393
citric acid: 0.0081
residual sugar: -1.2427
chlorides: 0.2386
free sulfur dioxide: -0.7843
total sulfur dioxide: -0.0409
density: 1.2443
pH: -0.4368
sulphates: -0.0931
alcohol: -0.5871

| 절편 (intercept) |
-0.1213

< quality 5 >

| 특성 별 기울기 (coef) |
fixed acidity: -0.4907
volatile acidity: 0.2615
citric acid: 0.0067
residual sugar: -0.5147
chlorides: 0.2143
free sulfur dioxide: -0.2434
total sulfur dioxide: 0.2095
density: 0.6665
pH: -0.4875
sulphates: -0.0760
alcohol: -0.8912

| 절편 (intercept) |
2.5324

< quality 6 >

| 특성 별 기울기 (coef) |
fixed acidity: -0.6161
volatile acidity: -0.3410
ci