In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------
# 1) Load the two datasets and merge them
# -----------------------------

# Both CSV files are semicolon-delimited, so the separator must be passed
# explicitly. A 'type' column is appended to each frame right after loading so
# the two kinds of wine remain distinguishable once they are stacked together.
red_wine = pd.read_csv(
    "/content/drive/MyDrive/ml-programming-lab/week4/winequality-red.csv",
    sep=';',
).assign(type='red')
white_wine = pd.read_csv(
    "/content/drive/MyDrive/ml-programming-lab/week4/winequality-white.csv",
    sep=';',
).assign(type='white')

# Stack red on top of white and renumber the rows from 0.
wine_data = pd.concat([red_wine, white_wine], ignore_index=True)

# Quick sanity check of the merged frame.
print(wine_data.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality type  
0      9.4        5  red  
1      9.8        5  red  
2    

In [None]:
# -----------------------------
# 2) Split features (X) and target (y), then label-encode the target
# -----------------------------

# Build the feature matrix.
# - 'type' is the prediction target, so it must not appear among the features.
# - 'quality' is not meant to be used for wine-type prediction, so it is dropped
#   too (previously only 'type' was dropped, which silently kept 'quality' as a
#   feature despite the stated intent).
X = wine_data.drop(columns=['type', 'quality'])

# Convert the string labels in 'type' to integers (0 / 1) for the classifiers.
# LabelEncoder assigns codes in sorted class order, i.e. 'red' -> 0, 'white' -> 1.
le = LabelEncoder()
y = le.fit_transform(wine_data['type'])

In [None]:
# Preview the feature matrix. The previous bare `df` referred to a variable that
# is never defined in this notebook, so the cell failed on a fresh kernel.
# `.head()` keeps the output small instead of dumping the whole frame.
X.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
...,...
1594,6.2;0.6;0.08;2;0.09;32;44;0.9949;3.45;0.58;10.5;5
1595,5.9;0.55;0.1;2.2;0.062;39;51;0.99512;3.52;0.76...
1596,6.3;0.51;0.13;2.3;0.076;29;40;0.99574;3.42;0.7...
1597,5.9;0.645;0.12;2;0.075;32;44;0.99547;3.57;0.71...


In [None]:
# -----------------------------
# 3) Split into training and test sets
# -----------------------------

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# -----------------------------
# 4) Feature scaling
# -----------------------------

# Standardize every feature so that columns measured on different scales
# contribute comparably to the models.
# The scaler's statistics are learned from the training split only and then
# applied unchanged to both splits, so nothing from the test set leaks into
# the preprocessing step.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# -----------------------------
# 5) Train and evaluate the models
# -----------------------------
# Every model follows the same recipe: fit on the scaled training split,
# predict the scaled test split, then score the predictions with accuracy.
# `fit` returns the estimator itself, so construction and fitting are chained.

# K-Nearest Neighbors (KNN), voting among the 5 closest training samples
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Decision Tree (seeded for reproducibility)
dt = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_test_scaled)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Random Forest (seeded for reproducibility)
rf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Logistic Regression (seeded for reproducibility)
lr = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

In [None]:
# -----------------------------
# 6) Extract the slope (coefficients) and intercept of the logistic regression
# -----------------------------

# Take the first row of `coef_` (one weight per feature column) and the first
# entry of `intercept_` from the fitted model.
coefficients, intercept = lr.coef_[0], lr.intercept_[0]

In [None]:
# -----------------------------
# 7) Report the results
# -----------------------------

# One formatted line per model, emitted together with a single print call.
accuracy_lines = [
    f"KNN Accuracy: {accuracy_knn:.4f}",
    f"Decision Tree Accuracy: {accuracy_dt:.4f}",
    f"Random Forest Accuracy: {accuracy_rf:.4f}",
    f"Logistic Regression Accuracy: {accuracy_lr:.4f}",
]
print("--- 모델 정확도 ---")
print("\n".join(accuracy_lines))

# Visual separator between the two report sections.
print("\n" + "="*30 + "\n")

# Logistic-regression details: pair each feature name with its learned weight.
print("--- 로지스틱 회귀 상세 정보 ---")
print("기울기 (Coefficients):")
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef:.4f}")

print(f"\n절편 (Intercept): {intercept:.4f}")

--- 모델 정확도 ---
KNN Accuracy: 0.9877
Decision Tree Accuracy: 0.9831
Random Forest Accuracy: 0.9946
Logistic Regression Accuracy: 0.9892


--- 로지스틱 회귀 상세 정보 ---
기울기 (Coefficients):
fixed acidity: -0.6121
volatile acidity: -1.4759
citric acid: 0.3340
residual sugar: 3.3481
chlorides: -0.9287
free sulfur dioxide: -0.8614
total sulfur dioxide: 2.9687
density: -3.1092
pH: -0.3971
sulphates: -0.7658
alcohol: -0.8749
quality: -0.3092

절편 (Intercept): 4.3720
