In [None]:
# unacc가 레이블
# 차량 등급 평가

import pandas as pd

# 데이터 로딩
df = pd.read_csv('./car_evaluation.csv')
df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [None]:

print(df.isna().sum())

missing_rows = df[df.isna().any(axis=1)]
print(missing_rows)

vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
unacc      0
dtype: int64
Empty DataFrame
Columns: [vhigh, vhigh.1, 2, 2.1, small, low, unacc]
Index: []


In [4]:
import seaborn as sns

print(df['unacc'].value_counts())

unacc
unacc    1209
acc       384
good       69
vgood      65
Name: count, dtype: int64


In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for i in df.columns:
    df[i] = le.fit_transform(df[i])
df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,3,3,0,0,2,2,2
1,3,3,0,0,2,0,2
2,3,3,0,0,1,1,2
3,3,3,0,0,1,2,2
4,3,3,0,0,1,0,2
...,...,...,...,...,...,...,...
1722,1,1,3,2,1,2,1
1723,1,1,3,2,1,0,3
1724,1,1,3,2,0,1,2
1725,1,1,3,2,0,2,1


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop(columns='unacc')
y = df['unacc']

# 데이터 분할 (훈련 80%, 테스트 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 표준화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 모델 리스트
models = {
    "RF": RandomForestClassifier(random_state=42),
    "DT": DecisionTreeClassifier(random_state=42),
    "LR": LogisticRegression(max_iter=200),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC()
}

# 결과 저장용
results = {}

# 각 모델 학습 및 평가
for name, model in models.items():
    if name in ["KNN", "SVM"]:  # KNN, SVM은 표준화된 데이터 사용
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # 정확도 저장
    acc = accuracy_score(y_test, y_pred)
    results[name] = {"Accuracy": acc, "Confusion Matrix": confusion_matrix(y_test, y_pred)}

# 결과 출력
for name, result in results.items():
    print(f"{name} 정확도: {result['Accuracy']:.4f}")
    print("Confusion Matrix:")
    print(result['Confusion Matrix'])
    print("="*50)

print(models)


RF 정확도: 0.9827
Confusion Matrix:
[[ 75   0   2   0]
 [  1  13   0   0]
 [  2   0 240   0]
 [  1   0   0  12]]
DT 정확도: 0.9884
Confusion Matrix:
[[ 76   1   0   0]
 [  0  14   0   0]
 [  1   0 241   0]
 [  2   0   0  11]]
LR 정확도: 0.6821
Confusion Matrix:
[[  6   0  63   8]
 [  1   0  13   0]
 [ 13   0 228   1]
 [  3   0   8   2]]
KNN 정확도: 0.9624
Confusion Matrix:
[[ 74   0   3   0]
 [  6   8   0   0]
 [  1   0 241   0]
 [  2   1   0  10]]
SVM 정확도: 0.9133
Confusion Matrix:
[[ 70   0   6   1]
 [ 12   2   0   0]
 [  8   0 234   0]
 [  3   0   0  10]]
{'RF': RandomForestClassifier(random_state=42), 'DT': DecisionTreeClassifier(random_state=42), 'LR': LogisticRegression(max_iter=200), 'KNN': KNeighborsClassifier(), 'SVM': SVC()}
