In [46]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [47]:
# 데이터 로딩
file_path = "D:/AI_data/car_evaluation.csv"

df = pd.read_csv(file_path)
print(df.columns)
df

Index(['vhigh', 'vhigh.1', '2', '2.1', 'small', 'low', 'unacc'], dtype='object')


Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [48]:
# 결측치 확인
df.isnull().sum() # 결측치 개수 출력

vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
unacc      0
dtype: int64

In [49]:
# 레이블 확인
df['unacc'].value_counts()

unacc
unacc    1209
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [50]:
# encoding
# LabelEncoder 초기화
label_encoder = LabelEncoder() # 문자형 데이터를 숫자로 변환 (범주형 데이터 -> 정수)

for column in df.columns:
    df[column] = label_encoder.fit_transform(df[column])

df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,3,3,0,0,2,2,2
1,3,3,0,0,2,0,2
2,3,3,0,0,1,1,2
3,3,3,0,0,1,2,2
4,3,3,0,0,1,0,2
...,...,...,...,...,...,...,...
1722,1,1,3,2,1,2,1
1723,1,1,3,2,1,0,3
1724,1,1,3,2,0,1,2
1725,1,1,3,2,0,2,1


In [51]:
# 변환 후 데이터 개수 확인
for column in df.columns:
    print(df[column].value_counts())

vhigh
0    432
2    432
1    432
3    431
Name: count, dtype: int64
vhigh.1
0    432
2    432
1    432
3    431
Name: count, dtype: int64
2
1    432
2    432
3    432
0    431
Name: count, dtype: int64
2.1
1    576
2    576
0    575
Name: count, dtype: int64
small
1    576
0    576
2    575
Name: count, dtype: int64
low
2    576
0    576
1    575
Name: count, dtype: int64
unacc
2    1209
0     384
1      69
3      65
Name: count, dtype: int64


In [52]:
# 데이터 분할
X = df.drop(columns=['unacc'])
y = df['unacc']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
# 데이터 정규화
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
# 랜덤 포레스트 학습 및 평가
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))

Random Forest Accuracy: 0.9740
Confusion Matrix:
[[ 75   1   0   1]
 [  1  10   0   4]
 [  1   0 236   0]
 [  1   0   0  16]]


In [55]:
# 결정 트리 학습 및 평가
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)

print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_pred))

Decision Tree Accuracy: 0.9711
Confusion Matrix:
[[ 73   2   0   2]
 [  2  12   0   1]
 [  1   0 236   0]
 [  0   2   0  15]]


In [56]:
# 로지스틱 회귀 학습 및 평가
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)

print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, lr_pred))

Logistic Regression Accuracy: 0.6850
Confusion Matrix:
[[ 12   0  64   1]
 [  2   0  13   0]
 [ 13   0 224   0]
 [ 11   0   5   1]]


In [57]:
# KNN 학습 및 평가
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)

print(f"KNN Model Accuracy: {knn_accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, knn_pred))

KNN Model Accuracy: 0.9509
Confusion Matrix:
[[ 75   0   2   0]
 [  9   5   1   0]
 [  1   0 236   0]
 [  4   0   0  13]]


In [58]:
# SVM 학습 및 평가
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)

print(f"SVM Accuracy: {svm_accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, svm_pred))

SVM Accuracy: 0.8873
Confusion Matrix:
[[ 69   0   8   0]
 [ 14   0   0   1]
 [ 13   0 224   0]
 [  3   0   0  14]]
