In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


# -----------------------------
# 1) Data preparation
# -----------------------------

# Column names applied to the CSV: six feature columns followed by the target.
col = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "label"]

# header=None means the file is read as having no header row: the first row is
# kept as data and the names above are assigned to the columns.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/car_evaluation.csv",
    header=None,
    names=col,
)
# Discard any row that contains a missing value.
df = df.dropna()

# Separate the target column from the feature columns.
y = df["label"]
X = df.drop(columns=["label"])

# Every feature column is treated as categorical.
categorical_features = X.columns

# One-hot encode the categorical columns; categories unseen during fit are
# ignored at transform time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="passthrough",  # any column not listed above is kept unchanged
)

# Hold out 20% of the rows as the test set. stratify=y keeps the label
# proportions the same in both splits so no class ends up bunched in one of them.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)



# -----------------------------
# 2) Model configuration
# -----------------------------

# Each pipeline chains the one-hot preprocessing with one classifier.
# Pipeline.fit fits its steps in place, so handing the same `preprocessor`
# object to all three pipelines would make them share (and overwrite) a single
# fitted transformer. clone() gives every pipeline its own unfitted copy with
# identical parameters.
dt_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", RandomForestClassifier(n_estimators=200, random_state=42)),
    ]
)
lr_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        # max_iter is raised above the library default for logistic regression.
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)



# -----------------------------
# 3) Model training
# -----------------------------

# Fit each pipeline (preprocessing + classifier) on the training split,
# in the order decision tree, random forest, logistic regression.
for _pipeline in (dt_pipeline, rf_pipeline, lr_pipeline):
    _pipeline.fit(X_train, y_train)


# -----------------------------
# 4) Model evaluation
# -----------------------------

# Accuracy of each fitted pipeline on the held-out test split.
dt_acc = accuracy_score(y_true=y_test, y_pred=dt_pipeline.predict(X_test))
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pipeline.predict(X_test))
lr_acc = accuracy_score(y_true=y_test, y_pred=lr_pipeline.predict(X_test))

# Report the three scores, one per line, rounded to four decimal places.
print(
    "=== Test Accuracy ===",
    f"Decision Tree : {dt_acc:.4f}",
    f"Random Forest : {rf_acc:.4f}",
    f"Logistic Reg. : {lr_acc:.4f}",
    sep="\n",
)

=== Test Accuracy ===
Decision Tree : 0.9740
Random Forest : 0.9798
Logistic Reg. : 0.9017
