In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [33]:
# Download the UCI "Car Evaluation" dataset (repository id 19).
car_evaluation = fetch_ucirepo(id=19)

# Feature table and target table (ucimlrepo exposes both as pandas DataFrames).
X, y = car_evaluation.data.features, car_evaluation.data.targets

# Show the dataset-level metadata first, then the per-variable information.
for summary in (car_evaluation.metadata, car_evaluation.variables):
    print(summary)

{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

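## Preprocessing the Data

The target is reduced to a binary label (`good`/`vgood` → 1, all other classes → 0), and every categorical feature column is integer-encoded so that the scikit-learn estimators can consume it.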

In [34]:
# Collapse the target to a binary label: 1 for 'good' / 'vgood', 0 for every other class.
y_binary = y.iloc[:, 0].isin(['good', 'vgood']).astype(int)

# Work on a private copy so the frame owned by `car_evaluation` is never mutated;
# this also keeps the cell idempotent and avoids pandas' SettingWithCopyWarning,
# which can fire when writing into a frame derived from another one.
X = X.copy()

# Integer-encode every non-numeric feature column. Selecting "everything that is
# not a number" instead of testing `dtype == 'object'` also covers pandas'
# dedicated string / categorical dtypes, and plain column assignment gives the
# encoded column a real integer dtype.
# NOTE: LabelEncoder numbers the categories in sorted label order, so the codes
# carry no information about any natural ordering of the categories.
for column in X.select_dtypes(exclude='number').columns:
    X[column] = LabelEncoder().fit_transform(X[column])

## Split Data into Training and Test Sets, Train and Evaluate Classifiers

In [35]:
# Experiment design: every (train fraction, test fraction) pair is evaluated on
# `trials` differently seeded random splits, and the scores are averaged per classifier.
splits = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]
trials = 3
base_seed = 42
classifier_names = ["Decision Tree", "Random Forest", "SVM"]
results = {}

# NOTE(review): only accuracy is reported and the splits are not stratified. If the
# binary target is imbalanced, consider `stratify=y_binary` and a balance-aware
# metric — TODO confirm the class balance before interpreting these scores.
for train_size, test_size in splits:
    # Per-classifier list of one result dict per trial for the current split.
    split_results = {clf_name: [] for clf_name in classifier_names}

    for trial in range(trials):
        # One seed per trial drives both the split and the stochastic estimators,
        # so a fresh Restart & Run All reproduces the same numbers.
        seed = base_seed + trial

        X_train, X_test, y_train, y_test = train_test_split(
            X, y_binary, train_size=train_size, test_size=test_size, random_state=seed
        )

        # Fresh, unfitted estimators for every trial. The tree-based models are
        # randomized and therefore seeded; SVC only uses `random_state` when
        # `probability=True`, so it is left at its default.
        classifiers = {
            "Decision Tree": DecisionTreeClassifier(random_state=seed),
            "Random Forest": RandomForestClassifier(n_estimators=100, random_state=seed),
            "SVM": SVC(kernel='linear'),
        }

        for clf_name, clf in classifiers.items():
            # 5-fold cross-validation on the training portion only. This is an
            # internal performance estimate; no hyperparameters are tuned.
            # cross_val_score fits clones, so `clf` is still unfitted afterwards.
            cv_scores = cross_val_score(clf, X_train, y_train, cv=5)

            # Fit on the full training portion, then score on the held-out test portion.
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_pred)

            split_results[clf_name].append({
                "CV Accuracy (Mean)": cv_scores.mean(),
                "CV Accuracy (Std)": cv_scores.std(),
                "Test Accuracy": test_accuracy
            })

    # Average every metric across the trials of this split.
    # round() rather than int(): int() truncates float products such as
    # 0.29 * 100 == 28.999999999999996, which would mislabel the split.
    split_label = f"Train {round(train_size * 100)}% / Test {round(test_size * 100)}%"
    results[split_label] = {
        clf_name: {
            metric: np.mean([trial_result[metric] for trial_result in metrics])
            for metric in metrics[0]
        }
        for clf_name, metrics in split_results.items()
    }

## Display Results

In [36]:
# Print one indented report per split: the classifier name followed by each
# averaged metric, formatted to four decimal places.
# The loop variables are deliberately named differently from the training cell's
# `split_results` (per-trial result lists), so displaying the results does not
# rebind that name to a different kind of object.
for split, split_summary in results.items():
    print(f"Results for {split} split:")
    for clf_name, clf_metrics in split_summary.items():
        print(f"  {clf_name}:")
        for metric, value in clf_metrics.items():
            print(f"    {metric}: {value:.4f}")
    # print("\n") emits two newlines, leaving a blank gap between split reports.
    print("\n")

Results for Train 20% / Test 80% split:
  Decision Tree:
    CV Accuracy (Mean): 0.9469
    CV Accuracy (Std): 0.0347
    Test Accuracy: 0.9605
  Random Forest:
    CV Accuracy (Mean): 0.9556
    CV Accuracy (Std): 0.0173
    Test Accuracy: 0.9677
  SVM:
    CV Accuracy (Mean): 0.9082
    CV Accuracy (Std): 0.0047
    Test Accuracy: 0.9260


Results for Train 50% / Test 50% split:
  Decision Tree:
    CV Accuracy (Mean): 0.9823
    CV Accuracy (Std): 0.0088
    Test Accuracy: 0.9815
  Random Forest:
    CV Accuracy (Mean): 0.9707
    CV Accuracy (Std): 0.0088
    Test Accuracy: 0.9807
  SVM:
    CV Accuracy (Mean): 0.9236
    CV Accuracy (Std): 0.0027
    Test Accuracy: 0.9213


Results for Train 80% / Test 20% split:
  Decision Tree:
    CV Accuracy (Mean): 0.9879
    CV Accuracy (Std): 0.0084
    Test Accuracy: 0.9817
  Random Forest:
    CV Accuracy (Mean): 0.9824
    CV Accuracy (Std): 0.0093
    Test Accuracy: 0.9827
  SVM:
    CV Accuracy (Mean): 0.9235
    CV Accuracy (Std): 0.0