# Car Evaluation

## Setup

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import metrics

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Load the Data

In [23]:
url = "https://archive.ics.uci.edu/static/public/19/data.csv"
df = pd.read_csv(url)

In [24]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [26]:
for col in df.columns:
    print(df[col].value_counts())

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64
vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64
2        432
3        432
4        432
5more    432
Name: doors, dtype: int64
2       576
4       576
more    576
Name: persons, dtype: int64
small    576
med      576
big      576
Name: lug_boot, dtype: int64
low     576
med     576
high    576
Name: safety, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64


## Data Modeling
Two criteria:
1. Treat input variables as numeric
2. Treat input variables as categorical 

Flow:
- Step 1: Nested CV - identify the best model 
- Step 2: GridSearchCV - Find the best hyperparameter set for the best model
- Step 3: Build the optimal model based on the results of GridSearchCV

### 1. Treat input variables as numerical

### Step 1: Nested CV - identify the best model 

### Data Processing - ordinal encoding and label encoding

In [27]:
# separate into input and output columns
X = df.drop('class', axis=1)
y = df['class']

# ordinal encode input variables
ordinal_encoder = OrdinalEncoder()
X = ordinal_encoder.fit(X).transform(X)

# lable encode target variables
label_encoder = LabelEncoder()
y = label_encoder.fit(y).transform(y)

### 1. Decision Tree

In [28]:
# configure inner loop cross-validation
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)

# define the model
clf = tree.DecisionTreeClassifier(random_state=42)

# define hyparameter to search
param_grid = {
    'max_depth': list(range(1, 21)),
    'criterion': ['gini','entropy']
}

# define search - using GridSearchCV and refit the model with the best hyparameter set
search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=cv_inner, refit=True)

# configure outer loop cross-validation
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)

# executed the nested CV
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer)

# report average performace
print('Accuracy: {0:.5f}'.format(np.mean(scores)))

Accuracy: 0.98148


### 2. Logistic Regression

In [29]:
# configure inner loop cross-validation
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)

# define the model
clf = LogisticRegression(random_state=42, max_iter=1000)

# define hyparameter to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# define search - using GridSearchCV and refit the model with the best hyparameter set
search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=cv_inner, refit=True)

# configure outer loop cross-validation
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)

# executed the nested CV
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer)

# report average performace
print('Accuracy: {0:.5f}'.format(np.mean(scores)))

Accuracy: 0.70193


### 3. SVM

In [30]:
# configure inner loop cross-validation
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)

# define the model
clf = SVC(random_state=42)

# define hyparameter to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1]
}

# define search - using GridSearchCV and refit the model with the best hyparameter set
search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=cv_inner, refit=True)

# configure outer loop cross-validation
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)

# executed the nested CV
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer)

# report average performace
print('Accuracy: {0:.5f}'.format(np.mean(scores)))

Accuracy: 0.98206


### 4. KNN

In [31]:
# configure inner loop cross-validation
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)

# define the model
clf = KNeighborsClassifier()

# define hyparameter to search
param_grid = {'n_neighbors': list(range(1,21))}

# define search - using GridSearchCV and refit the model with the best hyparameter set
search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=cv_inner, refit=True)
 
# configure outer loop cross-validation
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)

# executed the nested CV
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer)

# report average performace
print('Accuracy: {0:.5f}'.format(np.mean(scores)))

Accuracy: 0.87380


### The SVM model has the best performance with an accuracy of 0.98206

### Step 2: GridSearchCV - Find the best hyperparameter set for the best model

In [32]:
# separate into input and output columns
X = df.drop('class', axis=1)
y = df['class']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=38)

# fit encoding on the training data first, then apply to training and testing data
# ordinal encode input variables
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train)
X_train = ordinal_encoder.transform(X_train)

# lable encode target variables
label_encoder = LabelEncoder()
label_encoder.fit(y_train) 
y_train = label_encoder.transform(y_train)

# define the model 
svm_clf_grid = SVC()

# define hyparameter to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1]
}

# define search
grid_search = GridSearchCV(svm_clf_grid, param_grid, scoring='accuracy', cv=5)

# fit the model
grid_search.fit(X_train, y_train)

### Best parameter set

In [33]:
grid_search.best_params_

{'C': 10, 'gamma': 0.1}

### Step 3: Build the optimal model based on the results of GridSearchCV

### Train and fit the model

In [34]:
# separate into input and output columns
X = df.drop('class', axis=1)
y = df['class']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=38)

# fit encoding on the training data first, then apply to training and testing data
# ordinal encode input variables
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train)
X_train = ordinal_encoder.transform(X_train)
X_test = ordinal_encoder.transform(X_test)

# lable encode target variables
label_encoder = LabelEncoder()
label_encoder.fit(y_train) 
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# define the model
svm_clf_best = SVC(C=10, gamma=0.1)

# fit on training dataset
svm_clf_best.fit(X_train, y_train)

# predict on testing dataset
y_pred = svm_clf_best.predict(X_test)
# y_score = tree_clf_best.predict_proba(X_test)

### Performance Measures

### Precision, Recall, Accuracy, F1

In [35]:
print("Accuracy: " + str(round(metrics.accuracy_score(y_test, y_pred),5)))

Accuracy: 0.96322


In [36]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90       104
           1       0.95      0.88      0.91        24
           2       1.00      0.97      0.98       415
           3       0.97      1.00      0.98        28

    accuracy                           0.96       571
   macro avg       0.94      0.95      0.95       571
weighted avg       0.97      0.96      0.96       571



### Confusion martix

In [37]:
metrics.confusion_matrix(y_test, y_pred)

array([[100,   1,   2,   1],
       [  3,  21,   0,   0],
       [ 14,   0, 401,   0],
       [  0,   0,   0,  28]])

### By treating the target variables as numeric, we achieved the best model performance with the SVM, resulting in an overall accuracy of 0.96322

### 2. Treat input variables as categorical

### Step 1: Nested CV - identify the best model 

### Data Processing - one−hot encoding and label encoding

In [38]:
# separate into input and output columns
X = df.drop('class', axis=1)
y = df['class']

# ordinal encode input variables
onehot_encoder = OneHotEncoder(sparse=False)
X = onehot_encoder.fit(X).transform(X)

# lable encode target variables
label_encoder = LabelEncoder()
y = label_encoder.fit(y).transform(y)

### 1. Decision Tree

In [39]:
# configure inner loop cross-validation
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)

# define the model
clf = tree.DecisionTreeClassifier(random_state=42)

# define hyparameter to search
param_grid = {
    'max_depth': list(range(1, 21)),
    'criterion': ['gini','entropy']
}

# define search - using GridSearchCV and refit the model with the best hyparameter set
search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=cv_inner, refit=True)

# configure outer loop cross-validation
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)

# executed the nested CV
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer)

# report average performace
print('Accuracy: {0:.5f}'.format(np.mean(scores)))

Accuracy: 0.97048


### 2. Logistic Regression

In [40]:
# configure inner loop cross-validation
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)

# define the model
clf = LogisticRegression(random_state=42, max_iter=1000)

# define hyparameter to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# define search - using GridSearchCV and refit the model with the best hyparameter set
search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=cv_inner, refit=True)

# configure outer loop cross-validation
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)

# executed the nested CV
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer)

# report average performace
print('Accuracy: {0:.5f}'.format(np.mean(scores)))

Accuracy: 0.93170


### 3. SVM

In [41]:
# configure inner loop cross-validation
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)

# define the model
clf = SVC(random_state=42)

# define hyparameter to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1]
}

# define search - using GridSearchCV and refit the model with the best hyparameter set
search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=cv_inner, refit=True)

# configure outer loop cross-validation
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)

# executed the nested CV
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer)

# report average performace
print('Accuracy: {0:.5f}'.format(np.mean(scores)))

Accuracy: 0.99248


### 4. KNN

In [42]:
# configure inner loop cross-validation
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)

# define the model
clf = KNeighborsClassifier()

# define hyparameter to search
param_grid = {'n_neighbors': list(range(1,21))}

# define search - using GridSearchCV and refit the model with the best hyparameter set
search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=cv_inner, refit=True)

# configure outer loop cross-validation
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)

# executed the nested CV
scores = cross_val_score(search, X, y, scoring='accuracy', cv=cv_outer)

# report average performace
print('Accuracy: {0:.5f}'.format(np.mean(scores)))

Accuracy: 0.87441


### The SVM model has the best performance with an accuracy of 0.99248

### Step 2: GridSearchCV - Find the best hyperparameter set for the best model

In [43]:
# separate into input and output columns
X = df.drop('class', axis=1)
y = df['class']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=38)

# fit encoding on the training data first, then apply to training and testing data
# ordinal encode input variables
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)

# lable encode target variables
label_encoder = LabelEncoder()
label_encoder.fit(y_train) 
y_train = label_encoder.transform(y_train)

# define the model 
svm_clf_grid = SVC()

# define hyparameter to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1]
}

# define search
grid_search = GridSearchCV(svm_clf_best, param_grid, scoring='accuracy', cv=5)

# fit the model
grid_search.fit(X_train, y_train)

In [44]:
grid_search.best_params_

{'C': 10, 'gamma': 0.1}

### Step 3: Build the optimal model based on the results of GridSearchCV

### Train and fit the model

In [45]:
# separate into input and output columns
X = df.drop('class', axis=1)
y = df['class']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# fit encoding on the training data first, then apply to training and testing data
# onehot encode input variables
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)

# lable encode target variables
label_encoder = LabelEncoder()
label_encoder.fit(y_train) 
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# define the model
svm_clf_best = SVC(C=10, gamma=0.1)

# fit on training dataset
svm_clf_best.fit(X_train, y_train)

# predict on testing dataset
y_pred = svm_clf_best.predict(X_test)

### Performance Measures

### Precision, Recall, Accuracy, F1

In [46]:
print("Accuracy: " + str(round(metrics.accuracy_score(y_test, y_pred),5)))

Accuracy: 0.99299


In [47]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       123
           1       1.00      0.91      0.95        23
           2       1.00      1.00      1.00       406
           3       1.00      1.00      1.00        19

    accuracy                           0.99       571
   macro avg       0.99      0.98      0.98       571
weighted avg       0.99      0.99      0.99       571



### Confusion Matirx

In [48]:
metrics.confusion_matrix(y_test, y_pred)

array([[122,   0,   1,   0],
       [  2,  21,   0,   0],
       [  1,   0, 405,   0],
       [  0,   0,   0,  19]])

### By treating the target variables as categorical, it achieved the best model performance with the SVM, resulting in an overall accuracy of 0.99299.

### Conclusion: SVM demonstrates better predictive performance compared to the other three models, irrespective of whether the target variable is treated as numeric or categorical. Nevertheless, in the final model, treating the target variables as categorical results in a higher accuracy of 0.99299.