In [87]:
# import necessary libraries and specify that graphs should be plotted inline. 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scikitplot as skplt
import warnings
warnings.filterwarnings('ignore')

from sklearn import tree, metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import neighbors, datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_curve,auc
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import matthews_corrcoef

### Data Exploration

In [88]:
#load dataset
car_df = pd.read_csv('car.data')
columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
car_df.columns = columns
car_df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [89]:
print("buying: ",car_df.buying.unique())
print("maint: ",car_df.maint.unique())
print("doors: ",car_df.doors.unique())
print("persons: ",car_df.persons.unique())
print("lug_boot: ",car_df.lug_boot.unique())
print("safety: ",car_df.safety.unique())

buying:  ['vhigh' 'high' 'med' 'low']
maint:  ['vhigh' 'high' 'med' 'low']
doors:  ['2' '3' '4' '5more']
persons:  ['2' '4' 'more']
lug_boot:  ['small' 'med' 'big']
safety:  ['med' 'high' 'low']


In [90]:
# Check missing value
car_df.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [91]:
# Check if there is imbalanced data
car_df.groupby('class')['class'].count()

class
acc       384
good       69
unacc    1209
vgood      65
Name: class, dtype: int64

## Transform ordinal features into categorical

Pros: Treating them as categories avoids making the assumption of a linear relationship between categories(such as integer mapping), ensuring that the model captures the true nature of the ordinal variable. Moreover, it preserves this inherent ordinal information, allowing the model to capture the relationship between categories based on their order.

Cons: Increased dimensionality.

In [92]:
# define dependent and independent variables
x = car_df.iloc[:,:6]
y = car_df['class']

In [93]:
#  treat ordinal variables as categorical
nominal_features = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
x = pd.get_dummies(x, columns=nominal_features, drop_first=True)

### Modeling

In [94]:
# Cross-validation for inner and outer loops
i = 42
inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
score = 'accuracy'

In [95]:
# Desicion tree

# Tuning hyperparameters
criterions = ['gini', 'entropy']
d_rng =  list(range(1,10)) # max_depth
s_rng = list(range(2,10)) # min_samples_split
t_grid = dict(criterion = criterions,max_depth = d_rng,min_samples_split = s_rng)

# model
tree = DecisionTreeClassifier()

# non-nested parameter search and scoring
tree_clf = GridSearchCV(tree, t_grid, cv=inner_cv, scoring='accuracy')

# nested CV with parameter opitimization
tree_score = cross_val_score(tree_clf,X=x,y=y, cv=outer_cv)

In [96]:
#knn

# Tuning hyperparameters
weights = ['uniform', 'distance']
k_values = list(range(1,30))
k_grid = dict(weights = weights,n_neighbors = k_values)

# model
knn = KNeighborsClassifier()

# non-nested parameter search and scoring
knn_clf = GridSearchCV(knn, k_grid, cv=inner_cv, scoring='accuracy')

# nested CV with parameter opitimization
knn_score = cross_val_score(knn_clf,X=x,y=y, cv=outer_cv)

In [97]:
#logistic regression

# Tuning hyperparameters
penalty = ['l1', 'l2', 'elasticnet']
c_values = [10**i for i in range(-5,9)]
l_grid = dict(penalty = penalty, C = c_values)

# model
lg = LogisticRegression()

# non-nested parameter search and scoring
lg_clf = GridSearchCV(lg, l_grid, cv = inner_cv, scoring = 'accuracy')

# nested CV with parameter opitimization
lg_score = cross_val_score(lg_clf, X=x, y=y, cv=outer_cv)

In [98]:
#svm
# Tuning hyper-parameter
g = [0.001, 0.01, 0.1, 1, 10, 100] # gamma
c = [0.1, 1, 10, 100]
s_grid = dict(C = c, gamma = g )

# model
svm = SVC(probability = True)

# non-nested parameter search and scoring
svm_clf = GridSearchCV(svm, s_grid, cv=inner_cv, scoring='accuracy')

# nested CV with parameter opitimization
svm_score = cross_val_score(svm_clf,X=x,y=y, cv=outer_cv)

In [99]:
score = {}
score['Decision Tree'] = tree_score.mean()
score['KNN'] = knn_score.mean()
score['Logistic Regression'] = lg_score.mean()
score['SVM'] = svm_score.mean()
score

{'Decision Tree': 0.9096711996218958,
 'KNN': 0.8256879994844032,
 'Logistic Regression': 0.9305085610552548,
 'SVM': 0.9947889812666495}

## Transform ordinal features into numeric 

Pros: It simplifies the data preprocessing step as you don't need to create additional dummy variables.

Cons: Treating ordinal variables as numeric assumes that the intervals between the categories are equal, which may not be the case in reality. This assumption can lead to incorrect interpretations.

In [100]:
#load dataset
car_df = pd.read_csv('car.data')
columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
car_df.columns = columns

In [101]:
# define dependent and independent variables
x = car_df.iloc[:,:6]
y = car_df['class']

In [102]:
# treat ordinal variables as numeric
label_encoder = LabelEncoder()

for column in x.columns:
    x[column] = label_encoder.fit_transform(x[column])

### Modeling

In [103]:
# Cross-validation for inner and outer loops
i = 42
inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
score = 'accuracy'

In [104]:
# Desicion tree

# Tuning hyperparameters
criterions = ['gini', 'entropy']
d_rng =  list(range(1,10)) # max_depth
s_rng = list(range(2,10)) # min_samples_split
t_grid = dict(criterion = criterions,max_depth = d_rng,min_samples_split = s_rng)

# model
tree = DecisionTreeClassifier()

# non-nested parameter search and scoring
tree_clf = GridSearchCV(tree, t_grid, cv=inner_cv, scoring='accuracy')

# nested CV with parameter opitimization
tree_score = cross_val_score(tree_clf,X=x,y=y, cv=outer_cv)

In [105]:
#knn

# Tuning hyperparameters
weights = ['uniform', 'distance']
k_values = list(range(1,30))
k_grid = dict(weights = weights,n_neighbors = k_values)

# model
knn = KNeighborsClassifier()

# non-nested parameter search and scoring
knn_clf = GridSearchCV(knn, k_grid, cv=inner_cv, scoring='accuracy')

# nested CV with parameter opitimization
knn_score = cross_val_score(knn_clf,X=x,y=y, cv=outer_cv)

In [106]:
#logistic regression

# Tuning hyperparameters
penalty = ['l1', 'l2', 'elasticnet']
c_values = [10**i for i in range(-5,9)]
l_grid = dict(penalty = penalty, C = c_values)

# model
lg = LogisticRegression()

# non-nested parameter search and scoring
lg_clf = GridSearchCV(lg, l_grid, cv = inner_cv, scoring = 'accuracy')

# nested CV with parameter opitimization
lg_score = cross_val_score(lg_clf, X=x, y=y, cv=outer_cv)

In [107]:
#svm
# Tuning hyper-parameter
g = [0.001, 0.01, 0.1, 1, 10, 100] # gamma
c = [0.1, 1, 10, 100]
s_grid = dict(C = c, gamma = g )

# model
svm = SVC(probability = True)

# non-nested parameter search and scoring
svm_clf = GridSearchCV(svm, s_grid, cv=inner_cv, scoring='accuracy')

# nested CV with parameter opitimization
svm_score = cross_val_score(svm_clf,X=x,y=y, cv=outer_cv)

In [108]:
score = {}
score['Decision Tree'] = tree_score.mean()
score['KNN'] = knn_score.mean()
score['Logistic Regression'] = lg_score.mean()
score['SVM'] = svm_score.mean()
score

{'Decision Tree': 0.9490364784738334,
 'KNN': 0.9044736078886311,
 'Logistic Regression': 0.704673133109908,
 'SVM': 0.9907353699407064}

## Final Model

Considering that SVM has the highest accuracy (0.9948), it appears to be the best-performing model on this dataset based on the cross validation mean accuracy scores. 

In [116]:
#load dataset
car_df = pd.read_csv('car.data')
columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
car_df.columns = columns

# define dependent and independent variables
x = car_df.iloc[:,:6]
y = car_df['class']

#  treat ordinal variables as categorical
nominal_features = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
x = pd.get_dummies(x, columns=nominal_features, drop_first=True)

In [117]:
# Split the dataset into training(70%) and testing(30%) sets
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3)

In [118]:
# Tuning hyper-parameter
g = [0.001, 0.01, 0.1, 1, 10, 100] # gamma
c = [0.1, 1, 10, 100]
s_grid = dict(C = c, gamma = g )

# model
svm = SVC(probability = True)

# hyper-parameter search and scoring
svm_clf = GridSearchCV(svm, s_grid, cv=5, scoring='accuracy')

#fit the model
svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)

In [119]:
print('best params: ', svm_clf.best_params_)
print('best score: ', svm_clf.best_score_)
print(classification_report(y_test, y_pred))

best params:  {'C': 100, 'gamma': 0.1}
best score:  0.9842735159973938
              precision    recall  f1-score   support

         acc       1.00      0.98      0.99       123
        good       1.00      1.00      1.00        22
       unacc       0.99      1.00      1.00       354
       vgood       1.00      1.00      1.00        20

    accuracy                           1.00       519
   macro avg       1.00      1.00      1.00       519
weighted avg       1.00      1.00      1.00       519



## Summary
In conclusion, the Support Vector Machine (SVM) model, optimized with hyperparameters C=100 and gamma=0.1, has demonstrated exceptional performance in classifying the dataset. The model achieved an impressive accuracy of 100%, indicating its robustness in accurately predicting class labels. Furthermore, the precision, recall, and F1-scores for all classes are consistently high, underscoring the model's reliability in handling different categories. With minimal misclassifications and strong overall performance, this SVM model is a compelling choice for this classification task.