# Car Evaluation

Dataset: http://archive.ics.uci.edu/ml/datasets/Car+Evaluation

This is a classification problem. We want to classify the car into 4 classes, which are unacc, acc, good, and vgood.
I approached decision tree, logistic regression, KNN, Navie base, and SVM classification to build the models.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [3]:
df = pd.read_csv('car.data', header = None)
col_names =['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.columns = col_names

In [4]:
# check NA and datatype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [5]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [6]:
# count different classes. It shows that it is not imbalanced data
df.groupby('class')['class'].count()

class
acc       384
good       69
unacc    1210
vgood      65
Name: class, dtype: int64

In [7]:
X = df.iloc[:,0:6]
y = df['class']

Each x varable is ordinal data. Even though there seems to be a order in the class, for example ['low','med','high','vhigh']. However, since I can't define the distance between each class, I decided to turn into categorical datatype.

In [8]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
X_ohe = ohe.fit_transform(X).toarray()
X = pd.DataFrame(X_ohe)

#from sklearn.preprocessing import LabelEncoder
#labelencoder = LabelEncoder()
#y = labelencoder.fit_transform(y)

## Nested Grid Search CV

To find the best model for this dataset among all 5 models.

In [9]:
i = 42
inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
scoring = 'accuracy'

In [10]:
dt = DecisionTreeClassifier()
dt_grid = {'max_depth':list(range(2,10)), 'min_samples_leaf':list(range(1,5))}
lr = LogisticRegression(solver='liblinear')
lr_grid = {'C':[0.01, 0.1, 0.5, 1, 10, 100], 'penalty':['l1','l2','elastinet']}
knn =  KNeighborsClassifier()
knn_grid = {'n_neighbors':list(range(2,5))}
gnb = GaussianNB()
gnb_grid = {}
svm = SVC()
svm_grid = {'kernel':['rbf'],'C':[1,10,100],'gamma':[.1,.5,1]} 

In [11]:
dt_clf = GridSearchCV(estimator=dt, param_grid=dt_grid, scoring = scoring, cv=inner_cv)
lr_clf = GridSearchCV(estimator=lr, param_grid=lr_grid, scoring = scoring, cv=inner_cv)
knn_clf = GridSearchCV(estimator=knn, param_grid=knn_grid, scoring = scoring, cv=inner_cv)
gnb_clf = GridSearchCV(estimator=gnb, param_grid=gnb_grid, scoring = scoring, cv=inner_cv)
svm_clf = GridSearchCV(estimator=svm, param_grid=svm_grid, scoring = scoring, cv=inner_cv)

In [12]:
dt_score = cross_val_score(dt_clf, X=X, y=y, cv=outer_cv)
lr_score = cross_val_score(lr_clf, X=X, y=y, cv=outer_cv)
knn_score = cross_val_score(knn_clf, X=X, y=y, cv=outer_cv)
gnb_score = cross_val_score(gnb_clf, X=X, y=y, cv=outer_cv)
svm_score = cross_val_score(svm_clf, X=X, y=y, cv=outer_cv)

In [13]:
score = {}
score['dt'] = dt_score.mean()
score['lr'] = lr_score.mean()
score['knn'] = knn_score.mean()
score['gnb'] = gnb_score.mean()
score['svm'] = svm_score.mean()
score

{'dt': 0.9479166666666667,
 'lr': 0.8981481481481481,
 'knn': 0.8304398148148149,
 'gnb': 0.8026620370370371,
 'svm': 0.9942129629629629}

## Final model

SVM model performs the best. I used it as my final model to further know the hyperparameter and performance matrix.

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=45)

In [15]:
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)
print('best params: ', svm_clf.best_params_)
print('best score: ', svm_clf.best_score_)
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

best params:  {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
best score:  0.9905922761162772
[[ 74   0   0   0]
 [  0  15   0   0]
 [  2   0 241   0]
 [  0   0   0  14]]
              precision    recall  f1-score   support

         acc       0.97      1.00      0.99        74
        good       1.00      1.00      1.00        15
       unacc       1.00      0.99      1.00       243
       vgood       1.00      1.00      1.00        14

    accuracy                           0.99       346
   macro avg       0.99      1.00      1.00       346
weighted avg       0.99      0.99      0.99       346

