# Car Evaluation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

- Dataset can be found [here](https://archive.ics.uci.edu/dataset/19/car+evaluation)

In [2]:
labels = ['price', 'maintenance', 'doors', 'persons', 'boot_size', 'safety', 'acceptance']
df = pd.read_csv('data\car.data', names=labels)

In [3]:
# Dividing the data into features and labels(s)
X = df.drop('acceptance', axis=1)

In [4]:
X

Unnamed: 0,price,maintenance,doors,persons,boot_size,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med
...,...,...,...,...,...,...
1723,low,low,5more,more,med,med
1724,low,low,5more,more,med,high
1725,low,low,5more,more,big,low
1726,low,low,5more,more,big,med


In [5]:
y = df['acceptance']
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1728 entries, 0 to 1727
Series name: acceptance
Non-Null Count  Dtype 
--------------  ----- 
1728 non-null   object
dtypes: object(1)
memory usage: 13.6+ KB


In [6]:
# Creating dummies for categorical features and labels
X.columns

Index(['price', 'maintenance', 'doors', 'persons', 'boot_size', 'safety'], dtype='object')

In [7]:
X = pd.get_dummies(data=X)

In [8]:
print(y.value_counts())

acceptance
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Naive Bayes, kNN, Logistic Regression, Support Vector Machines (SVM)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [22]:
naive_bayes = GaussianNB()
knn_5 = KNeighborsClassifier(n_neighbors=5)
logRegression = LogisticRegression()
svc = SVC()

classList = [naive_bayes, knn_5, logRegression, svc]

In [23]:
for classifier in classList:
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(" Stats for ", classifier)
    print(classification_report(y_test, y_pred))

 Stats for  GaussianNB()
              precision    recall  f1-score   support

         acc       0.59      0.72      0.65        83
        good       0.35      0.82      0.49        11
       unacc       1.00      0.82      0.90       235
       vgood       0.68      1.00      0.81        17

    accuracy                           0.81       346
   macro avg       0.65      0.84      0.71       346
weighted avg       0.86      0.81      0.82       346

 Stats for  KNeighborsClassifier()
              precision    recall  f1-score   support

         acc       0.78      0.75      0.77        83
        good       0.30      0.27      0.29        11
       unacc       0.92      0.99      0.96       235
       vgood       1.00      0.29      0.45        17

    accuracy                           0.88       346
   macro avg       0.75      0.58      0.62       346
weighted avg       0.87      0.88      0.86       346

 Stats for  LogisticRegression()
              precision    recall  f1

 - Looks like Logistic Regression is the most accurate classifier here.