In [1]:
# Library imports
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils import shuffle
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import random
random.seed(0)

In [22]:
df = pd.read_csv(
    "car_data.csv",
    header=None,
    names=["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
).drop("persons", axis=1)

In [23]:
df.buying.value_counts()

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64

In [24]:
df_shuffled = shuffle(df).reset_index(drop=True)

In [25]:
X, y = df_shuffled.drop("buying", axis=1), df_shuffled[["buying"]]

In [26]:
X

Unnamed: 0,maint,doors,lug_boot,safety,class
0,high,4,big,med,acc
1,low,3,med,low,unacc
2,vhigh,3,med,med,unacc
3,high,2,small,low,unacc
4,high,2,small,high,acc
...,...,...,...,...,...
1723,high,3,big,low,unacc
1724,vhigh,2,small,high,unacc
1725,med,5more,small,med,acc
1726,med,5more,big,high,unacc


In [41]:
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(drop='first')
X_transformed = one_hot_encoder.fit_transform(X)
y_transformed = label_encoder.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [28]:
y_transformed.shape

(1728,)

In [29]:
X_transformed.shape

(1728, 13)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_transformed, test_size=0.2)

In [31]:
# Naive Bayes
classifier_nb = BernoulliNB()
score_nb = cross_val_score(
    classifier_nb, 
    X_transformed, y_transformed,
    scoring="accuracy"
)
print(score_nb.mean())

classifier_nb.fit(X_train, y_train)
y_pred = classifier_nb.predict(X_test)
print(classification_report(y_test, y_pred))

0.2795291949401022
              precision    recall  f1-score   support

           0       0.22      0.24      0.23        86
           1       0.34      0.20      0.25        74
           2       0.33      0.18      0.24        92
           3       0.28      0.46      0.34        94

    accuracy                           0.28       346
   macro avg       0.29      0.27      0.27       346
weighted avg       0.29      0.28      0.27       346



In [32]:
from sklearn.svm import SVC
classifier_svm = SVC()
score_svm = cross_val_score(
    classifier_svm, 
    X_transformed, y_transformed,
    scoring="accuracy"
)
print(score_svm.mean())

classifier_svm.fit(X_train, y_train)
y_pred = classifier_svm.predict(X_test)
print(classification_report(y_test, y_pred))

0.2355382424394739
              precision    recall  f1-score   support

           0       0.18      0.16      0.17        86
           1       0.29      0.43      0.35        74
           2       0.12      0.02      0.04        92
           3       0.28      0.43      0.34        94

    accuracy                           0.25       346
   macro avg       0.22      0.26      0.22       346
weighted avg       0.22      0.25      0.22       346



In [33]:
# Decision Tree
classifier_dt = DecisionTreeClassifier()
score_dt = cross_val_score(
    classifier_dt, 
    X_transformed, y_transformed,
    scoring="accuracy"
)
print(score_dt.mean())

classifier_dt.fit(X_train, y_train)
y_pred = classifier_dt.predict(X_test)
print(classification_report(y_test, y_pred))

0.10763675965485464
              precision    recall  f1-score   support

           0       0.08      0.15      0.11        86
           1       0.15      0.18      0.16        74
           2       0.02      0.01      0.01        92
           3       0.15      0.09      0.11        94

    accuracy                           0.10       346
   macro avg       0.10      0.11      0.10       346
weighted avg       0.10      0.10      0.09       346



In [34]:
# Random Forest
classifier_rf = RandomForestClassifier()
score_rf = cross_val_score(
    classifier_rf, 
    X_transformed, y_transformed,
    scoring="accuracy"
)
print(score_rf.mean())

classifier_rf.fit(X_train, y_train)
y_pred = classifier_rf.predict(X_test)
print(classification_report(y_test, y_pred))

0.11342715925274356
              precision    recall  f1-score   support

           0       0.08      0.08      0.08        86
           1       0.14      0.18      0.16        74
           2       0.03      0.02      0.03        92
           3       0.12      0.13      0.12        94

    accuracy                           0.10       346
   macro avg       0.09      0.10      0.10       346
weighted avg       0.09      0.10      0.09       346



### Prediction with Naive Bayes Classifier

In [44]:
classifier_nb.fit(X_transformed, y_transformed)
X_prediction = pd.DataFrame({"maint":["high"], "doors":["4"], "lug_boot":["big"], "safety":["high"], "class":["good"]})
X_prediction = one_hot_encoder.transform(X_prediction)
y_prediction = classifier_nb.predict(X_prediction)

In [46]:
label_encoder.inverse_transform(y_prediction)

array(['low'], dtype=object)