In [636]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

In [None]:
# Load the data
df = pd.read_csv('data/car_evaluation.csv')
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,HIGH,high,5more,2,med,LOW,unacc
1,high,med,3,more,BIG,LOW,unacc
2,vhigh,MED,3,more,small,med,unacc
3,vhigh,med,4,2,small,LOW,unacc
4,,,,,,,
...,...,...,...,...,...,...,...
2068,LOW,high,3,2,SMALL,low,unacc
2069,HIGH,high,4,2,big,med,UNACC
2070,0,0,,,0,0,
2071,high,high,3,4,MED,high,acc


In [638]:
# Convert all columns to lowercase
df = df.map(lambda s:s.lower() if type(s) == str else s)

In [639]:
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,high,high,5more,2,med,low,unacc
1,high,med,3,more,big,low,unacc
2,vhigh,med,3,more,small,med,unacc
3,vhigh,med,4,2,small,low,unacc
4,,,,,,,
...,...,...,...,...,...,...,...
2068,low,high,3,2,small,low,unacc
2069,high,high,4,2,big,med,unacc
2070,0,0,,,0,0,
2071,high,high,3,4,med,high,acc


In [640]:
# Replace 'more' and '5more' with a high number (e.g., 5)
df.replace(['more', '5more'], 5, inplace=True)
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,high,high,5,2,med,low,unacc
1,high,med,3,5,big,low,unacc
2,vhigh,med,3,5,small,med,unacc
3,vhigh,med,4,2,small,low,unacc
4,,,,,,,
...,...,...,...,...,...,...,...
2068,low,high,3,2,small,low,unacc
2069,high,high,4,2,big,med,unacc
2070,0,0,,,0,0,
2071,high,high,3,4,med,high,acc


In [641]:
# Convert 'doors' and 'persons' to numeric
df[['doors', 'persons']] = df[['doors', 'persons']].apply(pd.to_numeric)
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,high,high,5.0,2.0,med,low,unacc
1,high,med,3.0,5.0,big,low,unacc
2,vhigh,med,3.0,5.0,small,med,unacc
3,vhigh,med,4.0,2.0,small,low,unacc
4,,,,,,,
...,...,...,...,...,...,...,...
2068,low,high,3.0,2.0,small,low,unacc
2069,high,high,4.0,2.0,big,med,unacc
2070,0,0,,,0,0,
2071,high,high,3.0,4.0,med,high,acc


In [642]:
# Replace '0' with 'NaN'
df.replace('0', np.nan, inplace=True)
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,high,high,5.0,2.0,med,low,unacc
1,high,med,3.0,5.0,big,low,unacc
2,vhigh,med,3.0,5.0,small,med,unacc
3,vhigh,med,4.0,2.0,small,low,unacc
4,,,,,,,
...,...,...,...,...,...,...,...
2068,low,high,3.0,2.0,small,low,unacc
2069,high,high,4.0,2.0,big,med,unacc
2070,,,,,,,
2071,high,high,3.0,4.0,med,high,acc


In [643]:
# Drop the rows where all values are 'NaN'
df.dropna(how='all', inplace=True)
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,high,high,5.0,2.0,med,low,unacc
1,high,med,3.0,5.0,big,low,unacc
2,vhigh,med,3.0,5.0,small,med,unacc
3,vhigh,med,4.0,2.0,small,low,unacc
5,high,med,5.0,2.0,med,high,unacc
...,...,...,...,...,...,...,...
2067,med,med,4.0,2.0,big,high,unacc
2068,low,high,3.0,2.0,small,low,unacc
2069,high,high,4.0,2.0,big,med,unacc
2071,high,high,3.0,4.0,med,high,acc


In [644]:
# Convert categorical data to numerical data
le = LabelEncoder()
df = df.apply(le.fit_transform)
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,0,0,4,1,1,1,2
1,0,2,2,3,0,1,2
2,3,2,2,3,2,2,2
3,3,2,3,1,2,1,2
5,0,2,4,1,1,0,2
...,...,...,...,...,...,...,...
2067,2,2,3,1,0,0,2
2068,1,0,2,1,2,1,2
2069,0,0,3,1,0,2,2
2071,0,0,2,2,1,0,0


In [673]:
# Split the data into features and target variable
X = df.drop('class', axis=1)
Y = df['class']

In [671]:
# Split the dataset into a training set and a test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [669]:
# Create the Decision Tree classifier
clfr = DecisionTreeClassifier()

In [667]:
# Train the model
clfr.fit(X_train, Y_train)

In [649]:
# Make predictions on the test set
Y_prediction = clfr.predict(X_test)

In [675]:
# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_prediction))
print('Confusion Matrix:')
print(confusion_matrix(Y_test, Y_prediction))
print("Classification Report:")
print(classification_report(Y_test,Y_prediction))

Accuracy: 0.9763779527559056
Confusion Matrix:
[[ 86   1   1   2   0]
 [  0  11   1   0   0]
 [  3   0 246   0   0]
 [  1   0   0  12   0]
 [  0   0   0   0  17]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        90
           1       0.92      0.92      0.92        12
           2       0.99      0.99      0.99       249
           3       0.86      0.92      0.89        13
           4       1.00      1.00      1.00        17

    accuracy                           0.98       381
   macro avg       0.94      0.96      0.95       381
weighted avg       0.98      0.98      0.98       381

