In [26]:
## scikit-learn's DecisionTreeClassifier example with gini index and default parameters

# import required packages/functions

import warnings
warnings.filterwarnings('ignore')
import category_encoders as ce
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree
from graphviz import Source
import graphviz

# load the data into dataframe
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data (1).csv')

# get feature vector: every column except the 'class' label
X = df.drop(['class'], axis=1)

# get target variable
y = df['class']

# split the data into training and testing set
# (fixed random_state so the split is repeatable across runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# encoding variables: the encoder is fitted on the training split only and
# then applied unchanged to the test split
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])

X_train = encoder.fit_transform(X_train)

X_test = encoder.transform(X_test)

# train the model (all constructor parameters left at their defaults)
clf_gini = DecisionTreeClassifier()
clf_gini.fit(X_train, y_train)

# predicting output for test data
y_pred_gini = clf_gini.predict(X_test)


# confusion matrix and f1 score
cm = confusion_matrix(y_test, y_pred_gini)
print('Confusion matrix\n', cm)
print('\nF1 score\n', classification_report(y_test, y_pred_gini))

# plotting the tree
# class_names must hold ONE name per class, in ascending class order, because
# the exporter looks names up by class position. Passing y_train (one label
# per training row) made that lookup hit arbitrary rows of the shuffled
# Series, so the names are taken from the fitted model's classes_ instead.
dot_data = tree.export_graphviz(clf_gini, out_file=None,
                              feature_names=list(X_train.columns),
                              class_names=[str(label) for label in clf_gini.classes_],
                              filled=True, rounded=True,
                              special_characters=True)

graph = graphviz.Source(dot_data)

# no filename is given, so the rendered file lands at my/Source.gv.pdf
graph.render(directory='my')

Confusion matrix
 [[110  12   7   0]
 [  0  20   0   0]
 [  6   0 391   0]
 [  3   0   0  22]]

F1 score
               precision    recall  f1-score   support

         acc       0.92      0.85      0.89       129
        good       0.62      1.00      0.77        20
       unacc       0.98      0.98      0.98       397
       vgood       1.00      0.88      0.94        25

    accuracy                           0.95       571
   macro avg       0.88      0.93      0.89       571
weighted avg       0.96      0.95      0.95       571



'my/Source.gv.pdf'

In [28]:
## Varying parameters of DecisionTreeClassifier

# train the model
# NOTE: the variable keeps the name clf_gini although this tree uses the
# entropy criterion; it rebinds (replaces) any model previously stored
# under that name.
clf_gini = DecisionTreeClassifier(criterion='entropy', max_depth=6)
clf_gini.fit(X_train, y_train)

# predicting output for test data
y_pred_gini = clf_gini.predict(X_test)


# confusion matrix and f1 score
cm = confusion_matrix(y_test, y_pred_gini)
print('Confusion matrix\n', cm)
print('\nF1 score\n', classification_report(y_test, y_pred_gini))

# plotting the tree
# class_names must hold ONE name per class, in ascending class order, because
# the exporter looks names up by class position. Passing y_train (one label
# per training row) made that lookup hit arbitrary rows of the shuffled
# Series, so the names are taken from the fitted model's classes_ instead.
dot_data = tree.export_graphviz(clf_gini, out_file=None,
                              feature_names=list(X_train.columns),
                              class_names=[str(label) for label in clf_gini.classes_],
                              filled=True, rounded=True,
                              special_characters=True)

graph = graphviz.Source(dot_data)

# no filename is given, so this writes my/Source.gv.pdf and replaces any
# file already rendered at that path
graph.render(directory='my')

Confusion matrix
 [[ 92   8  29   0]
 [  5  15   0   0]
 [  6   0 391   0]
 [  3   9   0  13]]

F1 score
               precision    recall  f1-score   support

         acc       0.87      0.71      0.78       129
        good       0.47      0.75      0.58        20
       unacc       0.93      0.98      0.96       397
       vgood       1.00      0.52      0.68        25

    accuracy                           0.89       571
   macro avg       0.82      0.74      0.75       571
weighted avg       0.90      0.89      0.89       571



'my/Source.gv.pdf'