In [1]:


from ucimlrepo import fetch_ucirepo  
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split, KFold, cross_val_score
  
# fetch dataset 
car_evaluation = fetch_ucirepo(id=19) 
  
# data (as pandas dataframes) 
X = car_evaluation.data.features 
Y = car_evaluation.data.targets 
  
# metadata 
print(car_evaluation.metadata) 
  
# variable information 
print(car_evaluation.variables) 


{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

In [2]:
Y['class'].unique()


array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [3]:
Y = ((Y['class'] == 'vgood') | (Y['class'] == 'good')).astype(float)
Y[Y == 0] = -1

In [4]:
Y = Y.values.reshape(-1,1)

In [5]:
Y

array([[-1.],
       [-1.],
       [-1.],
       ...,
       [-1.],
       [ 1.],
       [ 1.]], shape=(1728, 1))

In [6]:
X_and_Y = np.hstack((X, Y))     # Stack them together for shuffling.
np.random.seed(1)               # Set the random seed.
np.random.shuffle(X_and_Y)  

print(Y.shape)
print(X.shape)
print(X_and_Y[0])

(1728, 1)
(1728, 6)
['med' 'low' '3' 'more' 'small' 'low' -1.0]


In [7]:
X

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med
...,...,...,...,...,...,...
1723,low,low,5more,more,med,med
1724,low,low,5more,more,med,high
1725,low,low,5more,more,big,low
1726,low,low,5more,more,big,med


In [8]:
X['safety'].unique()
safeties = ['low', 'med', 'high']

In [9]:
from sklearn.preprocessing import OrdinalEncoder

In [10]:
enc = OrdinalEncoder(categories = [safeties])

In [11]:
enc.fit_transform(X[['safety']])

array([[0.],
       [1.],
       [2.],
       ...,
       [0.],
       [1.],
       [2.]], shape=(1728, 1))

In [12]:
X['safety'] = enc.fit_transform(X[['safety']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['safety'] = enc.fit_transform(X[['safety']])


In [13]:
X.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,0.0
1,vhigh,vhigh,2,2,small,1.0
2,vhigh,vhigh,2,2,small,2.0
3,vhigh,vhigh,2,2,med,0.0
4,vhigh,vhigh,2,2,med,1.0
5,vhigh,vhigh,2,2,med,2.0
6,vhigh,vhigh,2,2,big,0.0
7,vhigh,vhigh,2,2,big,1.0
8,vhigh,vhigh,2,2,big,2.0
9,vhigh,vhigh,2,4,small,0.0


In [14]:
X['buying'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [15]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories = [['vhigh', 'high', 'med', 'low']])

In [16]:
X['buying'] = enc.fit_transform(X[['buying']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['buying'] = enc.fit_transform(X[['buying']])


In [17]:
X['maint'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [18]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories = [['vhigh', 'high', 'med', 'low']])
X['maint'] = enc.fit_transform(X[['maint']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['maint'] = enc.fit_transform(X[['maint']])


In [19]:
X['lug_boot'].unique()

array(['small', 'med', 'big'], dtype=object)

In [20]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories = [['small', 'med', 'big']])
X['lug_boot'] = enc.fit_transform(X[['lug_boot']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['lug_boot'] = enc.fit_transform(X[['lug_boot']])


In [21]:
X['doors'].unique()

array(['2', '3', '4', '5more'], dtype=object)

In [22]:
X['doors'].unique()
X['doors'] = X['doors'].replace({'5more': '5'}).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['doors'] = X['doors'].replace({'5more': '5'}).astype(float)


In [23]:
X

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,0.0,0.0,2.0,2,0.0,0.0
1,0.0,0.0,2.0,2,0.0,1.0
2,0.0,0.0,2.0,2,0.0,2.0
3,0.0,0.0,2.0,2,1.0,0.0
4,0.0,0.0,2.0,2,1.0,1.0
...,...,...,...,...,...,...
1723,3.0,3.0,5.0,more,1.0,1.0
1724,3.0,3.0,5.0,more,1.0,2.0
1725,3.0,3.0,5.0,more,2.0,0.0
1726,3.0,3.0,5.0,more,2.0,1.0


In [24]:
X['persons'] = X['persons'].replace({'more' : '6'}).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['persons'] = X['persons'].replace({'more' : '6'}).astype(float)


In [25]:
print(Y.shape)
print(X.shape)

(1728, 1)
(1728, 6)


In [26]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
#import seaborn as sns

partitions = [0.2, 0.5, 0.8]

for partition in partitions:


    for trial in range(3):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=partition, random_state=0)

        D_list = [1, 2, 3, 4, 5]
        param_grid = {'max_depth':D_list}
        estimator = tree.DecisionTreeClassifier(criterion='entropy', random_state=1)

        grid_search = GridSearchCV(estimator, param_grid, cv=10, scoring="accuracy")

        grid_search.fit(X_train, Y_train)
        
        # Get best model and calculate accuracies
        train_acc = grid_search.score(X_train, Y_train)
        val_acc = grid_search.best_score_
        test_acc = grid_search.score(X_test, Y_test)

        y_pred = grid_search.predict(X_test)

        # Print classification report for predictions
        print(classification_report(Y_test, y_pred))
        
        # Print results for this trial directly
        print(f"Partition: {partition*100:.0f}/100, Trial: {trial+1}")
        print(f"Train Accuracy: {train_acc:.4f}")
        print(f"Validation Accuracy: {val_acc:.4f}")
        print(f"Test Accuracy: {test_acc:.4f}")
    

              precision    recall  f1-score   support

        -1.0       0.97      0.99      0.98      1267
         1.0       0.87      0.61      0.72       116

    accuracy                           0.96      1383
   macro avg       0.92      0.80      0.85      1383
weighted avg       0.96      0.96      0.96      1383

Partition: 20/100, Trial: 1
Train Accuracy: 0.9710
Validation Accuracy: 0.9652
Test Accuracy: 0.9595
              precision    recall  f1-score   support

        -1.0       0.97      0.99      0.98      1267
         1.0       0.87      0.61      0.72       116

    accuracy                           0.96      1383
   macro avg       0.92      0.80      0.85      1383
weighted avg       0.96      0.96      0.96      1383

Partition: 20/100, Trial: 2
Train Accuracy: 0.9710
Validation Accuracy: 0.9652
Test Accuracy: 0.9595
              precision    recall  f1-score   support

        -1.0       0.97      0.99      0.98      1267
         1.0       0.87      0.61  