In [17]:
# Import all libraries needed

import collections
from sklearn import preprocessing
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import time
from sklearn import metrics
from IPython.display import Markdown, display
import random
import matplotlib.pyplot as plt
from keras.datasets import fashion_mnist

In [18]:
# Show Shapes

(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
print("Shape of x_train: {}".format(x_train.shape))
print("Shape of y_train: {}".format(y_train.shape))
print()
print("Shape of x_test: {}".format(x_test.shape))
print("Shape of y_test: {}".format(y_test.shape))

Shape of x_train: (60000, 28, 28)
Shape of y_train: (60000,)

Shape of x_test: (10000, 28, 28)
Shape of y_test: (10000,)


In [19]:
# Change the Shapes and show it
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255.0
x_test /= 255.0
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1] * x_train.shape[2])
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1] * x_test.shape[2])

print(x_train.shape)
print(x_test.shape)

(60000, 784)
(10000, 784)


In [42]:
# Import specified libraries

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

In [43]:
# Define parameter grid (ranges)

clf = DecisionTreeClassifier(random_state=42)


param_grid = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth': [ 50, 60, 70, 80, 90, 100],
    'min_samples_leaf': [1, 20, 30, 40, 4, 6, 8, 10, 15, 20, 30],
    'min_samples_split': [2, 5, 10, 15, 20, 25, 30, 40],
    'max_features': ['auto', 'sqrt'],
}

In [44]:
# Define Randomized Search

rand_search = RandomizedSearchCV(
    clf,
    param_grid,
    cv=2,
    verbose=2,   
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)

rand_search.fit(x_train, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    3.0s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    4.5s finished


RandomizedSearchCV(cv=2, estimator=DecisionTreeClassifier(random_state=42),
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [50, 60, 70, 80, 90, 100],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 20, 30, 40, 4,
                                                             6, 8, 10, 15, 20,
                                                             30],
                                        'min_samples_split': [2, 5, 10, 15, 20,
                                                              25, 30, 40],
                                        'splitter': ['best', 'random']},
                   random_state=42, scoring='accuracy', verbose=2)

In [45]:
# Results

clf = DecisionTreeClassifier(**rand_search.best_params_, random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred, digits=4))
print('Best Results:\n', rand_search.best_params_)

              precision    recall  f1-score   support

           0     0.7614    0.7530    0.7572      1000
           1     0.9453    0.9500    0.9476      1000
           2     0.6644    0.6970    0.6803      1000
           3     0.7946    0.8280    0.8110      1000
           4     0.6582    0.6490    0.6536      1000
           5     0.8978    0.8870    0.8924      1000
           6     0.5461    0.5090    0.5269      1000
           7     0.8542    0.8790    0.8664      1000
           8     0.9192    0.8990    0.9090      1000
           9     0.9062    0.9080    0.9071      1000

    accuracy                         0.7959     10000
   macro avg     0.7947    0.7959    0.7951     10000
weighted avg     0.7947    0.7959    0.7951     10000

Best Results:
 {'splitter': 'best', 'min_samples_split': 30, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 90, 'criterion': 'entropy'}
