In [1]:
# Import all libraries needed

import collections
from sklearn import preprocessing
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import time
from sklearn import metrics
from IPython.display import Markdown, display
import random
import matplotlib.pyplot as plt
from keras.datasets import fashion_mnist

In [2]:
# Show Shapes

(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
print("Shape of x_train: {}".format(x_train.shape))
print("Shape of y_train: {}".format(y_train.shape))
print()
print("Shape of x_test: {}".format(x_test.shape))
print("Shape of y_test: {}".format(y_test.shape))

Shape of x_train: (60000, 28, 28)
Shape of y_train: (60000,)

Shape of x_test: (10000, 28, 28)
Shape of y_test: (10000,)


In [3]:
# Change the Shapes and show it
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255.0
x_test /= 255.0
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1] * x_train.shape[2])
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1] * x_test.shape[2])

print(x_train.shape)
print(x_test.shape)

(60000, 784)
(10000, 784)


In [4]:
# Import specified libraries

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [9]:
# Define parameter grid (ranges)

clf = RandomForestClassifier(random_state=42)


param_grid = {
    
    'n_estimators': range(10,20),
    'max_depth': range(5,10),
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [2, 3],
    'max_features': ['auto', 'sqrt'],
}

In [10]:
# Define GridSearch Search

grid_search = GridSearchCV(
    clf,
    param_grid,
    cv=2,
    verbose=1,   
    scoring='accuracy',
    n_jobs=-1,
)

grid_search.fit(x_train, y_train)

Fitting 2 folds for each of 600 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed: 17.4min finished


GridSearchCV(cv=2, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 10),
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 5],
                         'min_samples_split': [2, 3],
                         'n_estimators': range(10, 20)},
             scoring='accuracy', verbose=1)

In [11]:
# Results

clf = RandomForestClassifier(**grid_search.best_params_, random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred, digits=4))
print('Best Results:\n', grid_search.best_params_)

              precision    recall  f1-score   support

           0     0.7673    0.8210    0.7932      1000
           1     0.9904    0.9330    0.9609      1000
           2     0.7018    0.7060    0.7039      1000
           3     0.7991    0.8950    0.8443      1000
           4     0.6697    0.7990    0.7287      1000
           5     0.9723    0.9130    0.9417      1000
           6     0.6773    0.4450    0.5371      1000
           7     0.8857    0.9070    0.8962      1000
           8     0.9523    0.9580    0.9551      1000
           9     0.8974    0.9360    0.9163      1000

    accuracy                         0.8313     10000
   macro avg     0.8313    0.8313    0.8278     10000
weighted avg     0.8313    0.8313    0.8278     10000

Best Results:
 {'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 19}
