**Image classification of flowers with Random Forest Classifier**:

Aluno: Gabriel Luiz

Disciplina: Tópico em Aprendizado de Máquina

In [1]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import mahotas
import cv2
import os
import h5py
import glob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# make a fix file size
fixed_size  = tuple((500,500))

#train path
train_path = "./dataset/train"

# no of trees for Random Forests
num_tree = 100

# bins for histograms
bins = 8

# train_test_split size
test_size = 0.10

# seed for reproducing same result
seed = 9


In [3]:
# features description -1:  Hu Moments
# Momentos invariantes de uma imagem
# Usado para descrever os objetos da imagem
def fd_hu_moments(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    feature = cv2.HuMoments(cv2.moments(image)).flatten()
    return feature

In [4]:
# feature-descriptor -2 Haralick Texture
# Descreve as imagens a partir de suas texturas
def fd_haralick(image):
    # conver the image to grayscale
    gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
    # Ccompute the haralick texture fetature ve tor
    haralic = mahotas.features.haralick(gray).mean(axis=0)
    return haralic


In [5]:
# feature-description -3 Color Histogram
# Representa as imagens com a distribuição de cores
def fd_histogram(image, mask=None):
    # conver the image to HSV colors-space
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    #COPUTE THE COLOR HISTPGRAM
    hist  = cv2.calcHist([image],[0,1,2],None,[bins,bins,bins], [0, 256, 0, 256, 0, 256])
    # normalize the histogram
    cv2.normalize(hist,hist)
    # return the histog....
    return hist.flatten()

In [6]:
# get the training data labels
train_labels = os.listdir(train_path)

# sort the training labesl
train_labels.sort()
print(train_labels)

# empty list to hold feature vectors and labels
global_features = []
labels = []

['daisy', 'dandelion', 'rose', 'sunflower', 'tulip']


<h1>loop insise the folder for train images </h1>

In [7]:
# iterate the folder to get the image label name

%time
# lop over the training data sub folder

for training_name in train_labels:
    # join the training data path and each species training folder
    dir = os.path.join(train_path, training_name)

    # get the current training label
    current_label = training_name

    # loop over the images in each sub-folder
    for file in os.listdir(dir):

        file = dir + "/" + os.fsdecode(file)

        # read the image and resize it to a fixed-size
        image = cv2.imread(file)
        if image is not None:
            image = cv2.resize(image,fixed_size)
            fv_hu_moments = fd_hu_moments(image)
            fv_haralick   = fd_haralick(image)
            fv_histogram  = fd_histogram(image)

        # Concatenate global features into a single matrix
        global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments])

        # update the list of labels and feature vectors
        labels.append(current_label)
        global_features.append(global_feature)

    print("[STATUS] processed folder: {}".format(current_label))

print("[STATUS] completed Global Feature Extraction...")


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
[STATUS] processed folder: daisy
[STATUS] processed folder: dandelion
[STATUS] processed folder: rose
[STATUS] processed folder: sunflower
[STATUS] processed folder: tulip
[STATUS] completed Global Feature Extraction...


In [8]:
%time
# get the overall feature vector size
print("[STATUS] feature vector size {}".format(np.array(global_features).shape))

# get the overall training label size
print("[STATUS] training Labels {}".format(np.array(labels).shape))

# encode the target labels
targetNames = np.unique(labels)
le = LabelEncoder()
target = le.fit_transform(labels)
print("[STATUS] training labels encoded...{}")
# normalize the feature vector in the range (0-1)
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_features = scaler.fit_transform(global_features)
print("[STATUS] feature vector normalized...")

print("[STATUS] target labels: {}".format(target))
print("[STATUS] target labels shape: {}".format(target.shape))


CPU times: user 33 µs, sys: 15 µs, total: 48 µs
Wall time: 23.4 µs
[STATUS] feature vector size (4323, 532)
[STATUS] training Labels (4323,)
[STATUS] training labels encoded...{}
[STATUS] feature vector normalized...
[STATUS] target labels: [0 0 0 ... 4 4 4]
[STATUS] target labels shape: (4323,)


In [9]:
# split the training and testing data
(trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal) = train_test_split(np.array(rescaled_features),
                                                                                          np.array(target),
                                                                                          test_size=test_size,
                                                                                          random_state=seed)


<h3>RandomForest</h3>

In [10]:
# create the model - Random Forests
clf  = RandomForestClassifier(n_estimators=num_tree)

# fit the training data to the model
clf.fit(trainDataGlobal, trainLabelsGlobal)
clf_pred = clf.predict(testDataGlobal)
print(classification_report(testLabelsGlobal,clf_pred))
print(confusion_matrix(testLabelsGlobal,clf_pred))

              precision    recall  f1-score   support

           0       0.69      0.64      0.66        75
           1       0.58      0.69      0.63        99
           2       0.73      0.55      0.63        74
           3       0.80      0.84      0.82        80
           4       0.71      0.71      0.71       105

    accuracy                           0.69       433
   macro avg       0.70      0.69      0.69       433
weighted avg       0.70      0.69      0.69       433

[[48 19  2  2  4]
 [17 68  1  9  4]
 [ 2 11 41  1 19]
 [ 1  8  1 67  3]
 [ 2 12 11  5 75]]


In [11]:
parameters={"n_estimators":[10,20,50,100,200],
           "max_depth":[2,3],
           "min_samples_split":[2,3,4],
           "max_features":('auto','log2'),
           "criterion":('gini','entropy')}

grid=GridSearchCV(clf, parameters, cv=5)
grid.fit(trainDataGlobal,trainLabelsGlobal)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ('gini', 'entropy'), 'max_depth': [2, 3],
                         'max_features': ('auto', 'log2'),
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [10, 20, 50, 100, 200]})

In [12]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'auto',
 'min_samples_split': 3,
 'n_estimators': 200}

In [15]:
estimator=grid.best_estimator_
estimator.score(trainDataGlobal,trainLabelsGlobal)

0.5922879177377892

In [14]:

# # path to test data
# test_path = "./dataset/test"
#
# # loop through the test images
# for file in os.listdir(test_path):
#
#     file = test_path + "/" + file
#     #print(file)
#
#     # read the image
#     image = cv2.imread(file)
#
#     # resize the image
#     image = cv2.resize(image, fixed_size)
#
#     # Global Feature extraction
#     fv_hu_moments = fd_hu_moments(image)
#     fv_haralick   = fd_haralick(image)
#     fv_histogram  = fd_histogram(image)
#
#     # Concatenate global features
#
#     global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments])
#
#     # predict label of test image
#     prediction = clf.predict(global_feature.reshape(1,-1))[0]
#
#     # show predicted label on image
#     cv2.putText(image, train_labels[prediction], (20,30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,255,255), 3)
#
#     # display the output image
#     plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
#     plt.show()