Python code for classifying plant leaf images with several algorithms (Naive Gaussian Bayes, Logistic Regression, Decision Tree, Random Forest, k-Nearest Neighbors) on the Leafsnap dataset, available at http://leafsnap.com/dataset/. Each image is described by shape (Hu moments), texture (Haralick) and color (HSV histogram) features. The code is based on https://gogul09.github.io/software/image-classification-python

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import mahotas
import h5py
import glob
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Seed shared by the train/test split, the cross-validation folds and the
# seeded classifiers, so that repeated runs give the same results.
seed = 2

# Root folders of the two image sources; each is expected to contain one
# sub-folder per class label.
data_path_field = 'C:/images/field/'
data_path_lab = 'C:/images/lab/'

# One class label per sub-folder of the lab directory. os.listdir returns
# entries in arbitrary order, so they are sorted to keep the sample order (and
# therefore the seeded split) reproducible. Stray files such as thumbnails
# caches are not class folders and are skipped.
train_labels = sorted(
    entry
    for entry in os.listdir(data_path_lab)
    if os.path.isdir(os.path.join(data_path_lab, entry))
)

In [None]:
# Hu-Moments to capture shape.

def fd_hu_moments(image):
    """Return the Hu invariant moments of a BGR image as a flat array.

    The image is converted to grayscale, its image moments are computed on the
    intensity values and the resulting Hu moments are flattened to 1-D.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    raw_moments = cv2.moments(grayscale)
    hu_moments = cv2.HuMoments(raw_moments)
    return hu_moments.flatten()

In [None]:
# Haralick features to capture texture.

def fd_haralick(image):
    """Return the Haralick texture features of a BGR image.

    The features are computed on the grayscale version of the image and
    averaged along the first axis of the array returned by mahotas, giving a
    single 1-D texture descriptor.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    per_direction = mahotas.features.haralick(grayscale)
    return per_direction.mean(axis=0)

In [None]:
# Color histogram to capture color.

def fd_histogram(image, mask=None, bins=8):
    """Return a normalized, flattened 3-D HSV color histogram of a BGR image.

    Parameters
    ----------
    image : BGR image as read by ``cv2.imread``.
    mask : optional 8-bit single-channel array of the same size as ``image``;
        only pixels where the mask is non-zero are counted. ``None`` (the
        default) uses the whole image.
    bins : number of histogram bins per HSV channel (default 8, which yields
        a feature vector of ``bins ** 3`` = 512 values).

    Returns
    -------
    1-D array of length ``bins ** 3``.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # The mask argument is forwarded to calcHist; previously it was accepted
    # by this function but silently ignored.
    # NOTE(review): for 8-bit images OpenCV stores hue in [0, 180), so with a
    # [0, 256) range the highest hue bins stay empty. The range is left as is
    # so that feature vectors remain comparable with earlier runs.
    hist = cv2.calcHist([hsv], [0, 1, 2], mask, [bins, bins, bins],
                        [0, 256, 0, 256, 0, 256])
    # Normalizes in place (dst is the same array as src).
    cv2.normalize(hist, hist)

    return hist.flatten()

In [None]:
%%time
def _extract_features(image_path):
    """Return the shape+texture+color feature vector of one image file.

    Returns None when OpenCV cannot read the file (cv2.imread reports a
    missing or corrupt image by returning None instead of raising).
    """
    image = cv2.imread(image_path)
    if image is None:
        return None
    # Every image is brought to the same size before feature extraction.
    image = cv2.resize(image, (64, 64))
    return np.hstack([fd_hu_moments(image), fd_haralick(image), fd_histogram(image)])


labels = []
features = []

# Both image sources share the same folder-per-label layout, so they are
# processed by the same code.
image_sources = (('lab', data_path_lab), ('field', data_path_field))

# Iterate through every image in every label folder of every source.
for current_label in train_labels:
    for source_name, source_root in image_sources:
        pattern = os.path.join(source_root, str(current_label), '*.jpg')

        # glob returns files in arbitrary order; sort for reproducible rows.
        for file in sorted(glob.glob(pattern)):
            feature = _extract_features(file)
            if feature is None:
                # Skip unreadable files instead of crashing in cv2.resize.
                print('*Skipping unreadable image ' + str(file))
                continue
            features.append(feature)
            labels.append(current_label)

        # Keep track while processing.
        print('*Folder -' + source_name + '- ' + str(current_label) + ' has been processed.')
print('*ALL FOLDERS PROCESSED.')

In [None]:
%%time
# Fail early with a clear message; the scaler cannot be fit on zero samples.
if not features:
    raise ValueError('No image features were extracted; check the data paths.')

# Convert once and reuse for both the shape report and the scaler.
feature_matrix = np.array(features)
print('Feature vector size is {}'.format(feature_matrix.shape))
print('Labels vector size is {}'.format(np.array(labels).shape))


# Transform label names to integers.
label_names = np.unique(labels)
le = LabelEncoder()
label = le.fit_transform(labels)

# Check if transformation was successful.
print ('Labels are {}'.format(label))

# Normalize feature values between 0 and 1.
# NOTE(review): the scaler is fit on the complete data set, i.e. before any
# train/test split is made, so test samples influence the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
normalized_features = scaler.fit_transform(feature_matrix)

# Write h5py files with feature values and labels. The context managers close
# each file even if writing the data set fails.
with h5py.File('output_data.h5', 'w') as h5_data:
    h5_data.create_dataset('dataset_1', data=np.array(normalized_features))

with h5py.File('output_labels.h5', 'w') as h5_label:
    h5_label.create_dataset('dataset_1', data=np.array(label))

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# (display name, estimator) pairs of the classifiers to compare. Estimators
# that accept a random_state are seeded for reproducibility.
models = [
    ('Naive Gaussian Bayes', GaussianNB()),
    ('Logistic Regression', LogisticRegression(random_state=seed)),
    ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=seed)),
    ('KNN', KNeighborsClassifier(n_neighbors=6)),
]

# Read the h5py files back into in-memory arrays. np.array copies the data out
# of the file, so the arrays stay valid after the files are closed; the
# context managers close the files even if reading fails.
with h5py.File('output_data.h5', 'r') as h5_data:
    feature_values = np.array(h5_data['dataset_1'])

with h5py.File('output_labels.h5', 'r') as h5_label:
    label_values = np.array(h5_label['dataset_1'])

In [None]:
# Hold out 20% of the samples for testing; the split is seeded so it is the
# same on every run.
split_parts = train_test_split(
    feature_values,
    label_values,
    test_size=0.2,
    random_state=seed,
)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each part of the split.
print('Data splitted in:')
shape_report = (
    ('Training data: {}', X_train),
    ('Training labels: {}', y_train),
    ('Testing data: {}', X_test),
    ('Testing labels: {}', y_test),
)
for template, part in shape_report:
    print(template.format(part.shape))

In [None]:
%%time
cv_results_list = []
names = []

# random_state only has an effect when shuffle=True; recent scikit-learn
# versions raise a ValueError if it is set while shuffle is left at False.
# With an integer seed every split() call yields the same folds, so a single
# splitter is shared by all models and they are compared on identical folds.
kfold = KFold(n_splits=3, shuffle=True, random_state=seed)

# Evaluate every model by 3-fold cross validation on the training data and
# store the per-fold accuracy scores.
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    cv_results_list.append(cv_results)
    names.append(name)
    print('Cross validation of ' + str(name) + ' algorithm done.')

In [None]:
# Compute mean and standard deviation of the cross-validation scores for each
# model and print the final output.

list_mean = [fold_scores.mean() for fold_scores in cv_results_list]
list_std = [fold_scores.std() for fold_scores in cv_results_list]

# The 1-based row index is derived from the number of evaluated models rather
# than hard-coded, so adding or removing a model does not break the table.
final_list = pd.DataFrame(list(zip(names, list_mean, list_std)),
                          columns=['model', 'score', 'standard deviation'],
                          index=range(1, len(names) + 1))
print(final_list)

In [None]:
# Visualize final output as boxplot: one box per model, built from that
# model's per-fold accuracy scores.

fig, ax = plt.subplots()
fig.suptitle('Accuracy Comparison of Classifier Algorithms')
ax.boxplot(cv_results_list)
ax.set_ylabel('Accuracy')
ax.set_ylim([0, 1])

# boxplot places the boxes at x = 1..N by default; derive the tick positions
# from the number of results instead of hard-coding five models.
tick_positions = list(range(1, len(cv_results_list) + 1))
ax.set_xticks(tick_positions)
ax.set_xticklabels(final_list['model'], fontsize=7, rotation=40)
ax.grid(True)
plt.show()