# Image Classification using `sklearn.svm`

## Libraries

In [1]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm, metrics, datasets
from sklearn.utils import Bunch
from sklearn.model_selection import GridSearchCV, train_test_split

from skimage.io import imread
from skimage.transform import resize

import time
import datetime

# My addition to the original code
import os
from skimage.feature import hog
from random import randint

import os
from skimage.util import img_as_ubyte
from skimage import io
from skimage.filters import threshold_otsu


In [2]:
# Start point: remember when the run began so that the final cell can
# compute the total elapsed time from `start_time`.
start_time = time.time()
print("Start time: ", datetime.datetime.now())

Start time:  2019-01-03 17:22:52.193112


## Preprocessing

In [3]:
def HOG(image):
    """
    Compute the flattened HOG descriptor of a single-channel image.

    Parameters
    ----------
    image : array-like
        Grayscale image (two spatial dimensions, no channel axis).

    Returns
    -------
    The value returned by skimage.feature.hog with feature_vector=True,
    using 8 orientation bins, 4x4-pixel cells, 2x2-cell blocks and
    'L2-Hys' block normalisation.

    Raises
    ------
    ValueError
        If the image has more than two dimensions.
    """
    # The `multichannel` keyword of skimage.feature.hog was deprecated and
    # then removed, so passing it breaks on recent scikit-image releases.
    # The grayscale-only contract it expressed is enforced here instead,
    # which behaves the same way on every library version.
    if np.ndim(image) > 2:
        raise ValueError("HOG() expects a single-channel (2-D) image, "
                         "got {} dimensions".format(np.ndim(image)))

    return hog(image,
               orientations=8,
               pixels_per_cell=(4, 4),
               cells_per_block=(2, 2),
               block_norm='L2-Hys',
               feature_vector=True)

In [4]:
# binarization using Otsu's method
def binarize(inp_image):
    """
    Turn a grayscale image into a two-level image using Otsu's threshold.

    Pixels strictly above the Otsu threshold form the foreground mask; the
    boolean mask is then converted with img_as_ubyte before being returned.
    """
    otsu_level = threshold_otsu(inp_image)
    return img_as_ubyte(inp_image > otsu_level)

In [5]:
# Names needed for the (optional) morphological filtering step.

from skimage.morphology import erosion, dilation, opening, closing, white_tophat
from skimage.morphology import black_tophat, skeletonize, convex_hull_image
from skimage.morphology import disk, square, diamond

# Disk-shaped structuring element of radius 1.
# NOTE(review): `selem` is not referenced by any other cell visible in this
# notebook — confirm whether it (and the imports above) are still needed.
selem = disk(1)

## Load images in structured directory

In [6]:
# Function that builds the pairs of signatures on which the classifier is going to work.

def _draw_unique_pairs(indexes, xn):
    """Draw xn distinct ordered pairs made of two different entries of indexes."""
    available = len(indexes) * (len(indexes) - 1)  # number of ordered pairs without repetition
    if xn > available:
        # Without this guard the rejection sampling below would never terminate.
        raise ValueError(
            "Cannot build {} distinct pairs out of {} signatures "
            "(only {} ordered pairs exist)".format(xn, len(indexes), available))

    last = len(indexes) - 1
    seen = set()
    pairs = []
    while len(pairs) < xn:
        pair = (indexes[randint(0, last)], indexes[randint(0, last)])
        # Reject a signature paired with itself and any pair already drawn.
        if pair[0] == pair[1] or pair in seen:
            continue
        seen.add(pair)
        pairs.append(pair)
    return pairs


def match_2(flat_data, target, xn):
    """
    Build random pairs of signatures belonging to the same class.

    Parameters

    - flat_data: a list of ndarray containing the flattened information coming from
      the hog on the signature image.
    - target: a list of strings. Each component can be 'Genuine' or 'Forgery'.
    - xn: number of pairs to create for each of the two classes.

    Returns

    A Bunch divided in:
    - data: a list of ndarray, each one originating from the concatenation of 2 arrays of the input flat_data.
    - target: a list of strings, each one deriving from the concatenation (space separated)
      of the 2 corresponding strings of the input target list.

    Raises

    ValueError if a class does not contain enough signatures to form xn distinct pairs.

    -------------------------------------------------------

    Rule of matching:
    - The first xn pairs are composed of 2 different genuine signatures randomly chosen.
    - The latter xn pairs are composed of 2 different forgery signatures randomly chosen.
    - Pairs are ordered: (a, b) and (b, a) count as different pairs, but the same
      ordered pair is never produced twice within a class.
    """
    g_indexes = [ind for ind, t in enumerate(target) if t == 'Genuine']
    f_indexes = [ind for ind, t in enumerate(target) if t == 'Forgery']

    m_flat_data = []
    m_target = []
    # Genuine - Genuine pairs first, then Forgery - Forgery pairs.
    for indexes in (g_indexes, f_indexes):
        for first, second in _draw_unique_pairs(indexes, xn):
            m_flat_data.append(np.concatenate((flat_data[first], flat_data[second])))
            m_target.append(target[first] + ' ' + target[second])

    return Bunch(data=m_flat_data,
                 target=m_target
                )


In [None]:
def _draw_unique_quads(indexes, xn):
    """Draw xn distinct ordered 4-tuples made of four different entries of indexes."""
    count = len(indexes)
    # Ordered selections of 4 out of count; evaluates to 0 whenever count < 4.
    available = count * (count - 1) * (count - 2) * (count - 3)
    if xn > available:
        # Without this guard the rejection sampling below would never terminate.
        raise ValueError(
            "Cannot build {} distinct 4-tuples out of {} signatures "
            "(only {} ordered 4-tuples exist)".format(xn, count, available))

    last = count - 1
    seen = set()
    quads = []
    while len(quads) < xn:
        quad = tuple(indexes[randint(0, last)] for _ in range(4))
        # Reject tuples that repeat a signature and tuples already drawn.
        if len(set(quad)) < 4 or quad in seen:
            continue
        seen.add(quad)
        quads.append(quad)
    return quads


def match_4(flat_data, target, xn):
    """
    Build random 4-tuples of signatures belonging to the same class.

    Parameters

    - flat_data: a list of ndarray containing the flattened information coming from
      the hog on the signature image.
    - target: a list of strings. Each component can be 'Genuine' or 'Forgery'.
    - xn: number of 4-tuples to create for each of the two classes.

    Returns

    A Bunch divided in:
    - data: a list of ndarray, each one originating from the concatenation of 4 arrays of the input flat_data.
    - target: a list of strings, each one deriving from the concatenation (space separated)
      of the 4 corresponding strings of the input target list.

    Raises

    ValueError if a class does not contain enough signatures to form xn distinct 4-tuples
    (in particular when it holds fewer than 4 signatures and xn > 0).

    -------------------------------------------------------

    Rule of matching:
    - The first xn tuples are composed of 4 different genuine signatures randomly chosen.
    - The latter xn tuples are composed of 4 different forgery signatures randomly chosen.
    - Tuples are ordered: two tuples holding the same signatures in a different order
      count as different, but the same ordered tuple is never produced twice within a class.
    """
    g_indexes = [ind for ind, t in enumerate(target) if t == 'Genuine']
    f_indexes = [ind for ind, t in enumerate(target) if t == 'Forgery']

    m_flat_data = []
    m_target = []
    # GGGG tuples first, then FFFF tuples.
    for indexes in (g_indexes, f_indexes):
        for quad in _draw_unique_quads(indexes, xn):
            m_flat_data.append(np.concatenate([flat_data[ind] for ind in quad]))
            m_target.append(' '.join(target[ind] for ind in quad))

    return Bunch(data=m_flat_data,
                 target=m_target
                )

In [7]:
def load_image_files(container_path, dimension=(100, 144), n_matches=5): # height x length
    """
    Load a signature database and turn it into a dataset of signature pairs.

    The expected layout is one sub-folder per subject inside container_path,
    each holding one sub-folder per category ('Genuine', 'Forgery'); any other
    sub-folder (e.g. 'Disguised') is ignored. Every image is read in grayscale,
    binarized, resized to `dimension` and described by its HOG vector; the
    vectors of each subject are then combined by match_2.

    Parameters
    ----------
    container_path : string or unicode
        Path to the main folder holding one subfolder per subject
    dimension : tuple
        size (height, length) to which images are adjusted
    n_matches : int
        number of pairs that match_2 creates per class for every subject

    Returns
    -------
    Bunch
        data : array built from the concatenated HOG vectors of each pair
        target : array with the label of each pair, as produced by match_2
        target_names : the per-image categories ['Genuine', 'Forgery']
        DESCR : short description of the dataset
    """
    image_dir = Path(container_path)
    # Sorted so that the order of the dataset does not depend on the order in
    # which the file system happens to list the directory entries.
    subjects = sorted(directory for directory in image_dir.iterdir() if directory.is_dir())
    categories = ['Genuine', 'Forgery']

    descr = "An image classification dataset"
    flat_data = []
    target = []

    for subject in subjects:
        # Features and labels of the current subject only; pairs are never
        # built across different subjects.
        subj_flat_data = []
        subj_target = []
        folders = sorted(sub_directory for sub_directory in subject.iterdir() if sub_directory.is_dir())
        for sub_dir in folders:
            if sub_dir.name not in categories: # Skip the Disguised folder
                continue
            for file in sorted(sub_dir.iterdir()):
                img_gray = img_as_ubyte(io.imread(file, as_gray=True)) # load the images in grayscale
                img_bin = binarize(img_gray)
                # NOTE: resize() produces a dtype('float64') while imread() and binarize() return dtype('uint8')
                # (original author's note: the opening filter gave bad results, so it is not applied)
                img_resized = resize(img_bin, dimension, anti_aliasing=True, mode='reflect')

                subj_flat_data.append(HOG(img_resized))
                subj_target.append(sub_dir.name)

        subj_dataset = match_2(subj_flat_data, subj_target, n_matches)
        flat_data += subj_dataset.data
        target += subj_dataset.target

    flat_data = np.array(flat_data)
    target = np.array(target)

    # NOTE(review): target_names lists the per-image categories, whereas the
    # entries of `target` are pair labels built by match_2 — confirm that no
    # caller expects the two to line up.
    return Bunch(data=flat_data,
                 target=target,
                 target_names=categories,
                 DESCR=descr)

In [8]:
# The database folder must be located in the current working directory
# (normally the folder containing this notebook): the path passed to
# load_image_files() is built from os.getcwd() and db_folder.

db_folder='SignUniPD_anonymised' # Modify with the name of the folder containing the database.
image_dataset = load_image_files(os.path.join(os.getcwd(),db_folder))

print("----Dataset Loaded----")

### Split data into Training Set and Test Set

In [9]:
# Hold out 30% of the samples for testing. The fixed random_state makes the
# shuffle repeatable for a given dataset.
X_train, X_test, y_train, y_test = train_test_split(
    image_dataset.data, image_dataset.target, test_size=0.3,random_state=109)

print("----Dataset Splitted----")

In [None]:
# Shapes of the feature matrices ...
print("Training Set shape: {}".format(X_train.shape))
print("Test Set shape: {}".format(X_test.shape))

# ... and of the matching label vectors (labelled separately so that the
# four printed lines can be told apart).
print("Training Labels shape: {}".format(y_train.shape))
print("Test Labels shape: {}".format(y_test.shape))

### Train data with parameter optimization

In [10]:
print("----Training----")

# Hyper-parameter grid: a linear kernel with four values of C, and an RBF
# kernel with the same four values of C combined with two values of gamma.
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
svc = svm.SVC()
# Exhaustive search over param_grid, each candidate scored with 3-fold
# cross-validation on the training set.
clf = GridSearchCV(svc, param_grid, cv=3)
clf.fit(X_train, y_train)

print("----Training Ended----")



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

### Predict

In [11]:
# Predict the label of every held-out sample with the fitted grid search.
y_pred = clf.predict(X_test)

### Report

In [12]:
# Print the repr of the fitted search object followed by the per-class
# precision / recall / F1 computed on the held-out samples.
print("Classification report for - \n{}:\n{}\n".format(
    clf, metrics.classification_report(y_test, y_pred)))

Classification report for - 
GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0):
                 precision    recall  f1-score   support

Genuine Forgery       0.65      0.75      0.70       235
Genuine Genuine       0.71      0.61      0.66       239

      micro avg       0.68      0.68      0.68       474
      macro avg       0.68      0.68      0.68       474
   weighted avg       0.68      0.68      0.68       474




In [13]:
# End point: measure the time elapsed since `start_time` was recorded in the
# first timing cell of the notebook.
end_time = time.time()

# Elapsed wall-clock time in seconds.
uptime = end_time - start_time

# Same duration as a timedelta, which prints as H:MM:SS.
human_uptime = datetime.timedelta(seconds=uptime)

print("End time: ", datetime.datetime.now())
print("Uptime :" ,human_uptime)

Start time:  2019-01-03 21:09:53.882319
End time:  2019-01-03 21:09:53.884219
Uptime : 3:47:01.685897
