# Image Classification using `sklearn.svm`

## Libraries

In [1]:
from pathlib import Path
import numpy as np
from sklearn.utils import Bunch

from skimage.transform import resize

import time
import datetime

# My addition to the original code
import os
from skimage.feature import hog

from skimage.util import img_as_ubyte
from skimage import io
from skimage.filters import threshold_otsu

import pickle

In [2]:
# Start point 1
start_time = time.time()
print("Start time: ", datetime.datetime.now())

Start time:  2019-01-13 19:07:23.410281


## Preprocessing

In [3]:
def HOG(image):
    img = hog(image,
              orientations=8,
              pixels_per_cell=(4, 4),
              cells_per_block=(2, 2),
              block_norm='L2-Hys',
              multichannel = False,
              feature_vector = True)
    return img

In [4]:
# binarization using Otsu's method
def binarize(inp_image):
    thresh = threshold_otsu(inp_image)
    binary_thresh_img = inp_image > thresh

    img_binary = img_as_ubyte(binary_thresh_img)

    return img_binary

## Load images in structured directory

In [5]:
# n fixed genuine signature + 1 genuine or forgery signature

def match_images(flat_data, target, n):
    """
    Parameters
    
    - flat_data: a list of ndarray containing the flattern information coming from 
      the hog on the signature image.
    - target: a list of strings. Each component can be 'Genuine' of 'Forgery'.
    - n: number of fixed genuine signtures in each match. 
   
    Returns
    
    A dataset divided in:
    - data: a list of 20-n ndarray, each one originating from the concatenation of n+1 array of the input flat_data.
    - target: a list of 20-n strings, each one deriving from the concatenation of n+1 strings of input target list.
    """
      
    m_flat_data=[]
    m_target=[]
    g_indexes=[ind for ind, t in enumerate(target) if t=='Genuine']
    f_indexes=[ind for ind, t in enumerate(target) if t=='Forgery']
    fixed_g = flat_data[g_indexes[0]]
    
    for i in range(1,n):
        fixed_g = np.concatenate((fixed_g, flat_data[g_indexes[i]]))
    for i in range(n,10):
        m_flat_data.append(np.concatenate((fixed_g,flat_data[g_indexes[i]])))
        m_target.append('Genuine '*n + target[g_indexes[i]])

    for i in range(0,10):
        m_flat_data.append(np.concatenate((fixed_g,flat_data[f_indexes[i]])))
        m_target.append('Genuine '*n+target[f_indexes[i]])

    return Bunch(data=m_flat_data,
                 target=m_target
                )

In [6]:
def load_image_files(container_path, dimension=(100, 144)): # height x lenght
    """
    Load image files with categories as subfolder names
    which performs like scikit-learn sample dataset

    Parameters
    ----------
    container_path : string or unicode
        Path to the main folder holding one subfolder per category
    dimension : tuple
        size to which image are adjusted to

    Returns
    -------
    Bunch
    """
    image_dir = Path(container_path)
    subj = [directory for directory in image_dir.iterdir() if directory.is_dir()]
    categories = ['Genuine', 'Forgery']

    flat_data = []
    target = []

    for n, s in enumerate(subj):
        folders = [sub_directory for sub_directory in s.iterdir() if sub_directory.is_dir()]
        temp_flat_data=[] # Temporary list of flat_data used to store the information of the current subject
                          # and reinitialized as empty list for each subject
        temp_images=[] # Temporary list of images
        temp_target=[] # Temporary list of target
        for sub_dir in folders:
            if sub_dir.name in categories: # Skip the Disguised folder
                for file in sub_dir.iterdir():
                    img_gray = img_as_ubyte(io.imread(file, as_gray=True)) # load the images in grayscale
                    img_bin = binarize(img_gray)
                    img_resized = resize(img_bin, dimension, anti_aliasing=True, mode='reflect')

                    temp_flat_data.append(HOG(img_resized))
    
                    # NOTE: resize() produces a dtype('float64') while imread() and binarize() return dtype('uint8')
                    # the opening filter is bad

                    temp_target.append(sub_dir.name)

        subj_dataset = match_images(temp_flat_data, temp_target, 2) # 2 fixed signatures for each person
        
        flat_data += subj_dataset.data
        target += subj_dataset.target

    flat_data = np.array(flat_data)
    target = np.array(target)

    return Bunch(data=flat_data,
                 target=target,
                 target_names=categories)

In [7]:
# To let the code run it is necessary to put this script in the same folder which contains the database folder

db_folder='SignUniPD_anonymised' # Modify with the name of the folder containing the dataset.
image_dataset = load_image_files(os.path.join(os.getcwd(),db_folder))

print("----Dataset Loaded----")

----Dataset Loaded----


## Creation of the pickle

In [8]:
# Creation of the pickle which is goig to store the loaded Dataset.

pickle_out = open("Signature_Dataset_HOG_V1.pickle","wb") #Modify with the name of the dataset you're saving
pickle.dump(image_dataset, pickle_out)
pickle_out.close()

print("----Pickle created----")

----Pickle created----


In [9]:
#End point 1
end_time = time.time()

uptime = end_time - start_time

human_uptime = datetime.timedelta(seconds=uptime)

print("Start time: ", datetime.datetime.now())
print("End time: ", datetime.datetime.now())
print("Uptime :" ,human_uptime)

Start time:  2019-01-13 19:18:16.743877
End time:  2019-01-13 19:18:16.762684
Uptime : 0:10:53.274690
