# Loading Features

## Libraries

In [1]:
import time
import datetime
import pickle
import os
import numpy as np
from pathlib import Path
from sklearn.utils import Bunch
from skimage.feature import hog
from skimage.transform import resize
from skimage.util import img_as_ubyte
from skimage import io
from skimage.filters import threshold_otsu

In [2]:
# Start point: record the epoch time at which the run began; `start_time` is read
# again by the last cell of the notebook to compute the total runtime.
start_time = time.time()
print("Start time: ", datetime.datetime.now())

Start time:  2019-01-16 11:03:35.336890


## Preprocessing

In [3]:
def HOG(image):
    """
    Compute the Histogram of Oriented Gradients descriptor of an image.

    Parameters
    ----------
    image : ndarray
        Single-channel (grayscale) image; it is handed to `skimage.feature.hog`
        unchanged.

    Returns
    -------
    ndarray
        The HOG descriptor returned by `skimage.feature.hog` with
        `feature_vector=True`, i.e. as a flat feature vector.
    """
    # The former `multichannel=False` argument is intentionally not passed:
    # scikit-image deprecated that keyword in favour of `channel_axis` and later
    # removed it, so passing it raises a TypeError on current releases. For the
    # 2-D grayscale images used in this notebook the library default treats the
    # input as single-channel, so the result is unchanged.
    features = hog(image,
                   orientations=8,
                   pixels_per_cell=(4, 4),
                   cells_per_block=(2, 2),
                   block_norm='L2-Hys',
                   feature_vector=True)
    return features

In [4]:
def binarize(inp_image):
    """
    Binarize an image with a global threshold chosen by Otsu's method.

    Parameters
    ----------
    inp_image : ndarray
        Image to binarize.

    Returns
    -------
    ndarray
        The boolean mask of the pixels strictly above the Otsu threshold,
        converted to an 8-bit unsigned image with `img_as_ubyte`.
    """
    otsu_level = threshold_otsu(inp_image)
    return img_as_ubyte(inp_image > otsu_level)

## Load images in structured directory

In [5]:
# Each sample = n fixed genuine signatures + 1 further signature (genuine or forgery)

def match_images(flat_data, target, n, n_genuine=10, n_forgery=10):
    """
    Build the samples of one subject by pairing a fixed group of genuine
    signatures with one further signature.

    Parameters
    ----------
    flat_data : list of ndarray
        Flattened HOG descriptors, one per signature image.
    target : list of str
        Label of each entry of `flat_data` (same order); each one is
        'Genuine' or 'Forgery'.
    n : int
        Number of genuine signatures used as the fixed reference part of every
        sample. The first `n` entries labelled 'Genuine' are used. Must be >= 1.
    n_genuine : int, optional
        Number of genuine signatures considered overall, reference ones
        included. Defaults to 10, the value that used to be hard-coded.
    n_forgery : int, optional
        Number of forged signatures considered. Defaults to 10, the value that
        used to be hard-coded.

    Returns
    -------
    Bunch
        - data: list of ndarray, each one the concatenation of the `n`
          reference descriptors followed by one further descriptor. It holds
          `n_genuine - n` genuine samples followed by `n_forgery` forgery
          samples (20 - n with the defaults).
        - target: list of int aligned with `data`; +1 when the appended
          signature is genuine, -1 when it is a forgery.

    Raises
    ------
    ValueError
        If `n` is smaller than 1.
    IndexError
        If `target` holds fewer genuine or forged entries than requested.
    """
    if n < 1:
        # With n < 1 the reference part would still silently contain one
        # signature, which would then also be paired with itself.
        raise ValueError("n must be at least 1, got {}".format(n))

    g_indexes = [ind for ind, t in enumerate(target) if t == 'Genuine']
    f_indexes = [ind for ind, t in enumerate(target) if t == 'Forgery']

    needed_genuine = max(n, n_genuine)
    if len(g_indexes) < needed_genuine:
        raise IndexError("{} genuine signatures required, only {} available"
                         .format(needed_genuine, len(g_indexes)))
    if len(f_indexes) < n_forgery:
        raise IndexError("{} forged signatures required, only {} available"
                         .format(n_forgery, len(f_indexes)))

    # Reference part shared by every sample: the first n genuine descriptors,
    # joined in a single call instead of growing the array step by step.
    fixed_g = np.concatenate([flat_data[ind] for ind in g_indexes[:n]])

    m_flat_data = []
    m_target = []

    # Remaining genuine signatures -> positive samples.
    for i in range(n, n_genuine):
        m_flat_data.append(np.concatenate((fixed_g, flat_data[g_indexes[i]])))
        m_target.append(+1)  # +1 is the positive class

    # Forged signatures -> negative samples.
    for i in range(n_forgery):
        m_flat_data.append(np.concatenate((fixed_g, flat_data[f_indexes[i]])))
        m_target.append(-1)  # -1 is the negative class

    return Bunch(data=m_flat_data,
                 target=m_target)

In [None]:
def load_image_files_split(container_path, dimension=(100, 144),
                           n_train_subjects=110, n_fixed=2):
    """
    Load the signature images of every subject, turn them into matched HOG
    samples and split them by subject into a training and a test set.

    The expected layout is one sub-folder per subject inside `container_path`,
    each of them holding category sub-folders; only the 'Genuine' and
    'Forgery' ones are read.

    Parameters
    ----------
    container_path : string or path-like
        Path to the main folder holding one sub-folder per subject.
    dimension : tuple
        (height, width) every binarized image is resized to before the HOG
        descriptor is computed.
    n_train_subjects : int, optional
        Number of subjects (taken in sorted folder order) whose samples go to
        the training set; all the remaining subjects go to the test set.
        Defaults to 110, the value that used to be hard-coded.
        NOTE(review): the original docstring described this as a 70% / 30%
        split, which only holds for a matching number of subjects - confirm.
    n_fixed : int, optional
        Number of fixed genuine signatures per sample, forwarded to
        `match_images`. Defaults to 2, the value that used to be hard-coded.

    Returns
    -------
    Bunch
        - X_train, y_train: samples and +1 / -1 labels of the training subjects
        - X_test, y_test: samples and +1 / -1 labels of the test subjects
        All four are numpy arrays.
    """
    image_dir = Path(container_path)
    # iterdir() yields entries in arbitrary, filesystem-dependent order. The
    # split below depends on the position of each subject, so the folders are
    # sorted to make it identical on every run and machine.
    subjects = sorted(directory for directory in image_dir.iterdir()
                      if directory.is_dir())
    categories = ['Genuine', 'Forgery']

    flat_data_train = []
    target_train = []
    flat_data_test = []
    target_test = []

    for subject_pos, subject_dir in enumerate(subjects):
        # Descriptors and labels of the current subject only; rebuilt from
        # scratch for every subject.
        subject_flat_data = []
        subject_target = []

        folders = sorted(sub_directory for sub_directory in subject_dir.iterdir()
                         if sub_directory.is_dir())
        for sub_dir in folders:
            if sub_dir.name not in categories:  # Skip the Disguised folder
                continue
            # Sorted as well: the file order decides which genuine signatures
            # end up as the fixed references in match_images.
            for file in sorted(sub_dir.iterdir()):
                img_gray = img_as_ubyte(io.imread(file, as_gray=True))  # load the image in grayscale
                img_bin = binarize(img_gray)
                img_resized = resize(img_bin, dimension, anti_aliasing=True, mode='reflect')
                subject_flat_data.append(HOG(img_resized))
                subject_target.append(sub_dir.name)

        subj_dataset = match_images(subject_flat_data, subject_target, n_fixed)

        # The split is made on subjects, so all the samples of one person end
        # up on the same side.
        if subject_pos < n_train_subjects:
            flat_data_train += subj_dataset.data
            target_train += subj_dataset.target
        else:
            flat_data_test += subj_dataset.data
            target_test += subj_dataset.target

    flat_data_train = np.array(flat_data_train)
    target_train = np.array(target_train)
    flat_data_test = np.array(flat_data_test)
    target_test = np.array(target_test)

    # Quick sanity output of the training labels (kept from the original cell).
    print(target_train.dtype)
    print(target_train)

    return Bunch(X_train=flat_data_train,
                 X_test=flat_data_test,
                 y_train=target_train,
                 y_test=target_test)

In [None]:
# The dataset folder is looked up inside the current working directory, so the
# notebook has to be run from the directory that contains the dataset folder.

db_folder = 'SignUniPD_anonymised'  # Modify with the name of the folder containing the dataset.
image_dataset = load_image_files_split(Path.cwd() / db_folder)

print("----Dataset Loaded----")

int64
[ 1  1  1 ... -1 -1 -1]
----Dataset Loaded----


## Creation of the pickle

In [None]:
# Creation of the pickle which is going to store the loaded Dataset.
# The `with` block closes the file even when pickle.dump raises.

with open("Signature_Dataset_HOG_110_3_int.pickle", "wb") as pickle_out:  # Modify with the name of the dataset you're saving
    pickle.dump(image_dataset, pickle_out)

print("----Pickle created----")

In [None]:
# End point

end_time = time.time()
uptime = end_time - start_time
human_uptime = datetime.timedelta(seconds=uptime)

# Both timestamps are derived from the recorded epoch values: printing
# datetime.now() under "Start time" would just show the end time a second time.
print("Start time: ", datetime.datetime.fromtimestamp(start_time))
print("End time: ", datetime.datetime.fromtimestamp(end_time))
print("Uptime :" ,human_uptime)