# Image Classification using `sklearn` (`LogisticRegressionCV`)

## Libraries

In [16]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm, metrics, datasets
from sklearn.utils import Bunch
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegressionCV

from skimage.io import imread
from skimage.transform import resize

import time
import datetime

# My addition to the original code
import os
from skimage.feature import hog
from random import randint

import os
from skimage.util import img_as_ubyte
from skimage import io
from skimage.filters import threshold_otsu


In [2]:
# Start point: remember when the run began so the final cell can report the
# total elapsed time.
start_time = time.time()
start_stamp = datetime.datetime.now()
print("Start time: ", start_stamp)

Start time:  2019-01-11 10:24:46.921092


## Preprocessing

In [9]:
def HOG(image):
    """Return the HOG descriptor of ``image`` as computed by ``skimage.feature.hog``.

    The descriptor is requested with 8 orientations, 4x4-pixel cells,
    2x2-cell blocks, 'L2-Hys' block normalisation and ``feature_vector=True``.
    """
    hog_settings = dict(
        orientations=8,
        pixels_per_cell=(4, 4),
        cells_per_block=(2, 2),
        block_norm='L2-Hys',
        feature_vector=True,
    )
    return hog(image, **hog_settings)

In [4]:
# binarization using Otsu's method
def binarize(inp_image):
    """Binarize ``inp_image`` with a global Otsu threshold.

    Pixels strictly greater than the value returned by ``threshold_otsu`` form
    the boolean mask, which is then converted with ``img_as_ubyte`` before
    being returned.
    """
    otsu_level = threshold_otsu(inp_image)
    above_threshold = inp_image > otsu_level
    return img_as_ubyte(above_threshold)

In [5]:
# Helpers for morphological filtering.
# NOTE(review): none of the names imported below, nor `selem`, is referenced by
# the other cells visible in this notebook (a comment in load_image_files says
# the opening filter gave bad results) — presumably left over from an earlier
# experiment; confirm before removing.

from skimage.morphology import erosion, dilation, opening, closing, white_tophat
from skimage.morphology import black_tophat, skeletonize, convex_hull_image
from skimage.morphology import disk, square, diamond

# Presumably the structuring element for the filters above, built with disk(1).
selem = disk(1)

## Load images from a structured directory

In [10]:
# Function that builds the samples on which the algorithm is going to work: n reference signatures plus one signature to be classified.

def match_n(flat_data, target, n, samples_per_class=10):
    """
    Build the classification samples of one subject.

    The first ``n`` genuine descriptors are concatenated into a fixed
    reference vector.  Every returned sample is that reference vector followed
    by one more descriptor: first the remaining genuine ones (positions
    ``n`` .. ``samples_per_class - 1`` among the genuine descriptors), then the
    first ``samples_per_class - n`` forgeries.

    Parameters
    ----------
    flat_data : list of ndarray
        Flattened HOG descriptors of the signature images of one subject.
    target : list of str
        Label of each descriptor; each entry is 'Genuine' or 'Forgery'.
    n : int
        Number of genuine descriptors used as the fixed reference (>= 1).
    samples_per_class : int, default 10
        Number of genuine descriptors taken into account; ``samples_per_class - n``
        genuine and as many forgery samples are produced.

    Returns
    -------
    Bunch
        data : list of ndarray, each the concatenation of the ``n`` reference
            descriptors and one further descriptor.
        target : list of str, ``'Genuine '`` repeated ``n`` times followed by
            the label of the appended descriptor.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or the subject does not provide enough
        'Genuine' / 'Forgery' descriptors.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {}".format(n))

    g_indexes = [ind for ind, t in enumerate(target) if t == 'Genuine']
    f_indexes = [ind for ind, t in enumerate(target) if t == 'Forgery']

    # Clamp at zero so that n > samples_per_class yields no samples instead of
    # a negative slice bound.
    n_forgery = max(samples_per_class - n, 0)
    needed_genuine = max(n, samples_per_class)
    if len(g_indexes) < needed_genuine or len(f_indexes) < n_forgery:
        raise ValueError(
            "match_n needs at least {} 'Genuine' and {} 'Forgery' samples, "
            "got {} and {}".format(
                needed_genuine, n_forgery, len(g_indexes), len(f_indexes)))

    # Reference vector: the first n genuine descriptors, joined in one call.
    fixed_g = np.concatenate([flat_data[ind] for ind in g_indexes[:n]])
    label_prefix = 'Genuine ' * n

    m_flat_data = []
    m_target = []
    # Genuine candidates first, then forgeries, to keep the original ordering.
    for ind in g_indexes[n:samples_per_class] + f_indexes[:n_forgery]:
        m_flat_data.append(np.concatenate((fixed_g, flat_data[ind])))
        m_target.append(label_prefix + target[ind])

    return Bunch(data=m_flat_data,
                 target=m_target
                )


In [11]:
def load_image_files(container_path, dimension=(100, 144), n_reference=3):  # height x width
    """
    Load the signature images and build the per-subject classification samples.

    Expected layout: ``container_path/<subject>/<category>/<image file>``.
    Only the 'Genuine' and 'Forgery' category folders are read; any other
    folder (e.g. 'Disguised') is skipped.  Every image is loaded in grayscale,
    binarized with ``binarize``, resized to ``dimension`` and described with
    ``HOG``; the descriptors of each subject are then combined by ``match_n``.

    Directories and files are visited in sorted order so that the signatures
    chosen as references do not depend on the filesystem's listing order.

    Parameters
    ----------
    container_path : str or path-like
        Path to the main folder holding one subfolder per subject.
    dimension : tuple
        (height, width) to which every image is resized.
    n_reference : int, default 3
        Number of genuine signatures used as the reference; passed to
        ``match_n`` as ``n``.

    Returns
    -------
    Bunch
        data : ndarray built from the combined feature vectors of all subjects.
        target : ndarray of the label strings produced by ``match_n``.
        target_names : ['Genuine', 'Forgery'], the category folders that were read.
        DESCR : short description string.
    """
    image_dir = Path(container_path)
    categories = ['Genuine', 'Forgery']
    descr = "An image classification dataset"

    flat_data = []
    target = []

    subject_dirs = sorted(entry for entry in image_dir.iterdir() if entry.is_dir())
    for subject_dir in subject_dirs:
        # Per-subject accumulators, rebuilt from scratch for every subject.
        subject_features = []
        subject_labels = []

        category_dirs = sorted(
            entry for entry in subject_dir.iterdir()
            if entry.is_dir() and entry.name in categories  # skip the Disguised folder
        )
        for category_dir in category_dirs:
            for image_path in sorted(category_dir.iterdir()):
                # Ignore sub-folders and hidden files (e.g. .DS_Store), which
                # are not images and would make imread fail.
                if not image_path.is_file() or image_path.name.startswith('.'):
                    continue

                img_gray = img_as_ubyte(io.imread(image_path, as_gray=True))  # load in grayscale
                img_bin = binarize(img_gray)
                # NOTE: resize() produces a dtype('float64') while imread() and
                # binarize() return dtype('uint8').
                # Original author's note: the opening filter is bad (hence not applied).
                img_resized = resize(img_bin, dimension, mode='reflect')

                subject_features.append(HOG(img_resized))
                subject_labels.append(category_dir.name)

        subj_dataset = match_n(subject_features, subject_labels, n_reference)
        flat_data += subj_dataset.data
        target += subj_dataset.target

    return Bunch(data=np.array(flat_data),
                 target=np.array(target),
                 target_names=categories,
                 DESCR=descr)

In [12]:
# For this cell to work, the database folder must be located in the current
# working directory (i.e. next to this notebook when launched from its folder).

db_folder = 'SignUniPD_anonymised'  # Modify with the name of the folder containing the database.
dataset_root = os.path.join(os.getcwd(), db_folder)
image_dataset = load_image_files(dataset_root)

print("----Dataset Loaded----")

----Dataset Loaded----


### Split data into Training Set and Test Set

In [13]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
split_parts = train_test_split(
    image_dataset.data,
    image_dataset.target,
    test_size=0.3,
    random_state=109,
)
X_train, X_test, y_train, y_test = split_parts

print("----Dataset Splitted----")

----Dataset Splitted----


In [14]:
# Report the shapes of the feature matrices first, then of the label vectors.
for set_name, array in (("Training Set", X_train), ("Test Set", X_test)):
    print("{} shape: {}".format(set_name, array.shape))

for set_name, array in (("Training Set", y_train), ("Test Set", y_test)):
    print("{} shape: {}".format(set_name, array.shape))

Training Set shape: (1548, 107520)
Test Set shape: (664, 107520)
Training Set shape: (1548,)
Test Set shape: (664,)


### Train the model with parameter optimization

In [17]:
# Cross-validated logistic regression with all-default settings; the values in
# effect are visible in the estimator repr echoed as this cell's output.
clf = LogisticRegressionCV()
# Kept as the bare last expression of the cell so the fitted estimator's repr
# is displayed.
clf.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

### Predict

In [18]:
# Predict the label of every held-out sample with the fitted classifier.
y_pred = clf.predict(X_test)

### Report

In [19]:
# Per-class precision / recall / F1 on the held-out samples, preceded by the
# description of the classifier that produced the predictions.
report_text = metrics.classification_report(y_test, y_pred)
print("Classification report for - \n{}:\n{}\n".format(clf, report_text))

Classification report for - 
LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0):
                                 precision    recall  f1-score   support

Genuine Genuine Genuine Forgery       0.52      0.49      0.50       340
Genuine Genuine Genuine Genuine       0.50      0.53      0.51       324

                    avg / total       0.51      0.51      0.51       664




In [20]:
# End point: stop the clock and report how long the whole run took.
end_time = time.time()

# Seconds elapsed since start_time was recorded in the "Start point" cell.
uptime = end_time - start_time
human_uptime = datetime.timedelta(seconds=uptime)

finished_at = datetime.datetime.now()
print("End time: ", finished_at)
print("Uptime :", human_uptime)

End time:  2019-01-11 10:34:55.331361
Uptime : 0:10:08.408973
