# Image Classification using `sklearn.svm`

## Libraries

In [1]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
%matplotlib notebook
from sklearn import svm, metrics, datasets
from sklearn.utils import Bunch
from sklearn.model_selection import GridSearchCV, train_test_split

from skimage.io import imread
from skimage.transform import resize

import time
import datetime

# My addition to the original code
import os
from skimage.feature import hog
from random import randint

In [2]:
# Start point
start_time = time.time()
print("Start time: ", datetime.datetime.now())

Start time:  2019-01-01 22:12:31.877201


## Preprocessing

In [3]:
# HOG of the image
def HOG(image):
    img = hog(image,
              orientations=8,
              pixels_per_cell=(4, 4),
              cells_per_block=(2, 2),
              block_norm='L2-Hys')
    return img

## Load images in structured directory

In [4]:
# Function that builds pair of signatures on which the algorithm is going to work.

def match_images(flat_data, target):
    """
    Parameters
    
    - flat_data: a list of ndarray containing the flattern information coming from 
      the hog on the signature image.
    - target: a list of strings. Each component can be 'Genuine' of 'Forgery'.
    
    Returns
    
    A dataset divided in:
    - data: a list of ndarray, each one originating from the concatenation of 2 array of the input flat_data.
    - target: a list of strings, each one deriving from the concatenation of 2 strings of input target list.
    
    -------------------------------------------------------
    
    Rule of matching:
    - The first 5 lists of 3 signatures are composed of 3 genuine signatures randomly choosen between the 10 available.
    - The latter 5 are composed of 1 genuine signature and 2 forgery signatures, in this order,
      randomly choosen.
    """
      
    m_flat_data=[]
    #m_images=[]
    m_target=[]
    g_indexes=[ind for ind, t in enumerate(target) if t=='Genuine']
    f_indexes=[ind for ind,t in enumerate(target) if t=='Forgery']
    matches=[]
    for i in range(0,5): # The first 5 matchings: Genuine - Genuine - Genuine
        n=randint(0,len(g_indexes)-1)
        m=randint(0,len(g_indexes)-1)
        p=randint(0,len(g_indexes)-1)
        while n == m or n == p or m == p or (g_indexes[n],g_indexes[m],g_indexes[p]) in matches:
            n=randint(0,len(g_indexes)-1)
            m=randint(0,len(g_indexes)-1)
            p=randint(0,len(g_indexes)-1)
        m_flat_data.append(np.concatenate((flat_data[g_indexes[n]],flat_data[g_indexes[m]],flat_data[g_indexes[p]])))
        m_target.append(target[g_indexes[n]]+' '+target[g_indexes[m]]+' '+target[g_indexes[p]])
        matches.append((g_indexes[n],g_indexes[m],g_indexes[p]))
    
    matches=[]
    for i in range(5,10): # The latter 5 matchings: Forgery - Forgery - Forgery
        n=randint(0,len(f_indexes)-1)
        m=randint(0,len(f_indexes)-1)
        p=randint(0,len(f_indexes)-1)
        while n == m or n == p or m == p or (f_indexes[n],f_indexes[m],f_indexes[p]) in matches:
            n=randint(0,len(f_indexes)-1)
            m=randint(0,len(f_indexes)-1)
            p=randint(0,len(f_indexes)-1)
        m_flat_data.append(np.concatenate((flat_data[f_indexes[n]],flat_data[f_indexes[m]],flat_data[f_indexes[p]])))
        m_target.append(target[f_indexes[n]]+' '+target[f_indexes[m]]+' '+target[f_indexes[p]])
        matches.append((f_indexes[n],f_indexes[m],f_indexes[p]))
        
    return Bunch(data=m_flat_data,
                 target=m_target
                )    

In [5]:
def load_image_files(container_path, dimension=(70, 100)):
    """
    Load image files with categories as subfolder names 
    which performs like scikit-learn sample dataset
    
    Parameters
    ----------
    container_path : string or unicode
        Path to the main folder holding one subfolder per category
    dimension : tuple
        size to which image are adjusted to
        
    Returns
    -------
    Bunch
    """
    image_dir = Path(container_path)
    subj = [directory for directory in image_dir.iterdir() if directory.is_dir()]
    categories = ['Genuine', 'Forgery']

    descr = "An image classification dataset"
    #images = []
    flat_data = []
    target = []
    
    for n, s in enumerate(subj):
        folders = [sub_directory for sub_directory in s.iterdir() if sub_directory.is_dir()]
        temp_flat_data=[] # Temporary list of flat_data used to store the information of the current subject 
                          # and reinitialized as empty list for each subject
        temp_images=[] # Temporary list of images
        temp_target=[] # Temporary list of target
        for sub_dir in folders:
            if sub_dir.name in categories: # Skip the Disguised folder
                for file in sub_dir.iterdir():
                    img = imread(file)
                    img_resized = resize(img, dimension, anti_aliasing=True, mode='reflect')
                    # flat_data.append(img_resized.flatten()) : original function used to 
                    # flat the image in an array.
                    temp_flat_data.append(HOG(img_resized)) # Is it ok to flat the image this way?
                    # temp_images.append(img_resized)
                    temp_target.append(sub_dir.name)         
        subj_dataset = match_images(temp_flat_data, temp_target)
        flat_data += subj_dataset.data
        target += subj_dataset.target
        # images += subj_dataset.images

    flat_data = np.array(flat_data)
    target = np.array(target)
    # images = np.array(images)
    
    return Bunch(data=flat_data,
                 target=target,
                 target_names=categories,
                 # images=images,
                 DESCR=descr)

In [6]:
# To let the code run it is necessary to put this script in the same folder which contains the database folder

db_folder='Database' # Modify with the name of the folder containing the database.
image_dataset = load_image_files(os.path.join(os.getcwd(),db_folder))

### Split data in Training Set and Testing Set

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    image_dataset.data, image_dataset.target, test_size=0.3,random_state=109)

### Train data with parameter optimization

In [8]:
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
svc = svm.SVC()
clf = GridSearchCV(svc, param_grid)
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

### Predict

In [9]:
y_pred = clf.predict(X_test)

### Report

In [10]:
print("Classification report for - \n{}:\n{}\n".format(
    clf, metrics.classification_report(y_test, y_pred)))

Classification report for - 
GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0):
                         precision    recall  f1-score   support

Forgery Forgery Forgery       0.82      0.81      0.82       235
Genuine Genuine Genuine       0.81      0.83      0.82       239

            avg / total       0.82      0.82      0.82       474




In [11]:
# End point
end_time = time.time()

uptime = end_time - start_time

human_uptime = datetime.timedelta(seconds=uptime)

print("Start time: ", datetime.datetime.now())
print("End time: ", datetime.datetime.now())
print("Uptime :" ,human_uptime)

Start time:  2019-01-01 22:46:50.816510
End time:  2019-01-01 22:46:50.817483
Uptime : 0:34:18.939309
