# Image Classification using `sklearn.svm`

## Libraries

In [1]:
import numpy as np
from sklearn import svm, metrics
from sklearn.model_selection import GridSearchCV, train_test_split

import time
import datetime

# My addition to the original code
import pickle

## Loading Dataset from the pickle

In [2]:
# Start point
start_time = time.time()
print("Start time: ", datetime.datetime.now())

Start time:  2019-01-14 07:26:26.477670


In [3]:
pickle_in = open("Signature_Dataset_HOG_V1.pickle","rb")
image_dataset = pickle.load(pickle_in)

### Split data in Training Set and Testing Set

In [4]:
# Manual splitting of the Dataset
ds70=(70*158)//100*(20-2)
# 70%dataset_index = [70%(number_of_subjects)]*(20-number_of_fixed_genuine_signatures)
# [ ] = floor of a number


X_train = image_dataset.data[:ds70]
X_test = image_dataset.data[ds70:]
y_train = image_dataset.target[:ds70]
y_test = image_dataset.target[ds70:]

print("----Dataset Splitted----")

----Dataset Splitted----


### Train data with parameter optimization

In [5]:
from pandas import DataFrame
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [6]:
#Dictionary of models and hyperparameters
models_and_hyperparameters={'LR':(LogisticRegression(),{'C': [0.01, 0.05, 0.1, 0.5, 1, 2],
                                                        'penalty': ['l1', 'l2'], 
                                                        'class_weight': ['balanced']}),
                           'SVM':(svm.SVC(),{'C': [1, 10, 100, 1000],
                                             'gamma': [0.001, 0.0001],
                                             'kernel': ['rbf'],
                                             'class_weight': ['balanced']}),
                           'RFC':(RandomForestClassifier(), {'n_estimators': [50,100,200],
                                                             # 'max_features': ['auto', 'sqrt'],
                                                             'max_depth': [50,60,70],
                                                             'min_samples_split': [2,5,10],
                                                             #'min_samples_leaf': [1,2],
                                                             'bootstrap': [True, False],
                                                             'class_weight': ['balanced']})
                           }

In [7]:
#LOGISTIC REGRESSION
"""
print("----Training----")

k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=31)

model = models_and_hyperparameters['LR'][0]

hyperparameters = models_and_hyperparameters['LR'][1]


gs = GridSearchCV(model, 
                   param_grid = hyperparameters,
                   cv=k_fold, 
                   scoring='accuracy',
                   verbose=3,
                   return_train_score=True)

gs.fit(X_train, y_train)

print("----Training Ended----")
"""

'\nprint("----Training----")\n\nk_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=31)\n\nmodel = models_and_hyperparameters[\'LR\'][0]\n\nhyperparameters = models_and_hyperparameters[\'LR\'][1]\n\n\ngs = GridSearchCV(model, \n                   param_grid = hyperparameters,\n                   cv=k_fold, \n                   scoring=\'accuracy\',\n                   verbose=3,\n                   return_train_score=True)\n\ngs.fit(X_train, y_train)\n\nprint("----Training Ended----")\n'

In [8]:
# SVM
"""
print("----Training----")

k_fold = StratifiedKFold(n_splits=4, shuffle=True, random_state=31)

model = models_and_hyperparameters['SVM'][0]

hyperparameters = models_and_hyperparameters['SVM'][1]


gs = GridSearchCV(model, 
                   param_grid = hyperparameters,
                   cv=k_fold, 
                   scoring='accuracy',
                   verbose=3,
                   return_train_score=True)

gs.fit(X_train, y_train)

print("----Training Ended----")
"""

'\nprint("----Training----")\n\nk_fold = StratifiedKFold(n_splits=4, shuffle=True, random_state=31)\n\nmodel = models_and_hyperparameters[\'SVM\'][0]\n\nhyperparameters = models_and_hyperparameters[\'SVM\'][1]\n\n\ngs = GridSearchCV(model, \n                   param_grid = hyperparameters,\n                   cv=k_fold, \n                   scoring=\'accuracy\',\n                   verbose=3,\n                   return_train_score=True)\n\ngs.fit(X_train, y_train)\n\nprint("----Training Ended----")\n'

In [9]:
# RANDOM FOREST

print("----Training----")

k_fold = StratifiedKFold(n_splits=4, shuffle=True, random_state=31)

model = models_and_hyperparameters['RFC'][0]

hyperparameters = models_and_hyperparameters['RFC'][1]


gs = GridSearchCV(model, 
                   param_grid = hyperparameters,
                   cv=k_fold, 
                   scoring='accuracy',
                   verbose=3,
                   return_train_score=True)

gs.fit(X_train, y_train)

print("----Training Ended----")


----Training----
Fitting 4 folds for each of 54 candidates, totalling 216 fits
[CV] bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50, score=0.6868686868686869, total=   7.9s
[CV] bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.5s remaining:    0.0s


[CV]  bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50, score=0.6787878787878788, total=   6.5s
[CV] bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   15.4s remaining:    0.0s


[CV]  bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50, score=0.692929292929293, total=   6.6s
[CV] bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50, score=0.7212121212121212, total=   6.2s
[CV] bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=100, score=0.7070707070707071, total=  11.6s
[CV] bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=100, score=0.6989898989898989, total=  11.7s
[CV] bootstrap=True, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced, max_depth=50, min_s

[CV]  bootstrap=True, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50, score=0.6909090909090909, total=   6.1s
[CV] bootstrap=True, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=True, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50, score=0.6444444444444445, total=   6.0s
[CV] bootstrap=True, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=True, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50, score=0.7151515151515152, total=   6.1s
[CV] bootstrap=True, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=100, score=0.692929292929293, total=  11.2s
[CV] bootstrap=True, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced, max_depth=60, min_sam

[CV]  bootstrap=True, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50, score=0.703030303030303, total=   6.0s
[CV] bootstrap=True, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=True, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50, score=0.6868686868686869, total=   6.1s
[CV] bootstrap=True, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=True, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50, score=0.6787878787878788, total=   6.0s
[CV] bootstrap=True, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=True, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50, score=0.7090909090909091, total=   6.3s
[CV] bootstrap=True, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=100 
[CV]  bootstrap=True, class_weight=balanced, max_depth=70, min_sampl

[CV]  bootstrap=True, class_weight=balanced, max_depth=70, min_samples_split=10, n_estimators=200, score=0.7333333333333333, total=  21.3s
[CV] bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50, score=0.7212121212121212, total=   9.4s
[CV] bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50, score=0.7151515151515152, total=   9.6s
[CV] bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50, score=0.7131313131313132, total=   9.6s
[CV] bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=False, class_weight=balanced, max_depth=50,

[CV]  bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=10, n_estimators=200, score=0.7393939393939394, total=  38.9s
[CV] bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=10, n_estimators=200 
[CV]  bootstrap=False, class_weight=balanced, max_depth=50, min_samples_split=10, n_estimators=200, score=0.7414141414141414, total=  38.9s
[CV] bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50, score=0.7171717171717171, total=  10.4s
[CV] bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50, score=0.7414141414141414, total=  10.9s
[CV] bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=False, class_weight=balanced, max_dept

[CV]  bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=10, n_estimators=200, score=0.7414141414141414, total=  39.4s
[CV] bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=10, n_estimators=200 
[CV]  bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=10, n_estimators=200, score=0.7212121212121212, total=  41.2s
[CV] bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=10, n_estimators=200 
[CV]  bootstrap=False, class_weight=balanced, max_depth=60, min_samples_split=10, n_estimators=200, score=0.7232323232323232, total=  40.2s
[CV] bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50, score=0.705050505050505, total=  10.1s
[CV] bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=2, n_estimators=50 
[CV]  bootstrap=False, class_weight=balanced, max_d

[CV]  bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=10, n_estimators=200, score=0.7535353535353535, total=  37.7s
[CV] bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=10, n_estimators=200 
[CV]  bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=10, n_estimators=200, score=0.7494949494949495, total=  38.8s
[CV] bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=10, n_estimators=200 
[CV]  bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=10, n_estimators=200, score=0.7434343434343434, total=  39.0s
[CV] bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=10, n_estimators=200 
[CV]  bootstrap=False, class_weight=balanced, max_depth=70, min_samples_split=10, n_estimators=200, score=0.7515151515151515, total=  39.5s


[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed: 65.5min finished


----Training Ended----


In [10]:
DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_class_weight,param_max_depth,param_min_samples_split,param_n_estimators,params,...,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
0,6.583638,0.500548,0.220798,0.182865,True,balanced,50,2,50,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.721212,0.694949,0.015971,48,1.0,1.0,1.0,1.0,1.0,0.0
1,11.37359,0.150671,0.120821,0.002678,True,balanced,50,2,100,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.713131,0.704545,0.005954,42,1.0,1.0,1.0,1.0,1.0,0.0
2,21.47992,0.128142,0.142217,0.000252,True,balanced,50,2,200,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.729293,0.719192,0.012205,29,1.0,1.0,1.0,1.0,1.0,0.0
3,5.93668,0.100051,0.107549,9.6e-05,True,balanced,50,5,50,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.69697,0.690404,0.013804,50,1.0,1.0,1.0,1.0,1.0,0.0
4,11.044159,0.140344,0.118931,0.000239,True,balanced,50,5,100,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.735354,0.706566,0.024047,41,1.0,1.0,1.0,1.0,1.0,0.0
5,21.629237,0.52649,0.142313,0.000801,True,balanced,50,5,200,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.723232,0.726263,0.009529,22,1.0,1.0,1.0,1.0,1.0,0.0
6,5.881667,0.132362,0.107572,0.000228,True,balanced,50,10,50,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.688889,0.678283,0.008002,54,1.0,1.0,0.999327,1.0,0.999832,0.000292
7,10.846113,0.124563,0.118649,0.000127,True,balanced,50,10,100,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.713131,0.709091,0.010785,37,1.0,1.0,1.0,1.0,1.0,0.0
8,20.862422,0.231569,0.140942,0.000122,True,balanced,50,10,200,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.70101,0.713131,0.008206,35,1.0,1.0,1.0,1.0,1.0,0.0
9,5.964121,0.055011,0.107661,0.00013,True,balanced,60,2,50,"{'bootstrap': True, 'class_weight': 'balanced'...",...,0.715152,0.685354,0.025609,52,1.0,1.0,1.0,1.0,1.0,0.0


### Predict

In [11]:
y_pred = gs.predict(X_test)

In [12]:
print("Best hyperparameters: {}".format(gs.best_params_))
print("Best accuracy score: {:3f}".format(gs.best_score_))

Best hyperparameters: {'bootstrap': False, 'class_weight': 'balanced', 'max_depth': 50, 'min_samples_split': 2, 'n_estimators': 200}
Best accuracy score: 0.758586


### Report

In [13]:
print("Classification report for - \n{}:\n{}\n".format(gs, metrics.classification_report(y_test, y_pred)))

Classification report for - 
GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=31, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 100, 200], 'max_depth': [50, 60, 70], 'min_samples_split': [2, 5, 10], 'bootstrap': [True, False], 'class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=3):
                         precision    recall  f1-score   support

Genuine Genuine Forgery       0.62      

In [14]:
# End point
end_time = time.time()

uptime = end_time - start_time

human_uptime = datetime.timedelta(seconds=uptime)

print("End time: ", datetime.datetime.now())
print("Uptime :" ,human_uptime)

End time:  2019-01-14 08:36:07.980338
Uptime : 1:09:41.502698
