In [1]:
# testing adaboostclassifier on yale biometric dataset
# testing to see how it performs as compared to how it performed on mnist dataset

# importing libraries
from sklearn.model_selection import train_test_split 
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import numpy as np


In [2]:
# loading the yale numpy array using numpy load func
from numpy import load

y = load('yaleExtB_target.npy')
X = load('yaleExtB_data.npy')

In [3]:
# splitting data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
# using PCA for feature extraction
# PCA is a basic method used for image feature extraction.
# Best results visualizing eigenfaces achieved when applied on 2D images.
from sklearn.decomposition import PCA

pca = PCA(n_components=1200, whiten=True)
pca.fit(X_train)


PCA(n_components=1200, whiten=True)

In [8]:
# applying PCA to the train and test images to calculate principal components
# using PCA transform method
X_train_pca = pca.transform(X_train) 
X_test_pca = pca.transform(X_test)

In [10]:
from sklearn.tree import DecisionTreeClassifier
dt_clf= DecisionTreeClassifier(max_depth=1)
dt_clf.fit(X_train_pca, y_train)
y_pred = dt_clf.predict(X_test_pca)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         2.0       0.00      0.00      0.00         7
         3.0       0.00      0.00      0.00        10
         4.0       0.00      0.00      0.00         8
         5.0       0.00      0.00      0.00         8
         6.0       0.00      0.00      0.00        11
         7.0       0.00      0.00      0.00        13
         8.0       0.00      0.00      0.00        12
         9.0       0.00      0.00      0.00        10
        11.0       0.00      0.00      0.00        10
        12.0       0.00      0.00      0.00         9
        13.0       0.00      0.00      0.00        10
        15.0       0.00      0.00      0.00        10
        16.0       0.00      0.00      0.00         9
        17.0       0.03      1.00      0.06         7
        18.0       0.00      0.00      0.00         9
        20.0       0.00      0.00      0.00        10
        22.0       0.00      0.00      0.00        12
        23.0       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# training a neural net
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

adaboost_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=5000,
    algorithm="SAMME.R", learning_rate=0.2, random_state=42,)

In [12]:
# fitting the data to the NN
search = adaboost_clf.fit(X_train_pca, y_train)

In [13]:
# predictions
y_pred = adaboost_clf.predict(X_test_pca)

In [14]:
# recognition accuracy
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         2.0       0.75      0.43      0.55         7
         3.0       0.50      0.60      0.55        10
         4.0       0.37      0.88      0.52         8
         5.0       1.00      0.75      0.86         8
         6.0       1.00      0.45      0.62        11
         7.0       0.78      0.54      0.64        13
         8.0       0.52      1.00      0.69        12
         9.0       1.00      0.70      0.82        10
        11.0       0.71      0.50      0.59        10
        12.0       0.70      0.78      0.74         9
        13.0       1.00      0.30      0.46        10
        15.0       1.00      0.80      0.89        10
        16.0       1.00      0.33      0.50         9
        17.0       0.71      0.71      0.71         7
        18.0       0.57      0.44      0.50         9
        20.0       0.31      0.90      0.46        10
        22.0       1.00      0.33      0.50        12
        23.0       0.86    

In [15]:
# using a different classifier to try and improve accuracy
# using SGDClassifier
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train_pca, y_train)

SGDClassifier(random_state=42)

In [16]:
# prediction and scores
sgd_clf.predict(X_test_pca)
sgd_clf.score(X_train_pca, y_train)

1.0

In [17]:
# that seems like a good accuracy
# lets now use adaboost to optimise the parameters

adaboost_clf = AdaBoostClassifier(
    sgd_clf, n_estimators=100,
    algorithm="SAMME", learning_rate=0.2, random_state=42,)

In [18]:
adaboost_clf.fit(X_train_pca, y_train)

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=SGDClassifier(random_state=42),
                   learning_rate=0.2, n_estimators=100, random_state=42)

In [20]:
y_pred = adaboost_clf.predict(X_test_pca)

In [24]:
adaboost_clf.score(X_train_pca,y_train)

1.0

In [27]:
# from the above output we can see that the NN has improved 
# using adaboost classifier the accuracy has greatly improved 
# lets now use Random search to find the best hyperparameters

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from scipy.stats import uniform
from scipy.stats import loguniform

distributions = dict(C=uniform(loc=0, scale=4),penalty=['l2', 'l1'])

# define search space
space = dict()
space['learning_rate'] = ['optimal']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
clf = RandomizedSearchCV(sgd_clf, space,n_iter=500,scoring='accuracy',n_jobs=-1,cv=cv, random_state=0)
search = clf.fit(X_train_pca, y_train)
# summarize result
print('Best Score: %s' % search.best_score_)
print('Best Hyperparameters: %s' % search.best_params_)



ValueError: Invalid parameter solver for estimator SGDClassifier(random_state=42). Check the list of available parameters with `estimator.get_params().keys()`.

In [29]:
# from the above output we can see the best hyperparameters to be penalty with value of l1
# lets add more hyperparameters on the space dictionary
sgd_clf.get_params().keys()


dict_keys(['alpha', 'average', 'class_weight', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [None]:
# adding alpha, fit_intercept

distributions = dict(C=uniform(loc=0, scale=4),penalty=['l2', 'l1'])

# define search space
space = dict()
space['learning_rate'] = ['optimal']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['alpha'] = loguniform(1e-5, 100)
space['fit_intercept'] = [True, False]
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
clf = RandomizedSearchCV(sgd_clf, space,n_iter=500,scoring='accuracy',n_jobs=-1,cv=cv, random_state=0)
search = clf.fit(X_train_pca, y_train)
# summarize result
print('Best Score: %s' % search.best_score_)
print('Best Hyperparameters: %s' % search.best_params_)