## Importing Necessary Libraries:

In [4]:
import pandas as pd
import numpy as np
from time import time
import logging
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC

## Loading the Dataset:

In [5]:
# Load the Labeled Faces in the Wild "people" dataset, downloading it on first use
# and caching it in the local 'data' directory. Only people with at least 70 images
# are kept, and every image is scaled by a factor of 0.4.
# data_home is passed by keyword: fetch_lfw_people takes keyword-only arguments in
# current scikit-learn releases, so the positional form fails there with TypeError.
faces = fetch_lfw_people(data_home='data', min_faces_per_person=70, resize=0.4)

Downloading LFW metadata: https://ndownloader.figshare.com/files/5976012
Downloading LFW metadata: https://ndownloader.figshare.com/files/5976009
Downloading LFW metadata: https://ndownloader.figshare.com/files/5976006
Downloading LFW data (~200MB): https://ndownloader.figshare.com/files/5976015


In [6]:
# Seed NumPy's global random number generator.
np.random.seed(42)

# Image stack dimensions: number of images, then image height and width.
n_samples, h, w = faces.images.shape

# Feature matrix and integer class labels used by the model cells.
x, y = faces.data, faces.target
n_features = x.shape[1]

# Label names; one entry per class.
target_names = faces.target_names
n_classes = len(target_names)

# Dataset summary, one item per line.
print(
    "Total dataset size:",
    "n_samples: %d" % n_samples,
    "n_features: %d" % n_features,
    "n_classes: %d" % n_classes,
    sep="\n",
)


Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7


## Splitting the Data into Training and Testing Sets

In [7]:
# Hold out 25% of the samples for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

## Computing PCA:

In [8]:
# Number of principal components (eigenfaces) to extract.
n_components = 200
print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
t0 = time()

# Fit a whitened PCA on the training split only, so the test split does not
# influence the learned basis.
# random_state is pinned because the randomized SVD solver is stochastic: without
# it the result depends on the global NumPy RNG state, i.e. on which cells were
# executed (or re-executed) before this one.
pca = PCA(
    n_components=n_components,
    whiten=True,
    svd_solver="randomized",
    random_state=42,
).fit(X_train)
print("Done in %0.3fs" %(time()- t0))


Extracting the top 200 eigenfaces from 966 faces
Done in 0.369s


### Projecting the input data on the eigenfaces orthonormal basis:

In [16]:
# Reshape each principal component to the image dimensions (h, w).
eigenfaces = pca.components_.reshape(n_components, h, w)

# Project the training split, then the test split, with the fitted PCA and
# report how long both projections took.
t0 = time()
X_train_pca, X_test_pca = (pca.transform(split) for split in (X_train, X_test))
print("Done in %0.3fs" % (time() - t0))

Done in 0.037s


## Training an SVM Classification Model

In [17]:
# Hyper-parameter candidates for the RBF-kernel SVM: every combination of
# C and gamma below is evaluated by the grid search.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

# Grid search over a class-weight-balanced SVC, fitted on the PCA-projected
# training data.
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced'),
    param_grid,
).fit(X_train_pca, Y_train)

print("Best Estimator found by Grid Search:", clf.best_estimator_, sep="\n")


Best Estimator found by Grid Search:
SVC(C=1000.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.001,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
