In [1]:
from sklearn.datasets import fetch_lfw_people

# Load the LFW faces (cached under data_home), keeping only people with at
# least 10 pictures and resizing every face picture by a ratio of 0.4.
lfw_people = fetch_lfw_people(
    data_home="../data/LFW",
    resize=0.4,
    min_faces_per_person=10,
)

# Dimensions of the image array; h and w are kept around for plotting.
n_samples, h, w = lfw_people.images.shape

# The model is fed the loader's 2-D data matrix directly; the relative
# position of pixels is ignored by this model.
X = lfw_people.data
n_features = X.shape[1]

# Prediction target: the id of the person shown in each image.
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = len(target_names)

print(
    "Total dataset size:",
    "n_samples: %d" % n_samples,
    "n_features: %d" % n_features,
    "n_classes: %d" % n_classes,
    sep="\n",
)

Total dataset size:
n_samples: 4324
n_features: 1850
n_classes: 158


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [4]:
# Location of the LFW People dataset on your local system.
DATASET_PATH = "../data/LFW/lfw_home/lfw_funneled"

# Location of the pre-processed dataset on your local system.
PROCESSED_DATASET_PATH = "../data/LFW/lfw_home/processed_dataset"

In [6]:
import os
import glob
import shutil

def preprocess_dataset(dataset_path=None, processed_path=None):
    """
    Flatten the dataset by copying every image from all sub-folders into one directory.

    Each ``*.jpg`` file found directly inside a sub-folder of the source
    directory is copied into the destination directory. The destination is
    created when missing (including parents) and reused when it already
    exists, so the function can be re-run safely; files copied earlier are
    overwritten.

    Args:
        dataset_path: Directory whose sub-folders hold the images. Defaults
            to the module-level ``DATASET_PATH``.
        processed_path: Directory that receives the copies. Defaults to the
            module-level ``PROCESSED_DATASET_PATH``.

    Returns:
        bool: True when the copy completed, False when a filesystem error
        (``OSError``) occurred. The error is printed rather than raised.
    """
    src_root = DATASET_PATH if dataset_path is None else dataset_path
    dst_root = PROCESSED_DATASET_PATH if processed_path is None else processed_path

    print("Starting pre-processing of the LFW People dataset")
    try:
        # List the source first so a missing dataset fails before an empty
        # destination directory gets created.
        folders = os.listdir(src_root)
        # exist_ok=True: a plain os.mkdir raised FileExistsError on every run
        # after the first one, making the notebook non-re-runnable.
        os.makedirs(dst_root, exist_ok=True)
        for folder in folders:
            src_dir = os.path.join(src_root, folder)
            # Escape the directory part so characters such as "[" in a path
            # are matched literally instead of being read as glob syntax.
            pattern = os.path.join(glob.escape(src_dir), "*.jpg")
            for jpgfile in glob.iglob(pattern):
                shutil.copy(jpgfile, dst_root)
        print("Sucessfully prepared the pre-processed dataset for Image recognition")
    except OSError as err:
        # Only filesystem failures are reported as a soft error; programming
        # errors are left to surface normally.
        print(f"ERROR: Pre-processing failed with error: {err}")
        return False
    return True

In [7]:
# Run the copy step; the returned flag (True on success) is shown as the cell's result.
preprocess_dataset()

Starting pre-processing of the LFW People dataset
Sucessfully prepared the pre-processed dataset for Image recognition


True