### Import the libraries

In [None]:
#using Python 3.12.8

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from collections import Counter
from scipy.stats import loguniform

import matplotlib.pyplot as plt

In [None]:
# Fetch the Olivetti faces dataset, shuffled with a fixed seed
olivetti_dataset = fetch_olivetti_faces(shuffle=True, random_state=42, download_if_missing=True)

# Flattened pixel matrix and the person-ID labels (one label per row of X)
X, y = olivetti_dataset.data, olivetti_dataset.target

# The image geometry is kept so flat vectors can be reshaped back into pictures when plotting
n_samples, h, w = olivetti_dataset.images.shape # (400, 64, 64)
n_features = X.shape[1]

print("Number of samples: %d" % n_samples) # Number of samples = rows in X
print("Number of features: %d" % n_features) # Number of features = columns in X

### Functions

In [3]:
# Apply PCA for dimensionality reduction
def apply_pca(X_train, X_test, n_components=150, random_state=42):
    """Fit a whitened PCA on the training data and project both splits onto it.

    Parameters
    ----------
    X_train : array of shape (n_train_samples, n_features)
        Data the PCA is fitted on.
    X_test : array of shape (n_test_samples, n_features)
        Data that is only transformed with the fitted PCA.
    n_components : int, default=150
        Requested number of principal components. It is capped at
        ``min(n_train_samples, n_features)``, the most PCA can extract.
    random_state : int, default=42
        Seed for the randomized SVD solver so repeated runs give the same basis.

    Returns
    -------
    X_train_pca, X_test_pca : arrays
        The projected training and test data.
    eigenfaces : array of shape (n_components, h, w)
        Principal components reshaped to image size; relies on the
        notebook-level ``h`` and ``w`` (image height and width).
    """
    # PCA cannot return more components than min(n_samples, n_features)
    n_components = min(n_components, *X_train.shape)
    print(
    "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
    )
    pca = PCA(
        n_components=n_components,
        svd_solver="randomized",
        whiten=True,
        random_state=random_state,  # the randomized solver is stochastic without a seed
    ).fit(X_train)
    eigenfaces = pca.components_.reshape((n_components, h, w))

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    return X_train_pca, X_test_pca, eigenfaces

In [4]:
from sklearn.utils import resample

# Find the best SVM Model with GridSearchCV
def findTheBestSVMModel(X_train, y_train, n_splits=5, random_state=42):
    """Grid-search an SVC with stratified cross-validation and return the best model.

    Classes with fewer than ``n_splits`` samples are first topped up by sampling
    their own rows with replacement, because StratifiedKFold needs at least
    ``n_splits`` members per class.

    NOTE: the oversampling happens before cross-validation, so copies of the
    same row can end up in both the training and the validation part of a fold;
    the cross-validation scores of oversampled classes are therefore optimistic.

    Parameters
    ----------
    X_train : array of shape (n_samples, n_features)
        Training features.
    y_train : array of shape (n_samples,)
        Training labels.
    n_splits : int, default=5
        Number of cross-validation folds and minimum number of samples per class.
    random_state : int, default=42
        Seed for the oversampling draw and for the SVC probability calibration.

    Returns
    -------
    The ``best_estimator_`` of the fitted GridSearchCV (refit on all of the,
    possibly oversampled, training data).
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    counter = Counter(y_train.tolist())
    print("Number of samples in each class before oversampling:", counter)

    # Collect the extra rows per class and stack once at the end
    X_parts, y_parts = [X_train], [y_train]
    for cls, count in counter.items():
        if count >= n_splits:
            continue
        class_idx = np.flatnonzero(y_train == cls)
        X_extra, y_extra = resample(
            X_train[class_idx], y_train[class_idx],
            replace=True,
            n_samples=n_splits - count,  # exactly enough to reach the minimum
            random_state=random_state,
        )
        X_parts.append(X_extra)
        y_parts.append(y_extra)

    if len(X_parts) > 1:
        X_train = np.vstack(X_parts)
        y_train = np.hstack(y_parts)
        counter = Counter(y_train.tolist())
        print("Number of samples in each class after oversampling:", counter)

    print("All classes have at least %d samples now." % n_splits)
    # print some samples
    print("First 3 classes:", {k: counter[k] for k in list(counter)[:3]})

    # gamma is ignored by the linear kernel, so it is only searched for rbf;
    # a single combined grid would refit every linear candidate three times
    shared_grid = {
        'C': [0.1, 1],
        'class_weight': ['balanced', None],
    }
    param_grid_svm = [
        {**shared_grid, 'kernel': ['linear']},
        {**shared_grid, 'kernel': ['rbf'], 'gamma': [0.001, 0.01, 0.1]},
    ]

    # Use StratifiedKFold for cross-validation
    cv = StratifiedKFold(n_splits=n_splits)

    svmModel_grid = GridSearchCV(
        # probability=True is required by SelfTrainingClassifier (predict_proba);
        # its internal calibration is random, hence the fixed seed
        estimator=SVC(probability=True, max_iter=5000, random_state=random_state),
        param_grid=param_grid_svm,
        verbose=2,
        cv=cv,
        n_jobs=-1) # n_jobs=cores used, -1 = all cores

    svmModel_grid.fit(X_train, y_train)
    best_model = svmModel_grid.best_estimator_
    return best_model

In [5]:
# Predict and evaluate the models
def predict_and_evaluate(model, X_test, y_test):
    """Predict on the test set, print the evaluation reports and return the predictions.

    Prints the classification report, the confusion matrix and the accuracy.
    Both reports are restricted to the labels that occur in ``y_test``.
    """
    predictions = model.predict(X_test)

    # Only the classes present in the test labels are reported
    report_options = {"labels": np.unique(y_test)}

    print(classification_report(y_test, predictions, zero_division=0, **report_options))
    print(confusion_matrix(y_test, predictions, **report_options))

    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy: {accuracy:.2f}')

    return predictions

### Split the dataset
    train_test_split()

> Split **the dataset** into training and testing sets (20% for testing and 80% for training); 
> Choose random_state as a fixed seed value to ensure that the random process (splitting the data) produces the same result on every run; 
> Stratify the split to ensure that the same proportion of classes is present in both sets


In [None]:
# Hold out 20% of the samples for testing; the split is seeded and stratified on the labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

n_train_samples, n_test_samples = X_train.shape[0], X_test.shape[0]
print("Number of training samples: %d" % n_train_samples)
print("Number of testing samples: %d" % n_test_samples)

## Supervised Learning Method - Support Vector Machine (SVM)

### Scaling the data
    StandardScaler()

> Standardize the features by removing the mean and scaling to unit variance. The scaler is fitted on the training set only (after the split) and then applied to both the training and the testing set


In [None]:
# Fit the scaler on the training split only, then apply the same
# zero-mean / unit-variance transform to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

n_train_samples, n_test_samples = X_train.shape[0], X_test.shape[0]
print("Number of training samples: %d" % n_train_samples)
print("Number of testing samples: %d" % n_test_samples)

### Applying PCA for dimensionality reduction

In [None]:
# Fit PCA on the standardized training split and project both splits onto it;
# apply_pca also returns the eigenfaces (principal components reshaped to image size)
X_train_pca, X_test_pca, eigenfaces = apply_pca(X_train, X_test)

### Train and evaluate the SVM Model for original and PCA-reduced data

In [None]:
# Grid-search, fit and evaluate an SVM on the standardized pixel features
print("SVM on Original Data")
svm = findTheBestSVMModel(X_train, y_train)
y_pred_original = predict_and_evaluate(svm, X_test, y_test)

# Repeat with a separately tuned SVM on the PCA-projected features,
# so the two feature representations can be compared
print("SVM on PCA Data")
svm_pca = findTheBestSVMModel(X_train_pca, y_train)
y_pred_pca = predict_and_evaluate(svm_pca, X_test_pca, y_test)

## Semi-supervised Learning Method - SelfTrainingClassifier with an SVM base estimator

### Splitting and Scaling for the Semi-Supervised SVM (SelfTrainingClassifier)
1. First train_test_split: 

> Split **the training set** into labeled (40%) and unlabeled (60%) sets 

2. Combine the training data

> Combine labeled and unlabeled **training data** for scaling and PCA.
> This step is necessary because semi-supervised learning requires access to both labeled and unlabeled data

3. StandardScaler():

> Standardize the features by removing the mean and scaling to unit variance; the scaler is fitted on the combined training data and then applied to the test data



In [None]:
# Split the training set into labeled (40%) and unlabeled (60%) sets.
# The split is stratified so the labeled subset keeps the class proportions of the
# training set; an unstratified draw can leave a person with very few or no labeled images.
X_train_labeled, X_train_unlabeled, y_train_labeled, y_train_unlabeled = train_test_split(
    X_train, y_train, test_size=0.6, random_state=42, stratify=y_train)
print("Number of labeled training samples: %d" % X_train_labeled.shape[0])
print("Number of unlabeled training samples: %d" % X_train_unlabeled.shape[0])

# Combine labeled and unlabeled training data (labeled rows first, unlabeled rows after)
X_train_combined = np.vstack((X_train_labeled, X_train_unlabeled))
# SelfTrainingClassifier expects -1 as the label of unlabeled samples
y_train_combined = np.concatenate((
    y_train_labeled,
    np.full(len(y_train_unlabeled), -1, dtype=y_train_labeled.dtype),
))

# Scale the combined training data; the test data is transformed with the same scaler
scaler = StandardScaler()
X_train_combined_scaled = scaler.fit_transform(X_train_combined)
X_test_scaled = scaler.transform(X_test)

print("Number of combined training samples: %d" % X_train_combined_scaled.shape[0])
print("Number of testing samples: %d" % X_test_scaled.shape[0])

### Applying PCA for dimensionality reduction

In [None]:
# Fit PCA on the combined (labeled + unlabeled) scaled training data and project the scaled test data.
# NOTE: this reassigns X_test_pca and eigenfaces, which the supervised section also defines,
# so re-running the supervised PCA evaluation after this cell would pick up these new values.
X_train_combined_pca, X_test_pca, eigenfaces = apply_pca(X_train_combined_scaled, X_test_scaled)

### Training the semi-supervised model using SelfTrainingClassifier with an SVM base estimator

1. Original Data

In [None]:
# Semi-supervised training with SelfTrainingClassifier (SVM base estimator)
# on the scaled, non-PCA combined training data

# trick for the oversampling
y_train_labeled = y_train_labeled.astype(int)

# Tune the SVM on the labeled subset only; it becomes the base learner for self-training
base_estimator = findTheBestSVMModel(X_train_labeled, y_train_labeled)

# Show the hyper-parameters selected by findTheBestSVMModel
print(base_estimator)

# Pseudo-label unlabeled samples whose predicted probability reaches the threshold
self_training_model_original = SelfTrainingClassifier(
    estimator=base_estimator, criterion='threshold', threshold=0.95, max_iter=20, verbose=True
)

self_training_model_original_fit = self_training_model_original.fit(
    X_train_combined_scaled, y_train_combined
)

# Predict the test labels and print the evaluation reports
print("Self-Training Model on Original Data")
y_pred_self_training_original_final = predict_and_evaluate(
    self_training_model_original_fit, X_test_scaled, y_test
)

2. PCA-reduced data

In [10]:
# Train the semi-supervised model
# On the PCA-reduced combined training data

# The SVM hyper-parameters are tuned on the PCA features themselves (as in the supervised
# section) instead of reusing the model tuned on the scaled pixel features, whose
# dimensionality and scale differ from the whitened PCA projection.
labeled_mask = y_train_combined != -1  # -1 marks the unlabeled rows
base_estimator_pca = findTheBestSVMModel(
    X_train_combined_pca[labeled_mask],
    y_train_combined[labeled_mask].astype(int),
)
print(base_estimator_pca)

self_training_model_pca = SelfTrainingClassifier(base_estimator_pca, criterion='threshold', threshold=0.95, max_iter=20, verbose=True)
self_training_model_pca.fit(X_train_combined_pca, y_train_combined)

In [None]:
# Evaluate the self-training model trained on PCA features against the PCA-projected test set
print("Self-Training Model on PCA Data")
y_pred_self_training_pca_final = predict_and_evaluate(self_training_model_pca, X_test_pca, y_test)

## Results

In [None]:
# Plot some of the test results
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Draw a grid of grayscale images with one title per image.

    Parameters
    ----------
    images : sequence of flat arrays
        Each entry is reshaped to ``(h, w)`` before being shown.
    titles : sequence of str
        Title for the image at the same position.
    h, w : int
        Image height and width in pixels.
    n_row, n_col : int
        Grid size; at most ``n_row * n_col`` images are drawn, fewer when
        fewer images or titles are supplied.
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Never index past the data when there are fewer images/titles than grid cells
    n_plots = min(n_row * n_col, len(images), len(titles))
    for i in range(n_plots):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        # Hide the axis ticks; they carry no information for images
        plt.xticks(())
        plt.yticks(())

def title(y_pred, y_test, target_names, i):
    """Build the two-line plot title for sample ``i``.

    The predicted and the true label of the sample are looked up in
    ``target_names``; only the part after the last space of each name is shown.
    """
    pred_name, true_name = (
        target_names[labels[i]].rsplit(' ', 1)[-1]
        for labels in (y_pred, y_test)
    )
    return f'predicted: {pred_name}\ntrue:      {true_name}'

# `target_names` is not defined anywhere earlier in this notebook (the labels in `y` are
# integer person IDs), so build display names indexed by label. title() keeps only the
# part after the last space, i.e. the person ID.
target_names = [f"person {label}" for label in range(int(np.max(y)) + 1)]

prediction_titles_original = [title(y_pred_original, y_test, target_names, i) for i in range(y_pred_original.shape[0])]
prediction_titles_pca = [title(y_pred_pca, y_test, target_names, i) for i in range(y_pred_pca.shape[0])]

# NOTE: X_test was overwritten with its standardized version in the scaling cell, so the
# galleries show standardized pixel values rather than the raw images.
# First gallery: predictions of the SVM trained on the scaled pixel features
plot_gallery(X_test, prediction_titles_original, h, w)
plt.show()

# Second gallery: predictions of the SVM trained on the PCA features
plot_gallery(X_test, prediction_titles_pca, h, w)
plt.show()