In [None]:
import kagglehub

# Fetch the most recent published version of the dataset via kagglehub and
# report the path it returns.
path = kagglehub.dataset_download(
    "xhlulu/140k-real-and-fake-faces",
)

print("Path to dataset files:", path)

In [None]:
# @title Default title text
import os
import cv2
import numpy as np
import joblib
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from skimage.feature import local_binary_pattern
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, InputLayer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Configuration
img_size = (64, 64)   # target size handed to cv2.resize for every image
pca_components = 100  # n_components used when fitting PCA
dataset_path = "/kaggle/input/140k-real-and-fake-faces/real_vs_fake/real-vs-fake/train/"
# One sub-folder per class inside the training split.
real_faces_dir, fake_faces_dir = (
    os.path.join(dataset_path, class_name) for class_name in ("real", "fake")
)

# Step 1: Load Images
def load_images(folder, label, flatten=True):
    """Read every decodable image in *folder* as a resized grayscale array.

    Args:
        folder: Directory whose entries are handed to ``cv2.imread``.
        label: Class label recorded once per successfully loaded image.
        flatten: When true each image is stored as a 1-D vector, otherwise
            as the 2-D array produced by ``cv2.resize``.

    Returns:
        A ``(data, labels)`` pair of equally long lists.

    Entries for which ``cv2.imread`` returns ``None`` are skipped silently.
    Images are resized to the module-level ``img_size``.
    """
    data, labels = [], []
    # os.listdir yields entries in arbitrary order. Sorting makes the sample
    # order (and therefore any seeded split built on it) reproducible, and
    # keeps separate calls on the same folder aligned row for row.
    for filename in sorted(os.listdir(folder)):
        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            continue
        img = cv2.resize(img, img_size)
        data.append(img.flatten() if flatten else img)
        labels.append(label)
    return data, labels

# Decode each folder exactly once. The flattened design matrix is derived from
# the 2-D images below, so row i of X_flat, X_img and y always describes the
# same file and the dataset is not read from disk a second time.
X_real_img, y_real = load_images(real_faces_dir, 0, flatten=False)
X_fake_img, y_fake = load_images(fake_faces_dir, 1, flatten=False)

X_img = np.array(X_real_img + X_fake_img)
y = np.array(y_real + y_fake)
# Row-major reshape gives the same vectors as calling img.flatten() per image.
X_flat = X_img.reshape(len(X_img), -1)

# Split
# All three arrays are split together so the flat and 2-D views stay aligned.
X_train_flat, X_test_flat, y_train, y_test, X_train_img, X_test_img = train_test_split(
    X_flat, y, X_img, test_size=0.2, random_state=42
)

# Step 2: PCA & LDA
# PCA is seeded like the train/test split: scikit-learn may select its
# randomized SVD solver for data of this size, and without a seed the fitted
# components (and everything trained on top of them) would differ between runs.
pca = PCA(n_components=pca_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_flat)
X_test_pca = pca.transform(X_test_flat)

# Debug: Save & load PCA and compare outputs
# The round-trip through joblib should reproduce the in-memory transform.
joblib.dump(pca, 'pca_model.pkl')
pca_loaded = joblib.load('pca_model.pkl')
X_test_pca_original = X_test_pca  # already computed above; no need to transform again
X_test_pca_loaded = pca_loaded.transform(X_test_flat)
print("Max difference in PCA outputs:", np.abs(X_test_pca_original - X_test_pca_loaded).max())

# LDA projects the PCA scores onto a single discriminant axis.
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train_pca, y_train)
X_test_lda = lda.transform(X_test_pca)

# Debug: Save & load LDA and compare outputs
joblib.dump(lda, 'lda_model.pkl')
lda_loaded = joblib.load('lda_model.pkl')
X_test_lda_original = X_test_lda  # already computed above; no need to transform again
X_test_lda_loaded = lda_loaded.transform(X_test_pca)
print("Max difference in LDA outputs:", np.abs(X_test_lda_original - X_test_lda_loaded).max())

# Step 3: LBPH Feature Extraction
def extract_lbph_features(img, P=8, R=1, grid_x=8, grid_y=8):
    """Return concatenated per-cell histograms of uniform LBP codes.

    The image is split into ``grid_y`` x ``grid_x`` cells; each cell
    contributes one normalised histogram of its LBP codes.

    Args:
        img: 2-D grayscale image.
        P: Number of neighbour points passed to ``local_binary_pattern``.
        R: Radius passed to ``local_binary_pattern``.
        grid_x: Number of cells along the image width.
        grid_y: Number of cells along the image height.

    Returns:
        1-D float array of length ``grid_x * grid_y * (P + 2)``.
    """
    lbp = local_binary_pattern(img, P, R, method='uniform')
    # Uniform LBP produces codes 0..P+1, i.e. P + 2 distinct values. Deriving
    # the bin count from lbp.max() made the feature length depend on image
    # content (a flat image tops out at code P), which breaks stacking the
    # vectors and feeding them to a fixed-size model.
    n_bins = P + 2
    h, w = img.shape
    cell_h, cell_w = h // grid_y, w // grid_x
    features = []
    for i in range(grid_y):
        for j in range(grid_x):
            cell = lbp[i*cell_h:(i+1)*cell_h, j*cell_w:(j+1)*cell_w]
            hist, _ = np.histogram(cell.ravel(), bins=n_bins, range=(0, n_bins))
            hist = hist.astype("float")
            # Epsilon keeps the division defined for an empty cell.
            hist /= (hist.sum() + 1e-6)
            features.extend(hist)
    return np.array(features)

# One LBPH descriptor per image, stacked into a 2-D feature matrix.
X_train_lbph = np.array(list(map(extract_lbph_features, X_train_img)))
X_test_lbph = np.array(list(map(extract_lbph_features, X_test_img)))

# Step 4: Feature Fusion
# The LDA projection is appended as extra column(s) after the LBPH histogram.
X_train_fused = np.concatenate([X_train_lbph, X_train_lda], axis=1)
X_test_fused = np.concatenate([X_test_lbph, X_test_lda], axis=1)

# Reshape for CNN
# Conv1D expects a trailing channel axis: (samples, features, 1).
X_train_cnn = X_train_fused[:, :, np.newaxis]
X_test_cnn = X_test_fused[:, :, np.newaxis]

# Step 5: CNN Model
# 1-D CNN over the fused feature vector; a single sigmoid unit gives the
# probability of label 1.
model = Sequential([
    InputLayer(input_shape=(X_train_fused.shape[1], 1)),
    Conv1D(32, 3, activation='relu', padding='same'),
    MaxPooling1D(2),
    Conv1D(64, 3, activation='relu', padding='same'),
    MaxPooling1D(2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Early stopping (with restore_best_weights) selects the final weights from
# the validation loss. Validating on the test set would therefore leak it into
# model selection and inflate the accuracy reported below, so a slice of the
# training data is held out instead. Keras takes the last 10% of the arrays;
# train_test_split has already shuffled them.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(
    X_train_cnn,
    y_train,
    validation_split=0.1,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
)
model.save("fused_cnn_model.h5")

# Evaluation
# The test set is touched only here, after training has finished.
test_loss, test_acc = model.evaluate(X_test_cnn, y_test)
print("Test Accuracy:", test_acc)

# Step 6: Prediction Function for Individual Images
import cv2
import numpy as np
import joblib
from skimage.feature import local_binary_pattern
from tensorflow.keras.models import load_model

# Inference must resize to the same size the training code used: predict_image
# flattens the resized image and feeds it to the saved PCA model.
img_size = (64, 64)

def extract_lbph_features(img, P=8, R=1, grid_x=8, grid_y=8):
    """
    Extracts Local Binary Pattern Histogram features from a given image.

    The image is divided into ``grid_y`` x ``grid_x`` cells and the normalised
    histogram of uniform LBP codes of every cell is concatenated.

    Args:
        img: 2-D grayscale image.
        P: Number of neighbour points passed to ``local_binary_pattern``.
        R: Radius passed to ``local_binary_pattern``.
        grid_x: Number of cells along the image width.
        grid_y: Number of cells along the image height.

    Returns:
        1-D float array of length ``grid_x * grid_y * (P + 2)``.
    """
    lbp = local_binary_pattern(img, P, R, method='uniform')
    # Uniform LBP codes span 0..P+1, so the histogram always needs P + 2 bins.
    # Using lbp.max() + 1 here shrank the vector for images that lack the
    # highest code (e.g. a flat image), so its length no longer matched the
    # fixed input size of the trained model.
    n_bins = P + 2
    h, w = img.shape
    cell_h, cell_w = h // grid_y, w // grid_x
    features = []
    for i in range(grid_y):
        for j in range(grid_x):
            cell = lbp[i*cell_h:(i+1)*cell_h, j*cell_w:(j+1)*cell_w]
            hist, _ = np.histogram(cell.ravel(), bins=n_bins, range=(0, n_bins))
            hist = hist.astype("float")
            # Epsilon keeps the division defined for an empty cell.
            hist /= (hist.sum() + 1e-6)
            features.extend(hist)
    return np.array(features)

def predict_image(img_path):
    """Score one image file with the saved PCA, LDA and CNN models.

    The image is read as grayscale, resized to ``img_size``, turned into the
    fused LBPH + LDA feature vector and passed to the CNN loaded from
    ``fused_cnn_model.h5``.

    Args:
        img_path: Path of the image file to classify.

    Returns:
        The single sigmoid output of the CNN for this image (also printed).

    Raises:
        ValueError: If ``cv2.imread`` cannot read the file.
    """
    gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        raise ValueError("Image not found or unreadable.")
    gray = cv2.resize(gray, img_size)

    # Artefacts are read from the current working directory on every call.
    pca_model = joblib.load("pca_model.pkl")
    lda_model = joblib.load("lda_model.pkl")
    classifier = load_model("fused_cnn_model.h5")

    # Global descriptor: pixels -> PCA scores -> LDA projection, as one row.
    pixel_row = gray.flatten().reshape(1, -1)
    lda_row = lda_model.transform(pca_model.transform(pixel_row))
    if lda_row.ndim == 1:
        # Guard in case the transform hands back a 1-D vector.
        lda_row = lda_row.reshape(1, -1)

    # Local texture descriptor, shaped as a single row as well.
    texture_row = extract_lbph_features(gray).reshape(1, -1)

    # LBPH first, LDA last, then a trailing channel axis: (1, num_features, 1).
    fused_input = np.concatenate((texture_row, lda_row), axis=1)[..., np.newaxis]

    probability = classifier.predict(fused_input)[0][0]
    print("Prediction (probability):", probability)
    return probability

# Example usage:
# predict_image("path/to/your/image.jpg")
