# Image Classification: CG vs. Camera in Google Colab**

##This notebook implements the image classification task from the paper "Forensic Techniques for Classifying Scanner, Computer Generated and Digital Camera Images". The code uses all three RGB channels (45 features) to classify images as Computer Generated (CG) or Camera-captured, trains an SVM model, and saves the model, feature vectors, and confusion matrix.

In [11]:
!pip install opencv-python-headless scikit-image scikit-learn matplotlib Pillow seaborn --quiet

In [22]:
!pip install PyWavelets
!pip install pdf2image
!apt-get update
!apt-get install -y poppler-utils

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

# Installing Necessary libraries

In [19]:
import os
import numpy as np
import cv2
from PIL import Image
from pdf2image import convert_from_path
from skimage.restoration import denoise_wavelet
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import pandas as pd
import joblib

# Creating output directories

In [None]:
output_base = "/content/drive/MyDrive/DIP_Output"
cropped_dir = os.path.join(output_base, "cropped_images")
results_dir = os.path.join(output_base, "results")
os.makedirs(cropped_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

#Function to Resize/crop image

In [14]:
def center_crop_image(filepath, output_path=None):
    img = Image.open(filepath).convert("RGB")
    width, height = img.size
    target_width, target_height = 1024, 768
    if width < target_width or height < target_height:
        img = img.resize((max(width, target_width), max(height, target_height)))
        width, height = img.size
    left = (width - target_width) // 2
    top = (height - target_height) // 2
    right = left + target_width
    bottom = top + target_height
    img_cropped = img.crop((left, top, right, bottom))
    if output_path:
        img_cropped.save(output_path)
    return np.array(img_cropped)

#Function to extract all 15 features in all three channels

In [15]:
def extract_features(img):
    # Ensure image is RGB
    if len(img.shape) != 3:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)

    # Extract features from all three channels (R, G, B)
    all_features = []
    for channel in range(3):  # 0: Red, 1: Green, 2: Blue
        img_channel = img[:, :, channel]

        # Wavelet denoising and noise extraction
        denoised = denoise_wavelet(img_channel, channel_axis=None, rescale_sigma=True)
        noise = img_channel.astype(np.float32) - denoised.astype(np.float32)
        M, N = noise.shape

        # Row and column averages
        r_avg = np.mean(noise, axis=0)  # 1 x N
        c_avg = np.mean(noise, axis=1)  # M x 1

        # Normalized correlation
        def normalized_corr(a, b):
            if np.std(a) == 0 or np.std(b) == 0:
                return 0
            return np.corrcoef(a, b)[0, 1]

        rho_row = [normalized_corr(r_avg, noise[i, :]) for i in range(M)]
        rho_col = [normalized_corr(c_avg, noise[:, j]) for j in range(N)]

        # 15 features per channel
        features = [
            np.mean(rho_row), np.std(rho_row), scipy.stats.skew(rho_row), scipy.stats.kurtosis(rho_row),
            np.mean(rho_col), np.std(rho_col), scipy.stats.skew(rho_col), scipy.stats.kurtosis(rho_col),
            np.std(r_avg), scipy.stats.skew(r_avg), scipy.stats.kurtosis(r_avg),
            np.std(c_avg), scipy.stats.skew(c_avg), scipy.stats.kurtosis(c_avg),
            (1 - np.mean(rho_col) / np.mean(rho_row)) * 100 if np.mean(rho_row) != 0 else 0
        ]
        all_features.extend(features)

    return np.array(all_features)

#Loading Dataset

In [None]:
def load_dataset(cg_folder, camera_folder):
    X, y, filenames = [], [], []
    for idx, file in enumerate(os.listdir(cg_folder)):
        try:
            img_path = os.path.join(cg_folder, file)
            output_path = os.path.join(cropped_dir, f"cg_image_{idx}_{file}")
            img = center_crop_image(img_path, output_path=output_path)
            X.append(extract_features(img))
            y.append(0)  # CG
            filenames.append(file)
        except Exception as e:
            print(f"Error processing CG image {file}: {e}")
            continue
    for idx, file in enumerate(os.listdir(camera_folder)):
        try:
            img_path = os.path.join(camera_folder, file)
            output_path = os.path.join(cropped_dir, f"camera_image_{idx}_{file}")
            img = center_crop_image(img_path, output_path=output_path)
            X.append(extract_features(img))
            y.append(1)  # Camera
            filenames.append(file)
        except Exception as e:
            print(f"Error processing Camera image {file}: {e}")
            continue
    return np.array(X), np.array(y), filenames

In [None]:
# Paths to datasets
cg_folder = "/content/drive/MyDrive/DIP_CG"
camera_folder = "/content/drive/MyDrive/DIP_DigitalCamera"# Load dataset
X, y, filenames = load_dataset(cg_folder, camera_folder)

# Check class balance
print(f"CG images: {sum(y == 0)}, Camera images: {sum(y == 1)}")

# Save feature vectors
feature_names = []
channels = ["red", "green", "blue"]
for channel in channels:
    feature_names.extend([
        f"{channel}_rho_row_mean", f"{channel}_rho_row_std", f"{channel}_rho_row_skew", f"{channel}_rho_row_kurtosis",
        f"{channel}_rho_col_mean", f"{channel}_rho_col_std", f"{channel}_rho_col_skew", f"{channel}_rho_col_kurtosis",
        f"{channel}_r_avg_std", f"{channel}_r_avg_skew", f"{channel}_r_avg_kurtosis",
        f"{channel}_c_avg_std", f"{channel}_c_avg_skew", f"{channel}_c_avg_kurtosis",
        f"{channel}_ratio_feature"
    ])
features_df = pd.DataFrame(X, columns=feature_names)
features_df["label"] = y
features_df["filename"] = filenames
features_df.to_csv(os.path.join(results_dir, "feature_vectors.csv"), index=False)
np.savez(os.path.join(results_dir, "feature_vectors.npz"), X=X, y=y, filenames=filenames)

  c /= stddev[:, None]
  c /= stddev[None, :]


Error processing CG image Copy of chevy1.jpg: Truncated File Read
Error processing CG image Copy of PM_GAME.jpg: cannot identify image file '/content/drive/MyDrive/DIP_CG/Copy of PM_GAME.jpg'
Error processing CG image Copy of jmlydf1.jpg: image file is truncated (22 bytes not processed)
Error processing CG image Copy of noc1900.jpg: image file is truncated (2 bytes not processed)
Error processing CG image Copy of chevy.jpg: Truncated File Read
CG images: 246, Camera images: 251


#Preprocessing and SVM trainig

In [None]:
# Preprocessing
# Handle NaNs with mean imputation
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVM with grid search for RBF kernel
param_grid = {
    'C': [1, 10, 100],
    'gamma': [0.01, 0.001, 0.0001],
    'kernel': ['rbf']
}
grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Evaluate
clf = grid.best_estimator_
y_pred = clf.predict(X_test)

# Save the trained SVM model
model_path = os.path.join(results_dir, "svm_model.joblib")
joblib.dump(clf, model_path)
print(f"SVM model saved to {model_path}")

print("Best SVM parameters:", grid.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["CG", "Camera"], yticklabels=["CG", "Camera"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("CG vs Camera SVM Classification")
plt.savefig(os.path.join(results_dir, "confusion_matrix.png"), dpi=300, bbox_inches="tight")
plt.close()

SVM model saved to /content/drive/MyDrive/DIP_Output/results/svm_model.joblib
Best SVM parameters: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.82      0.81        49
           1       0.82      0.80      0.81        51

    accuracy                           0.81       100
   macro avg       0.81      0.81      0.81       100
weighted avg       0.81      0.81      0.81       100



In [24]:
def process_pdf_for_features(pdf_path):

    images = convert_from_path(pdf_path, first_page=1, last_page=1)

    if not images:
        raise ValueError("No images extracted from the PDF.")

    # Convert PIL image to numpy array (RGB format)
    img_pil = images[0]
    img = np.array(img_pil.convert('RGB'))

    # Ensure image is in correct format (RGB)
    if len(img.shape) != 3 or img.shape[2] != 3:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB if len(img.shape) == 2 else cv2.COLOR_RGBA2RGB)

    # Call the original extract_features function
    return extract_features(img)

features = process_pdf_for_features("/content/Xerox_Versalink.PDF")
print(f"Extracted features: {features}")

Extracted features: [-4.63989347e-02  2.65201083e-01  9.10990467e-02 -1.23476955e+00
  3.72590373e-01  2.79367381e-01 -1.40845123e+00  5.93305253e-01
  6.07044554e+00 -1.25209272e-01 -2.37691879e-01  1.98169575e+01
 -3.52196598e+00  1.64734478e+01  9.03014931e+02 -4.63989347e-02
  2.65201083e-01  9.10990467e-02 -1.23476955e+00  3.72590373e-01
  2.79367381e-01 -1.40845123e+00  5.93305253e-01  6.07044554e+00
 -1.25209272e-01 -2.37691879e-01  1.98169575e+01 -3.52196598e+00
  1.64734478e+01  9.03014931e+02 -4.63989347e-02  2.65201083e-01
  9.10990467e-02 -1.23476955e+00  3.72590373e-01  2.79367381e-01
 -1.40845123e+00  5.93305253e-01  6.07044554e+00 -1.25209272e-01
 -2.37691879e-01  1.98169575e+01 -3.52196598e+00  1.64734478e+01
  9.03014931e+02]
