<a href="https://colab.research.google.com/github/danjethh/steg_analysis/blob/main/steg_analysis_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run the script. It will:
Load and preprocess the dataset.
Train the Random Forest Classifier.
Evaluate the model on the test set.
Prompt you to enter the path to an image for testing.
When prompted, enter the path to the image you want to test. Ensure the image is 512x512 pixels.

Workflow Summary

**Step 1:**
1. Load the Dataset
2. Load the clean and stego datasets.
3. Combine them into a single DataFrame.
4. Add labels to distinguish between clean and stego images.

 **Step 2:**
1. Preprocess the Data
2. Remove rows with NaN values caused by overly uniform images.
3. Remove outliers using the IQR rule (note: this step is not implemented in the code below).
4. Normalize the features using StandardScaler.
5. Reduce dimensionality using PCA (the code below keeps the top 10 principal components rather than selecting components by a 99% variance threshold).

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import cv2
import sys

In [None]:
# Function to load the dataset
def load_data():
    """
    Download the two feature tables, label them, merge them and subsample.

    The cover-image features are tagged with label 0 and the stego-image
    features with label 1. Both tables are stacked row-wise and a
    reproducible random half (random_state=42) of the rows is returned.
    Progress and previews are printed along the way.
    """
    # One entry per source table: (banner, capitalized name, lower-case name, URL)
    sources = (
        (
            "Loading clean dataset...",
            "Clean",
            "clean",
            "https://raw.githubusercontent.com/Sourish1997/steganalysis/master/Datasets/steg_features.csv",
        ),
        (
            "\nLoading stego dataset...",
            "Stego",
            "stego",
            "https://raw.githubusercontent.com/Sourish1997/steganalysis/master/Datasets/steg_lsb_features.csv",
        ),
    )

    # Fetch each CSV (no header row in the files) and preview it
    frames = []
    for banner, title, name, url in sources:
        print(banner)
        frame = pd.read_csv(url, header=None)
        print(f"{title} dataset shape: {frame.shape}")
        print(f"First few rows of {name} dataset:")
        print(frame.head())
        frames.append(frame)

    # Tag rows with their class: 0 = clean (cover), 1 = stego
    for class_id, frame in enumerate(frames):
        frame['label'] = class_id

    # Stack both tables into a single DataFrame
    print("\nCombining datasets...")
    combined = pd.concat(frames, axis=0)
    print(f"Combined dataset shape: {combined.shape}")
    print("First few rows of combined dataset:")
    print(combined.head())

    # Keep a random half of the rows for demonstrative purposes
    print("\nSampling 50% of the dataset...")
    half = combined.sample(frac=0.5, random_state=42)
    print(f"Sampled dataset shape: {half.shape}")
    print("First few rows of sampled dataset:")
    print(half.head())

    return half


In [None]:
# Function to preprocess the data
def preprocess_data(data):
    """
    Clean, standardize and compress the feature table ahead of training.

    Steps, in order:
    1. Split the DataFrame into a feature matrix and the 'label' vector.
    2. Drop every row that contains at least one NaN feature.
    3. Standardize the features with a newly fitted StandardScaler.
    4. Project the features onto the top 10 principal components with a
       newly fitted PCA.

    Returns a tuple (X, y, scaler, pca): the transformed features, the
    matching labels, and the fitted scaler and PCA objects so the same
    transformations can be applied to new samples.
    """
    # Pull the labels out and keep everything else as the feature matrix
    labels = data['label'].values
    feature_matrix = data.drop(columns=['label']).values

    # Discard rows in which any feature is NaN
    print("\nRemoving rows with NaN values...")
    has_nan = np.isnan(feature_matrix).any(axis=1)
    feature_matrix, labels = feature_matrix[~has_nan], labels[~has_nan]
    print(f"Dataset shape after removing NaNs: {feature_matrix.shape}")
    print("First few rows of X after removing NaNs:")
    print(feature_matrix[:5])

    # Standardize the features
    print("\nNormalizing features using StandardScaler...")
    scaler = StandardScaler()
    scaled = scaler.fit_transform(feature_matrix)
    print("First few rows of normalized X:")
    print(scaled[:5])

    # Keep only the 10 leading principal components
    print("\nPerforming PCA to reduce dimensionality...")
    pca = PCA(n_components=10)
    reduced = pca.fit_transform(scaled)
    variance_ratios = pca.explained_variance_ratio_
    print(f"Explained variance ratio by the first 10 components: {variance_ratios}")
    print("First few rows of X after PCA:")
    print(reduced[:5])

    return reduced, labels, scaler, pca


In [None]:
# Function to train the classifier
def train_classifier(X_train, y_train):
    """
    Fit a Random Forest Classifier to the given training features and labels.

    Prints a progress message, builds the forest with fixed hyperparameters
    and returns the fitted classifier.
    """
    print("\nTraining Random Forest Classifier...")

    forest_settings = {
        "n_estimators": 100,  # trees in the forest
        "max_depth": 10,      # depth cap for every tree
        "random_state": 42,   # fixed seed for reproducibility
        "n_jobs": -1,         # use every available CPU core
    }
    forest = RandomForestClassifier(**forest_settings)
    forest.fit(X_train, y_train)
    return forest

# Function to extract CF features from an image
def extract_cf_features(image_path, scaler, pca):
    """
    Build the preprocessed feature vector for a single image.

    The image is loaded as grayscale and validated, then a feature vector is
    passed through the already-fitted scaler and PCA.

    WARNING: the actual CF feature extraction is still a placeholder. The
    feature vector is filled with random numbers, so the returned values
    (and any prediction made from them) do not depend on the image content.

    Parameters:
        image_path: path to the image file to load.
        scaler: fitted scaler exposing transform() (the one returned by
            preprocess_data).
        pca: fitted PCA exposing transform() (the one returned by
            preprocess_data).

    Returns:
        A 2-D array with a single row: the scaled, PCA-projected features.

    Raises:
        ValueError: if the image cannot be read, or is not 512x512 pixels.
    """
    # Load the image and convert to grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # cv2.imread signals failure (missing file, unsupported format) by
    # returning None instead of raising, so check before touching .shape.
    if image is None:
        raise ValueError(
            f"Could not read image {image_path!r}: "
            "file is missing or not a supported image format."
        )
    if image.shape != (512, 512):
        raise ValueError("Image must be 512x512 pixels.")

    # Placeholder for feature extraction logic:
    # simulate feature extraction by generating random features.
    # The vector length follows the fitted scaler so it always matches the
    # training data; 41 is the fallback when the scaler does not expose it.
    n_features = getattr(scaler, "n_features_in_", 41)
    features = np.random.rand(n_features)  # Simulated CF features

    # Normalize the features using the pre-trained scaler
    features = scaler.transform(features.reshape(1, -1))

    # Apply PCA using the pre-trained PCA model
    features = pca.transform(features)

    return features


In [None]:
# Main function to train and test the model
def main():
    """
    Orchestrate the full workflow:
    1. Load and preprocess the dataset.
    2. Split it into training and testing sets (80/20, random_state=42).
    3. Train the classifier.
    4. Evaluate the classifier on the test set (accuracy and F1 score).
    5. Optionally test the classifier on a user-provided image; entering a
       blank path (or having no interactive stdin) skips this step.

    Nothing is returned; all results are printed.
    """
    # Step 1: Load and preprocess the dataset
    data = load_data()
    X, y, scaler, pca = preprocess_data(data)

    # Step 2: Split the dataset into training and testing sets
    # NOTE(review): the scaler and PCA were fitted on the full dataset inside
    # preprocess_data before this split, so the test rows influenced the
    # transformations and the metrics below may be optimistic.
    print("\nSplitting dataset into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 3: Train the classifier
    clf = train_classifier(X_train, y_train)

    # Step 4: Evaluate the classifier on the test set
    print("\nEvaluating classifier on the test set...")
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test F1 Score: {f1:.4f}")

    # Step 5: Test the classifier on a user-provided image (optional)
    try:
        image_path = input("\nEnter the path to the image you want to test: ").strip()
    except EOFError:
        # No interactive stdin available (e.g. batch execution)
        image_path = ""

    if not image_path:
        # A blank answer means the user does not want to test an image
        print("\nNo image path provided; skipping single-image test.")
        return

    try:
        # Extract features from the image
        features = extract_cf_features(image_path, scaler, pca)

        # Predict whether the image contains LSB matching steganography
        prediction = clf.predict(features)
        result = "Steg Image (LSB Matching Detected)" if prediction[0] == 1 else "Cover Image (No LSB Matching)"
        print(f"\nPrediction: {result}")
    except Exception as e:
        # Top-level boundary: report the problem instead of crashing the tool
        print(f"Error processing image: {e}")

# Entry point: announce the tool and run the full workflow only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    print("Running LSB Matching Detection Tool...")
    main()