<a href="https://colab.research.google.com/github/danjethh/steg_analysis/blob/main/steg_analysis_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Import necessary libraries

In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import cv2
import sys

# Step 2: Load the dataset

In [2]:
# Step 2: Load the dataset
# Default locations of the pre-computed feature tables (one row per image).
CLEAN_FEATURES_URL = "https://raw.githubusercontent.com/Sourish1997/steganalysis/master/Datasets/steg_features.csv"
STEGO_FEATURES_URL = "https://raw.githubusercontent.com/Sourish1997/steganalysis/master/Datasets/steg_lsb_features.csv"


def load_data(url_clean=CLEAN_FEATURES_URL, url_stego=STEGO_FEATURES_URL):
    """
    Load the cover and stego feature tables and combine them into one labelled DataFrame.

    Parameters
    ----------
    url_clean : str, path or file-like, optional
        Source of the clean (cover) image features. Anything accepted by
        ``pd.read_csv`` works; defaults to the published steg_features.csv.
    url_stego : str, path or file-like, optional
        Source of the stego image features; defaults to the published
        steg_lsb_features.csv.

    Returns
    -------
    pandas.DataFrame
        All clean rows followed by all stego rows, with an extra 'label'
        column: 0 for clean images, 1 for stego images. The row index is
        renumbered 0..n-1 so that row labels are unique.
    """
    # Load datasets using pandas (the files have no header row)
    print("Loading clean dataset...")
    data_clean = pd.read_csv(url_clean, header=None)  # Cover images (clean)
    print(f"Clean dataset shape: {data_clean.shape}")
    print("First few rows of clean dataset:")
    print(data_clean.head())  # Display first few rows of clean dataset

    print("\nLoading stego dataset...")
    data_stego = pd.read_csv(url_stego, header=None)  # Stego images (with LSB matching)
    print(f"Stego dataset shape: {data_stego.shape}")
    print("First few rows of stego dataset:")
    print(data_stego.head())  # Display first few rows of stego dataset

    # Add labels to distinguish between clean and stego images
    data_clean['label'] = 0  # Label '0' for clean images
    data_stego['label'] = 1  # Label '1' for stego images

    # Combine the two datasets into one DataFrame.
    # ignore_index=True avoids duplicated row labels: without it both halves
    # keep their own 0..n-1 index, which makes label-based lookups ambiguous.
    print("\nCombining datasets...")
    data = pd.concat([data_clean, data_stego], axis=0, ignore_index=True)
    print(f"Combined dataset shape: {data.shape}")
    print("First few rows of combined dataset:")
    print(data.head())  # Display first few rows of combined dataset

    return data  # Return the combined dataset

# Step 3: Preprocess the data

In [3]:
# Step 3: Preprocess the data
def preprocess_data(data, n_components=10, return_transformers=False):
    """
    Clean, standardize and reduce the feature table produced by load_data().

    Steps:
    1. Split the 'label' column off from the feature columns.
    2. Drop every row that contains a NaN feature (caused by overly uniform images).
    3. Standardize the features with StandardScaler.
    4. Project onto the top principal components with PCA.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table with a 'label' column; all other columns must be numeric.
    n_components : int, optional
        Number of principal components to keep (default 10).
    return_transformers : bool, optional
        When True, also return the fitted StandardScaler and PCA so the exact
        same transformation can be applied to new samples later. Defaults to
        False, which keeps the original two-value return.

    Returns
    -------
    (X, y) or (X, y, scaler, pca)
        X is the transformed feature matrix, y the matching label array.

    Notes
    -----
    The scaler and PCA are fitted on every row passed in. If the caller splits
    into train/test afterwards, test-set statistics have influenced the
    transformation; fit on the training rows only to avoid that.
    """
    # Separate features and labels. The explicit float conversion guarantees
    # np.isnan works (it fails on object arrays) and fails early with a clear
    # error if a feature column is not numeric.
    X = data.drop(columns=['label']).to_numpy(dtype=float)  # Features (all columns except 'label')
    y = data['label'].values  # Labels ('0' for clean, '1' for stego)

    # Remove rows with NaN values
    print("\nRemoving rows with NaN values...")
    nan_mask = ~np.isnan(X).any(axis=1)  # True for rows without any NaN
    X = X[nan_mask]  # Apply the mask to remove NaN rows
    y = y[nan_mask]  # Keep labels aligned with the remaining rows
    print(f"Dataset shape after removing NaNs: {X.shape}")
    print("First few rows of X after removing NaNs:")
    print(X[:5])

    # Normalize the features using StandardScaler
    print("\nNormalizing features using StandardScaler...")
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    print("First few rows of normalized X:")
    print(X[:5])

    # Perform PCA to reduce dimensionality
    print("\nPerforming PCA to reduce dimensionality...")
    pca = PCA(n_components=n_components)  # Retain the top principal components
    X = pca.fit_transform(X)
    print(f"Explained variance ratio by the first {n_components} components: {pca.explained_variance_ratio_}")
    print("First few rows of X after PCA:")
    print(X[:5])

    if return_transformers:
        # Hand back the fitted objects so new samples can be transformed identically
        return X, y, scaler, pca
    return X, y  # Return preprocessed features and labels

# Step 4: Train the classifier

In [4]:
# Step 4: Train the classifier
def train_classifier(X_train, y_train):
    """
    Fit a Random Forest on the supplied training split.

    X_train holds the feature rows and y_train the matching labels.
    The fitted RandomForestClassifier is returned.
    """
    print("\nTraining Random Forest Classifier...")

    # Hyper-parameters collected in one place for readability
    forest_params = {
        "n_estimators": 100,  # trees in the forest
        "max_depth": 10,      # depth cap per tree
        "random_state": 42,   # fixed seed for reproducible forests
        "n_jobs": -1,         # train on every available CPU core
    }
    model = RandomForestClassifier(**forest_params)
    model.fit(X_train, y_train)
    return model

# Step 5: Test the classifier on a new image

In [5]:
# Step 5: Test the classifier on a new image
def extract_features_from_image(image_path, scaler=None, pca=None):
    """
    Build the classifier input for a single image.

    NOTE: the actual CF feature extraction is still a placeholder. The feature
    vector is random, so predictions made from it are meaningless until real
    extraction code replaces the marked section below.

    Parameters
    ----------
    image_path : str
        Path to the image file; it is read as grayscale and must be 512x512.
    scaler : fitted StandardScaler, optional
        The scaler that was fitted on the training features.
    pca : fitted PCA, optional
        The PCA that was fitted on the scaled training features.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, n_components) ready for ``clf.predict``.

    Raises
    ------
    ValueError
        If the image cannot be read, is not 512x512, or the fitted scaler/PCA
        are not supplied.
    """
    # Load the image as grayscale; cv2.imread returns None instead of raising
    # when the file is missing or cannot be decoded.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        raise ValueError(f"Could not read image: {image_path!r}")
    if image.shape != (512, 512):
        raise ValueError("Image must be 512x512 pixels.")

    # A new sample must go through the transformation learned from the training
    # data. Fitting a scaler on one sample zeroes every feature, and an unfitted
    # PCA cannot transform at all, so the fitted objects are required.
    if scaler is None or pca is None:
        raise ValueError(
            "A fitted scaler and PCA (from training) are required to transform image features."
        )

    # Placeholder for feature extraction logic
    # Simulate feature extraction by generating random features
    n_features = getattr(scaler, "n_features_in_", 41)  # match the training feature count
    features = np.random.rand(n_features).reshape(1, -1)  # Simulated CF features

    # Apply the training-time normalization and PCA projection
    features = scaler.transform(features)
    features = pca.transform(features)

    return features

# Step 6: Main function to train and test the model

In [6]:
# Step 6: Main function to train and test the model
def main(image_path=None):
    """
    Orchestrate the full workflow:
    1. Load and preprocess the dataset.
    2. Split it into training and test sets and train the classifier.
    3. Report accuracy and F1 score on the test set.
    4. Optionally classify a single user-provided image.

    Parameters
    ----------
    image_path : str, optional
        Image to classify after training. When omitted, the first command-line
        argument is used if it looks like a path. Arguments starting with '-'
        are ignored: inside a Jupyter/Colab kernel sys.argv[1] is the kernel
        launcher's '-f' flag, not an image.
    """
    # Step 1: Load and preprocess the dataset
    data = load_data()
    X, y = preprocess_data(data)

    # Step 2: Split the dataset into training and testing sets
    print("\nSplitting dataset into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Step 3: Train the classifier
    clf = train_classifier(X_train, y_train)

    # Step 4: Evaluate the classifier on the test set
    print("\nEvaluating classifier on the test set...")
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test F1 Score: {f1:.4f}")

    # Step 5: Test the classifier on a user-provided image
    if image_path is None and len(sys.argv) > 1:
        candidate = sys.argv[1]
        # Skip option-like arguments such as the notebook kernel's '-f'
        if not candidate.startswith("-"):
            image_path = candidate

    if image_path is not None:
        try:
            # Extract features from the image
            features = extract_features_from_image(image_path)

            # Predict whether the image contains LSB matching steganography
            prediction = clf.predict(features)
            result = "Steg Image (LSB Matching Detected)" if prediction[0] == 1 else "Cover Image (No LSB Matching)"
            print(f"\nPrediction: {result}")
        except Exception as e:
            # Best-effort step: report the problem without discarding the
            # training and evaluation output printed above.
            print(f"Error processing image: {e}")


In [7]:
# Entry point: run the whole pipeline when this cell/script is executed as the
# top-level module (prints a banner, then calls main()).
if __name__ == "__main__":
    print("Running LSB Matching Detection Tool...")
    main()

Running LSB Matching Detection Tool...
Loading clean dataset...
Clean dataset shape: (10000, 41)
First few rows of clean dataset:
         0         1         2         3         4         5         6   \
0 -0.317327  0.827515  0.760605  0.740966  0.721418  0.910647  0.861356   
1       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
2 -0.503111  0.862970  0.802899  0.775813  0.751000  0.927452  0.889261   
3 -0.182988  0.887022  0.835196  0.813357  0.789932  0.911072  0.861291   
4  0.006107  0.932943  0.906990  0.897635  0.886993  0.970490  0.954652   

         7         8         9   ...        31        32        33        34  \
0  0.835196  0.815543  0.818339  ... -0.001588 -0.004257 -0.000239 -0.266943   
1       NaN       NaN       NaN  ...  0.020795 -0.064528  0.015347  0.005049   
2  0.866067  0.848226  0.855546  ... -0.008875  0.003529  0.009316 -0.248362   
3  0.824739  0.795830  0.856713  ...  0.035087 -0.024424  0.004261 -0.137704   
4  0.944758  0.9346