<a href="https://colab.research.google.com/github/danjethh/steg_analysis/blob/main/steg_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Step 1: Load the Dataset**

In [None]:
# Import necessary libraries
import pandas as pd

# Function to load the dataset
def load_data(url_clean=None, url_stego=None):
    """
    This function loads the cover and stego image feature datasets.
    It combines the two datasets into a single DataFrame and adds labels:
    - Label '0' for clean images (from steg_features.csv).
    - Label '1' for stego images (from steg_lsb_features.csv).

    Parameters:
    - url_clean: URL, path or file-like object of the clean (cover) feature CSV.
      Defaults to the steg_features.csv file of the Sourish1997/steganalysis repository.
    - url_stego: URL, path or file-like object of the stego feature CSV.
      Defaults to the steg_lsb_features.csv file of the same repository.

    Both files are read without a header row, so feature columns are named 0..N-1.

    Returns:
    The combined DataFrame (clean rows first, then stego rows) with the feature
    columns, a 'label' column, and a fresh 0..N-1 row index.
    """
    # Fall back to the published datasets when no source is given
    if url_clean is None:
        url_clean = "https://raw.githubusercontent.com/Sourish1997/steganalysis/master/Datasets/steg_features.csv"
    if url_stego is None:
        url_stego = "https://raw.githubusercontent.com/Sourish1997/steganalysis/master/Datasets/steg_lsb_features.csv"

    # Load datasets using pandas
    print("Loading clean dataset...")
    data_clean = pd.read_csv(url_clean, header=None)  # Cover images (clean)
    print(f"Clean dataset shape: {data_clean.shape}")
    print("First few rows of clean dataset:")
    print(data_clean.head())  # Display first few rows of clean dataset

    print("\nLoading stego dataset...")
    data_stego = pd.read_csv(url_stego, header=None)  # Stego images
    print(f"Stego dataset shape: {data_stego.shape}")
    print("First few rows of stego dataset:")
    print(data_stego.head())  # Display first few rows of stego dataset

    # Add labels to distinguish between clean and stego images
    data_clean['label'] = 0  # Label '0' for clean images
    data_stego['label'] = 1  # Label '1' for stego images

    # Combine the two datasets into one DataFrame.
    # ignore_index=True renumbers the rows; without it both frames keep their own
    # 0..n-1 index and the combined frame ends up with duplicate index labels.
    print("\nCombining datasets...")
    data = pd.concat([data_clean, data_stego], axis=0, ignore_index=True)
    print(f"Combined dataset shape: {data.shape}")
    print("First few rows of combined dataset:")
    print(data.head())  # Display first few rows of combined dataset

    return data  # Return the combined dataset

# Load both feature sets; `data` holds the combined, labelled DataFrame used by the following steps
data = load_data()

## **Step 2: Preprocess the Data**

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Function to preprocess the data
def preprocess_data(data, n_components=10):
    """
    This function preprocesses the dataset by performing the following steps:
    1. Remove rows with NaN values in any feature.
    2. Remove outliers using the IQR rule: a row is dropped if any of its features
       lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of that feature.
    3. Normalize the features using StandardScaler (zero mean, unit variance per feature).
    4. Perform Principal Component Analysis (PCA) to reduce dimensionality.

    Parameters:
    - data: DataFrame with a 'label' column; every other column is treated as a numeric feature.
    - n_components: number of principal components to keep (default 10). An integer larger
      than min(n_samples, n_features) is reduced to that limit instead of failing inside PCA.

    Returns:
    The preprocessed features (X) and the matching labels (y) as NumPy arrays.

    Raises:
    ValueError if no rows are left after removing NaN rows or outliers.
    """
    # Separate features and labels
    X = data.drop(columns=['label']).values  # Features (all columns except 'label')
    y = data['label'].values  # Labels ('0' for clean, '1' for stego)

    # Remove rows with NaN values
    print("\nRemoving rows with NaN values...")
    nan_mask = ~np.isnan(X).any(axis=1)  # Create a mask for rows without NaN values
    X = X[nan_mask]  # Apply the mask to remove NaN rows
    y = y[nan_mask]  # Update labels accordingly
    print(f"Dataset shape after removing NaNs: {X.shape}")
    if X.shape[0] == 0:
        # Fail here with a clear message rather than deep inside numpy/sklearn
        raise ValueError("No rows left after removing rows with NaN values.")
    print("First few rows of X after removing NaNs:")
    print(X[:5])

    # Remove outliers using the IQR rule
    def remove_outliers(X):
        """
        This helper function removes outliers from the dataset using the Interquartile Range (IQR) rule.
        A row is kept only if every feature lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR],
        where the quartiles are computed per feature (column).
        Returns the filtered rows and the boolean keep-mask so labels can be filtered too.
        """
        Q1 = np.percentile(X, 25, axis=0)  # First quartile (25th percentile)
        Q3 = np.percentile(X, 75, axis=0)  # Third quartile (75th percentile)
        IQR = Q3 - Q1  # Interquartile range
        lower_bound = Q1 - 1.5 * IQR  # Lower bound for valid values
        upper_bound = Q3 + 1.5 * IQR  # Upper bound for valid values
        outlier_mask = np.all((X >= lower_bound) & (X <= upper_bound), axis=1)  # True for rows to keep
        return X[outlier_mask], outlier_mask  # Return filtered data and mask

    print("\nRemoving outliers using IQR rule...")
    X, outlier_mask = remove_outliers(X)  # Remove outliers from features
    y = y[outlier_mask]  # Update labels accordingly
    print(f"Dataset shape after removing outliers: {X.shape}")
    if X.shape[0] == 0:
        raise ValueError("No rows left after removing outliers with the IQR rule.")
    print("First few rows of X after removing outliers:")
    print(X[:5])

    # NOTE(review): the scaler and PCA below are fitted on the full dataset, and the
    # train/test split happens later in the notebook, so test rows influence the
    # fitted transform. Consider fitting them on the training split only.

    # Normalize the features using StandardScaler
    print("\nNormalizing features using StandardScaler...")
    scaler = StandardScaler()  # Initialize the scaler
    X = scaler.fit_transform(X)  # Normalize features to have zero mean and unit variance
    print("First few rows of normalized X:")
    print(X[:5])

    # Perform PCA to reduce dimensionality
    print("\nPerforming PCA to reduce dimensionality...")
    # PCA cannot return more components than min(n_samples, n_features)
    max_components = min(X.shape)
    if isinstance(n_components, (int, np.integer)) and n_components > max_components:
        print(f"Requested {n_components} components but only {max_components} are available; using {max_components}.")
        n_components = max_components
    pca = PCA(n_components=n_components)
    X = pca.fit_transform(X)  # Transform the data to the reduced feature space
    print(f"Explained variance ratio by the first {pca.n_components_} components: {pca.explained_variance_ratio_}")
    # Report the measured total instead of relying on a hard-coded figure
    print(f"Cumulative explained variance: {pca.explained_variance_ratio_.sum():.2%}")
    print("First few rows of X after PCA:")
    print(X[:5])

    return X, y  # Return preprocessed features and labels

# Clean, scale and PCA-reduce the combined dataset: X holds the reduced features, y the matching labels
X, y = preprocess_data(data)

## **Step 3: Train Classifiers**



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Split the data into training and testing sets:
# 20% of the rows are held out for testing, and random_state=42 makes the split reproducible.
# NOTE(review): the split is not stratified; consider stratify=y if the classes are unbalanced
# after preprocessing.
print("\nSplitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape}, Testing set size: {X_test.shape}")

# Function to train classifiers
def train_classifiers(X_train, y_train):
    """
    Fit every candidate classifier on the training data.

    Classifiers that come with a hyperparameter grid are tuned with GridSearchCV
    (5-fold cross-validation, scored by F1) and the best estimator found is kept.
    Classifiers without a grid are fitted once with their default settings.

    Returns a dictionary mapping each classifier name to its fitted model.
    """
    # One entry per candidate: (name, estimator, hyperparameter grid or None)
    candidates = [
        # Gaussian Naive Bayes: fitted as-is, no grid
        ('GaussianNB', GaussianNB(), None),
        # Random Forest: number of trees and maximum depth
        ('RandomForest', RandomForestClassifier(random_state=42),
         {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}),
        # Support Vector Classifier: regularization strength and kernel type
        ('SVC', SVC(probability=True, random_state=42),
         {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
        # Multi-Layer Perceptron: hidden layer size and L2 regularization
        ('MLP', MLPClassifier(random_state=42, max_iter=500),
         {'hidden_layer_sizes': [(50,), (100,)], 'alpha': [0.0001, 0.001]}),
        # AdaBoost: number of estimators and learning rate
        ('AdaBoost', AdaBoostClassifier(random_state=42),
         {'n_estimators': [50, 100], 'learning_rate': [0.1, 1.0]}),
    ]

    fitted = {}
    for name, estimator, grid_spec in candidates:
        print(f"\nTraining {name} classifier...")

        if grid_spec is None:
            # Nothing to tune: a single fit is enough
            estimator.fit(X_train, y_train)
            fitted[name] = estimator
            print(f"{name} trained successfully.")
            continue

        # Search the grid and keep the best estimator found
        print(f"Tuning hyperparameters for {name}...")
        search = GridSearchCV(estimator, grid_spec, cv=5, scoring='f1')
        search.fit(X_train, y_train)
        fitted[name] = search.best_estimator_
        print(f"Best parameters for {name}: {search.best_params_}")

    return fitted

# Fit every classifier on the training split; best_models maps classifier name -> fitted model
best_models = train_classifiers(X_train, y_train)

## **Step 4: Build Voting Ensemble**

In [None]:
from sklearn.ensemble import VotingClassifier

# Function to build voting ensemble
def build_voting_ensemble(best_models, X_train, y_train):
    """
    Combine the given models into one soft-voting classifier and fit it.

    best_models maps a model name to an estimator; each (name, estimator) pair
    becomes a member of the ensemble. With soft voting the ensemble averages the
    class probabilities predicted by its members.

    Returns the ensemble after it has been fitted on the training data.
    """
    print("\nBuilding voting ensemble...")

    # VotingClassifier takes its members as a list of (name, estimator) pairs
    members = list(best_models.items())
    ensemble = VotingClassifier(members, voting='soft')

    print("Training voting ensemble...")
    ensemble.fit(X_train, y_train)
    return ensemble

# Combine the trained models into a soft-voting ensemble fitted on the training split
voting_clf = build_voting_ensemble(best_models, X_train, y_train)

## **Step 5: Evaluate the Model**

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on the held-out test data.

    The model's predictions for X_test are compared against y_test to compute
    the accuracy (fraction of correct predictions) and the F-score (harmonic
    mean of precision and recall). Both values are printed and returned as
    the tuple (accuracy, f_score).
    """
    print("\nEvaluating the model...")

    # Predict once and reuse the predictions for both metrics
    y_pred = model.predict(X_test)
    print(f"Predictions on test data: {y_pred[:10]}")  # Only the first 10, to keep the output short

    metrics = (accuracy_score(y_test, y_pred), f1_score(y_test, y_pred))
    accuracy, f_score = metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F-Score: {f_score:.4f}")

    return metrics

# Score the voting ensemble on the held-out test split (accuracy and F-score)
accuracy, f_score = evaluate_model(voting_clf, X_test, y_test)