# Unified GA-ANN-SSN for Multiple Datasets
## Genetic Algorithm with Artificial Neural Network and Structured Sparsity Norm

### Supported Datasets:
- **Plants Dataset**: Pre-split Train/Validation/Test folders
- **AID Dataset**: 70-10-20 split (needs splitting)
- **LC25000 Dataset**: Custom Test set + Train-Validation set (80-20 split)

### Workflow:
1. Dataset selection and configuration
2. Automatic download and data loading
3. VGG16 feature extraction
4. PCA dimensionality reduction
5. GA-ANN-SSN feature selection
6. Final model training and evaluation

## Cell 1: Installations and Imports

In [1]:
# Cell 1: Installations and Imports
# =================================
!pip install opendatasets kaggle pandas tensorflow scikit-learn opencv-python -q

import os
import numpy as np
import cv2
import random
import gc
import pickle
import json
import opendatasets as od
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='keras.*')

print("✅ All imports successful!")

✅ All imports successful!


## Cell 2: Configuration and Dataset Selection

In [2]:
# Cell 2: Configuration and Dataset Selection
# ===========================================
def get_dataset_choice():
    print("Available datasets:")
    print("1. plants")
    print("2. aid")
    print("3. lc25000")

    while True:
        choice = input("Select dataset (enter number or name): ").strip().lower()

        # Handle numeric input
        if choice == "1":
            return "plants"
        elif choice == "2":
            return "aid"
        elif choice == "3":
            return "lc25000"
        elif choice in ["plants", "aid", "lc25000"]:
            return choice
        else:
            print("Invalid choice. Please enter 1, 2, 3, or the dataset name.")

DATASET_CHOICE = get_dataset_choice()
print(f"Selected dataset: {DATASET_CHOICE}")

#Options: "plants", "aid", "lc25000"
#DATASET_CHOICE = "aid"


import os

# Ask user about their environment
def get_environment():
    while True:
        env = input("Are you running this code in Google Colab or locally? (colab/local): ").lower().strip()
        if env in ['colab', 'local']:
            return env
        else:
            print("Please enter either 'colab' or 'local'")

environment = get_environment()

# Dataset-specific configurations
DATASET_CONFIGS = {
    "plants": {
        "dataset_url": "https://www.kaggle.com/datasets/yudhaislamisulistya/plants-type-datasets",
        "download_folder": "./plant-datasets",
        "train_path": "",
        "val_path": "",
        "test_path": "",
        "features_dir": "./saved_features_plants",
        "split_type": "pre_split"
    },
    "aid": {
        "dataset_url": "https://www.kaggle.com/datasets/jiayuanchengala/aid-scene-classification-datasets",
        "download_folder": "./aid-datasets",
        "data_root": "",
        "features_dir": "./saved_features_aid",
        "split_type": "70_10_20"
    },
    "lc25000": {
        "dataset_url": "https://www.kaggle.com/datasets/javaidahmadwani/lc25000",
        "download_folder": "./lc25000-datasets",
        "data_root": "",
        "features_dir": "./saved_features_lc25000",
        "split_type": "lc25000_split"
    }
}

# Set paths based on environment
if environment == "colab":
    # Colab paths
    DATASET_CONFIGS["plants"]["train_path"] = "/content/plant-datasets/plants-type-datasets/split_ttv_dataset_type_of_plants/Train_Set_Folder"
    DATASET_CONFIGS["plants"]["val_path"] = "/content/plant-datasets/plants-type-datasets/split_ttv_dataset_type_of_plants/Validation_Set_Folder"
    DATASET_CONFIGS["plants"]["test_path"] = "/content/plant-datasets/plants-type-datasets/split_ttv_dataset_type_of_plants/Test_Set_Folder"
    DATASET_CONFIGS["aid"]["data_root"] = "./aid-datasets/aid-scene-classification-datasets/AID"
    DATASET_CONFIGS["lc25000"]["data_root"] = "./lc25000-datasets/lc25000/lung_colon_image_set"

else:  # local environment
    print("\nFor local execution, please ensure you have the following folders in your current directory:")
    print("1. split_ttv_dataset_type_of_plants")
    print("2. AID")
    print("3. lung_colon_image_set")


    # Local paths
    DATASET_CONFIGS["plants"]["train_path"] = "./split_ttv_dataset_type_of_plants/Train_Set_Folder"
    DATASET_CONFIGS["plants"]["val_path"] = "./split_ttv_dataset_type_of_plants/Validation_Set_Folder"
    DATASET_CONFIGS["plants"]["test_path"] = "./split_ttv_dataset_type_of_plants/Test_Set_Folder"
    DATASET_CONFIGS["aid"]["data_root"] = "./AID"
    DATASET_CONFIGS["lc25000"]["data_root"] = "./lung_colon_image_set"


# Get current config
CONFIG = DATASET_CONFIGS[DATASET_CHOICE]

# Common parameters
IMG_SIZE = (128, 128)
VGG_TARGET_SIZE = (224, 224)
PCA_VARIANCE = 0.95

# GA Parameters
GA_POPULATION_SIZE = 10
GA_MAX_GENERATION = 40
GA_CROSSOVER_RATE = 0.8
GA_MUTATION_RATE = 0.2

# Training Parameters
FEATURE_EXTRACTION_BATCH_SIZE = 128
BATCH_SIZE = 64
EVAL_EPOCHS = 10
FINAL_EPOCHS = 50
RANDOM_SEED = 42

# Set seeds
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Create features directory
os.makedirs(CONFIG['features_dir'], exist_ok=True)

print(f"✅ Configuration set for {DATASET_CHOICE.upper()} dataset!")

Available datasets:
1. plants
2. aid
3. lc25000
Select dataset (enter number or name): 3
Selected dataset: lc25000
Are you running this code in Google Colab or locally? (colab/local): colab
✅ Configuration set for LC25000 dataset!


## Cell 3: Dataset Download

In [3]:
# Cell 3: Dataset Download
# ========================
def download_dataset():
    """Download dataset based on configuration"""
    kaggle_credentials = {"username": "YOUR_USERNAME", "key": "YOUR_KEY"}

    # ⚠️ REPLACE WITH YOUR ACTUAL KAGGLE CREDENTIALS ⚠️
    kaggle_credentials = {"username": 'yuvanrajvengaladas', "key": '6fd3ece8111002ccca9494a6d7e6212e'}

    with open("kaggle.json", "w") as f:
        json.dump(kaggle_credentials, f)

    try:
        with open("kaggle.json", 'r') as f:
            credentials = json.load(f)
        os.environ['KAGGLE_USERNAME'] = credentials['username']
        os.environ['KAGGLE_KEY'] = credentials['key']

        print(f"Downloading {DATASET_CHOICE} dataset: {CONFIG['dataset_url']}")
        od.download(CONFIG['dataset_url'], data_dir=CONFIG['download_folder'])
        print(f"✅ {DATASET_CHOICE.upper()} dataset downloaded successfully!")

    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        print("👉 Please ensure your kaggle.json file has correct credentials")



# Download the dataset

if environment == "colab":
  download_dataset()

Downloading lc25000 dataset: https://www.kaggle.com/datasets/javaidahmadwani/lc25000
Dataset URL: https://www.kaggle.com/datasets/javaidahmadwani/lc25000
Downloading lc25000.zip to ./lc25000-datasets/lc25000


100%|██████████| 1.76G/1.76G [00:20<00:00, 93.1MB/s]



✅ LC25000 dataset downloaded successfully!


## Cell 4: Data Loading and Splitting Functions

In [4]:
# Cell 4: Data Loading and Splitting Functions
# ============================================
def load_images_from_folder(folder, img_size=IMG_SIZE, max_samples=None):
    """Load images from folder structure (for pre-split datasets)"""
    X, y, mapping = [], [], {}
    class_folders = sorted([f for f in os.listdir(folder) if os.path.isdir(os.path.join(folder, f))])

    for idx, cname in enumerate(class_folders):
        mapping[idx] = cname
        files = glob(os.path.join(folder, cname, "*.jpg")) + glob(os.path.join(folder, cname, "*.png")) + glob(os.path.join(folder, cname, "*.jpeg"))

        if max_samples:
            files = files[:max_samples]

        for f in files:
            img = cv2.imread(f)
            if img is None:
                continue
            img = cv2.resize(img, img_size)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            X.append(img)
            y.append(idx)

    X = np.array(X)
    y = np.array(y).reshape(-1, 1)
    print(f"✅ Loaded {len(X)} images from {folder}")
    return X, y, mapping

def get_all_filepaths_and_labels(data_root):
    """Gather all image file paths and labels (for non-split datasets)"""
    all_files, all_labels, mapping = [], [], {}
    class_folders = sorted([f for f in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, f))])

    for idx, cname in enumerate(class_folders):
        mapping[idx] = cname
        class_dir = os.path.join(data_root, cname)
        files = glob(os.path.join(class_dir, "*.jpg")) + glob(os.path.join(class_dir, "*.png"))
        all_files.extend(files)
        all_labels.extend([idx] * len(files))

    print(f"📁 Found {len(all_files)} images across {len(mapping)} classes.")
    return np.array(all_files), np.array(all_labels), mapping

def load_images_from_paths(paths, labels, img_size=IMG_SIZE, max_samples=None):
    """Load images from file paths"""
    X, valid_labels = [], []
    if max_samples:
        indices = np.random.choice(len(paths), min(max_samples, len(paths)), replace=False)
        paths = paths[indices]
        labels = labels[indices]

    for i, f in enumerate(paths):
        img = cv2.imread(f)
        if img is None:
            continue
        img = cv2.resize(img, img_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        X.append(img)
        valid_labels.append(labels[i])

    X = np.array(X)
    y = np.array(valid_labels).reshape(-1, 1)
    print(f"✅ Loaded {len(X)} images from paths.")
    return X, y

def load_dataset_plants():
    """Load Plants dataset (pre-split)"""
    X_train, y_train, mapping = load_images_from_folder(CONFIG['train_path'], max_samples=1000)
    X_val, y_val, _ = load_images_from_folder(CONFIG['val_path'], max_samples=300)
    X_test, y_test, _ = load_images_from_folder(CONFIG['test_path'], max_samples=300)
    return X_train, y_train, X_val, y_val, X_test, y_test, mapping

def load_dataset_aid():
    """Load AID dataset and split 70-10-20"""
    X_paths, y_labels, mapping = get_all_filepaths_and_labels(CONFIG['data_root'])

    # Split: 70% Train, 15% Validation, 15% Test
    X_train_paths, X_temp_paths, y_train_labels, y_temp_labels = train_test_split(
        X_paths, y_labels, test_size=0.3, random_state=RANDOM_SEED, stratify=y_labels
    )
    X_val_paths, X_test_paths, y_val_labels, y_test_labels = train_test_split(
        X_temp_paths, y_temp_labels, test_size=0.5, random_state=RANDOM_SEED, stratify=y_temp_labels
    )

    # Load images
    X_train, y_train = load_images_from_paths(X_train_paths, y_train_labels, max_samples=2000)
    X_val, y_val = load_images_from_paths(X_val_paths, y_val_labels, max_samples=500)
    X_test, y_test = load_images_from_paths(X_test_paths, y_test_labels, max_samples=500)

    return X_train, y_train, X_val, y_val, X_test, y_test, mapping

def load_dataset_lc25000():
    """Load LC25000 dataset with custom split"""
    test_folder = os.path.join(CONFIG['data_root'], "Test Set")
    train_val_folder = os.path.join(CONFIG['data_root'], "Train and Validation Set")

    # Load test set
    X_test, y_test, mapping = load_images_from_folder(test_folder, max_samples=500)

    # Load train-val set and split 80-20
    X_train_val, y_train_val, mapping = load_images_from_folder(train_val_folder, max_samples=2500)

    # Split train-val into 80% train, 20% validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.2, random_state=RANDOM_SEED, stratify=y_train_val
    )

    return X_train, y_train, X_val, y_val, X_test, y_test, mapping

# Main data loading function
def load_datasets():
    """Main function to load datasets based on choice"""
    print(f"📂 Loading {DATASET_CHOICE.upper()} dataset...")

    if CONFIG['split_type'] == "pre_split":
        return load_dataset_plants()
    elif CONFIG['split_type'] == "70_10_20":
        return load_dataset_aid()
    elif CONFIG['split_type'] == "lc25000_split":
        return load_dataset_lc25000()
    else:
        raise ValueError(f"Unknown split type: {CONFIG['split_type']}")

print("✅ Data loading functions defined!")

✅ Data loading functions defined!


## Cell 5: Feature Extraction Utilities

In [5]:
# Cell 5: Feature Extraction Utilities
# ====================================
def preprocess_single_image_optimized(img):
    """Enhanced preprocessing for VGG16"""
    resized_img = cv2.resize(img, VGG_TARGET_SIZE)
    if random.random() > 0.5:
        resized_img = cv2.flip(resized_img, 1)
    processed_img = preprocess_input(resized_img.astype('float32'))
    return processed_img

def extract_features_optimized(X, dataset_name, batch_size=32):
    """Optimized feature extraction using VGG16"""
    feature_file = os.path.join(CONFIG['features_dir'], f"{dataset_name}_features.pkl")

    if os.path.exists(feature_file):
        print(f"📁 Loading cached features for {dataset_name}...")
        with open(feature_file, 'rb') as f:
            return pickle.load(f)

    print(f"🔧 Extracting features for {dataset_name}...")
    base_model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
    feature_extractor = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D()
    ])

    features = []
    total_batches = (len(X) + batch_size - 1) // batch_size

    for i in range(0, len(X), batch_size):
        batch_images = []
        for img in X[i:i+batch_size]:
            processed_img = preprocess_single_image_optimized(img)
            batch_images.append(processed_img)

        batch = np.array(batch_images)
        batch_features = feature_extractor.predict(batch, verbose=0)
        features.append(batch_features)

        # Progress tracking
        batch_num = i // batch_size + 1
        if batch_num % 10 == 0 or batch_num == total_batches:
            print(f"    Processed {batch_num}/{total_batches} batches")

        del batch, batch_images
        gc.collect()

    features = np.vstack(features)

    # Save features
    with open(feature_file, 'wb') as f:
        pickle.dump(features, f)

    # Cleanup
    del feature_extractor, base_model
    tf.keras.backend.clear_session()
    gc.collect()

    print(f"💾 Saved features for {dataset_name}: {features.shape}")
    return features

def load_and_extract_features():
    """Complete feature extraction pipeline"""
    print("🚀 Starting feature extraction pipeline...")

    # Load datasets
    X_train_img, y_train, X_val_img, y_val, X_test_img, y_test, mapping = load_datasets()

    print(f"🎯 Classes found: {len(mapping)}")
    print(f"📊 Dataset sizes - Train: {X_train_img.shape[0]}, Val: {X_val_img.shape[0]}, Test: {X_test_img.shape[0]}")

    # Extract features
    X_train_feats = extract_features_optimized(X_train_img, "train", batch_size=FEATURE_EXTRACTION_BATCH_SIZE)
    X_val_feats = extract_features_optimized(X_val_img, "val", batch_size=FEATURE_EXTRACTION_BATCH_SIZE)
    X_test_feats = extract_features_optimized(X_test_img, "test", batch_size=FEATURE_EXTRACTION_BATCH_SIZE)

    # Clean up image data
    del X_train_img, X_val_img, X_test_img
    gc.collect()

    print("✅ Feature extraction completed!")
    return X_train_feats, y_train, X_val_feats, y_val, X_test_feats, y_test, mapping

print("✅ Feature extraction utilities defined!")

✅ Feature extraction utilities defined!


## Cell 6: Structured Sparsity Norm (SSN) Implementation

In [6]:
# Cell 6: Structured Sparsity Norm (SSN) Implementation
# ====================================================
class StructuredSparsityNorm(tf.keras.regularizers.Regularizer):
    def __init__(self, lambda1=1e-4, lambda2=1e-4):
        self.lambda1 = lambda1
        self.lambda2 = lambda2

    def __call__(self, x):
        l11_norm = self.lambda1 * tf.reduce_sum(tf.abs(x))
        l2_norm = tf.sqrt(tf.reduce_sum(tf.square(x), axis=1) + 1e-8)
        l21_norm = self.lambda2 * tf.reduce_sum(l2_norm)
        return l11_norm + l21_norm

    def get_config(self):
        return {"lambda1": self.lambda1, "lambda2": self.lambda2}

def build_ssn_perceptron(input_dim, n_classes, hidden_units=512, dropout_rate=0.5, lr=1e-3):
    """Build ANN with Structured Sparsity Norm regularization"""
    tf.keras.backend.clear_session()
    ssn_reg = StructuredSparsityNorm(lambda1=1e-4, lambda2=1e-4)

    model = Sequential()
    model.add(tf.keras.layers.Input(shape=(input_dim,)))
    model.add(Dense(hidden_units, activation='relu', kernel_regularizer=ssn_reg))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_units//2, activation='relu', kernel_regularizer=ssn_reg))
    model.add(Dropout(dropout_rate))
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(
        optimizer=Adam(learning_rate=lr),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

def evaluate_with_ssn(model, X_val, y_val):
    """Evaluate feature subset using Structured Sparsity Norm concept"""
    preds_proba = model.predict(X_val, verbose=0)
    preds = np.argmax(preds_proba, axis=1)
    classification_error = 1 - accuracy_score(y_val.flatten(), preds)

    # Get model weights for SSN evaluation
    weights = []
    for layer in model.layers:
        if hasattr(layer, 'kernel') and layer.kernel is not None:
            weights.append(layer.kernel.numpy())

    if len(weights) > 0:
        weight_matrix = weights[0]
        l11_penalty = np.sum(np.abs(weight_matrix))
        l21_penalty = np.sum(np.sqrt(np.sum(weight_matrix**2, axis=1)))
        ssn_evaluation = classification_error + 1e-4 * (l11_penalty + l21_penalty)
    else:
        ssn_evaluation = classification_error

    return ssn_evaluation, classification_error

print("✅ SSN implementation complete!")

✅ SSN implementation complete!


## Cell 7: GA-ANN-SSN Feature Selector Class

In [7]:
# Cell 7: GA-ANN-SSN Feature Selector Class
# =========================================
class GA_ANN_SSN_FeatureSelector:
    def __init__(self, population_size=20, max_generations=100, crossover_rate=0.8, mutation_rate=0.2):
        self.population_size = population_size
        self.max_generations = max_generations
        self.crossover_rate = crossover_rate
        self.mutation_rate = mutation_rate

    def initialize_population(self, n_features):
        population = (np.random.rand(self.population_size, n_features) > 0.5).astype(int)
        return population

    def select_feature_subset(self, chromosome, X_data):
        selected_indices = np.where(chromosome == 1)[0]
        if len(selected_indices) == 0:
            return X_data, np.arange(X_data.shape[1])
        return X_data[:, selected_indices], selected_indices

    def classify_with_perceptron(self, X_train, y_train, X_val, y_val, n_classes, selected_indices):
        if len(selected_indices) == 0:
            return None, 1.0

        try:
            model = build_ssn_perceptron(len(selected_indices), n_classes, hidden_units=256)
            early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=0)

            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=EVAL_EPOCHS,
                batch_size=32,
                verbose=0,
                callbacks=[early_stop]
            )
            return model, history
        except Exception as e:
            print(f"⚠️ Model training failed: {e}")
            return None, 1.0

    def evaluate_with_ssn_norm(self, model, X_val, y_val, feature_ratio):
        if model is None:
            return 1.0

        try:
            ssn_score, classification_error = evaluate_with_ssn(model, X_val, y_val)
            if DATASET_CHOICE == "aid":
              fitness = 0.9 * classification_error + 0.1 * feature_ratio
            else:
              fitness = 0.7 * classification_error + 0.3 * feature_ratio
            return fitness
        except Exception as e:
            print(f"⚠️ Evaluation failed: {e}")
            return 1.0

    def select_best_population(self, population, fitness_values, method='roulette'):
        if method == 'roulette':
            fitness_array = np.array(fitness_values)
            inverted_fitness = 1.0 / (1.0 + fitness_array)
            probabilities = inverted_fitness / np.sum(inverted_fitness)
            selected_indices = np.random.choice(len(population), size=len(population), p=probabilities)
            return population[selected_indices]
        else:
            selected_population = []
            for _ in range(len(population)):
                idx1, idx2 = np.random.randint(0, len(population), 2)
                if fitness_values[idx1] < fitness_values[idx2]:
                    selected_population.append(population[idx1].copy())
                else:
                    selected_population.append(population[idx2].copy())
            return np.array(selected_population)

    def apply_crossover(self, population):
        new_population = []
        for i in range(0, len(population), 2):
            if i + 1 < len(population):
                parent1, parent2 = population[i], population[i + 1]
                if np.random.rand() < self.crossover_rate:
                    n_features = len(parent1)
                    crossover_point = np.random.randint(1, n_features - 1)
                    child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
                    child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
                else:
                    child1, child2 = parent1.copy(), parent2.copy()
                new_population.extend([child1, child2])

        if len(new_population) < len(population):
            new_population.append(population[-1].copy())
        return np.array(new_population)

    def apply_mutation(self, population):
        mutated_population = []
        for chromosome in population:
            mutated_chromosome = chromosome.copy()
            for i in range(len(mutated_chromosome)):
                if np.random.rand() < self.mutation_rate:
                    mutated_chromosome[i] = 1 - mutated_chromosome[i]
            mutated_population.append(mutated_chromosome)
        return np.array(mutated_population)

    def optimize(self, X_train, y_train, X_val, y_val, n_classes, n_features):
        print("🚀 Starting GA-ANN-SSN Feature Selection")
        print("=" * 60)

        population = self.initialize_population(n_features)
        best_chromosome, best_fitness = None, float('inf')
        best_accuracy, no_improvement_count = 0.0, 0

        for generation in range(self.max_generations):
            print(f"\n🔄 Generation {generation + 1}/{self.max_generations}")
            fitness_values, selected_features_list = [], []

            for i, chromosome in enumerate(population):
                X_train_subset, selected_indices = self.select_feature_subset(chromosome, X_train)
                X_val_subset, _ = self.select_feature_subset(chromosome, X_val)
                feature_ratio = len(selected_indices) / n_features

                model, history = self.classify_with_perceptron(X_train_subset, y_train, X_val_subset, y_val, n_classes, selected_indices)
                fitness = self.evaluate_with_ssn_norm(model, X_val_subset, y_val, feature_ratio)

                fitness_values.append(fitness)
                selected_features_list.append(selected_indices)

                if fitness < best_fitness:
                    best_fitness, best_chromosome = fitness, chromosome.copy()
                    best_selected_features = selected_indices
                    no_improvement_count = 0

                    current_model = model
                    if current_model is not None:
                        try:
                            preds = np.argmax(current_model.predict(X_val_subset, verbose=0), axis=1)
                            best_accuracy = accuracy_score(y_val.flatten(), preds)
                            print(f"   ✅ New best! Fitness: {fitness:.4f}, Accuracy: {best_accuracy:.4f}, Features: {len(selected_indices)}")
                        except:
                            print(f"   ✅ New best! Fitness: {fitness:.4f}, Features: {len(selected_indices)} (accuracy calculation failed)")
                    else:
                        print(f"   ✅ New best! Fitness: {fitness:.4f}, Features: {len(selected_indices)} (no model)")

                if model is not None:
                    del model
                gc.collect()

            no_improvement_count += 1
            population_diversity = np.mean(np.std(population, axis=0))

            # Check termination
            if (generation >= self.max_generations or
                no_improvement_count >= 30 or
                population_diversity < 0.01):
                break

            selected_population = self.select_best_population(population, fitness_values)
            crossed_population = self.apply_crossover(selected_population)
            population = self.apply_mutation(crossed_population)

            avg_fitness, current_best = np.mean(fitness_values), np.min(fitness_values)
            print(f"   📊 Stats - Best: {current_best:.4f}, Avg: {avg_fitness:.4f}, Diversity: {population_diversity:.4f}")

        print("\n" + "=" * 60)
        print("🎯 GA-ANN-SSN COMPLETED")
        print("=" * 60)
        print(f"Best Fitness: {best_fitness:.4f}")
        print(f"Best Accuracy: {best_accuracy:.4f}")
        print(f"Selected Features: {len(best_selected_features)}/{n_features}")
        print(f"Final Generation: {generation + 1}")

        return best_chromosome, best_fitness, best_accuracy, best_selected_features

print("✅ GA-ANN-SSN Feature Selector class defined!")

✅ GA-ANN-SSN Feature Selector class defined!


## Cell 8: Main Pipeline Execution

In [8]:
# Cell 8: Main Pipeline Execution
# ===============================
print("🚀 Starting Unified GA-ANN-SSN Pipeline")
print("=" * 60)

# Step 1: Load and extract features
X_train_feats, y_train, X_val_feats, y_val, X_test_feats, y_test, mapping = load_and_extract_features()
n_classes = len(mapping)

# Step 2: Feature standardization
print("🔧 Standardizing features...")
scaler = StandardScaler()
X_train_feats = scaler.fit_transform(X_train_feats)
X_val_feats = scaler.transform(X_val_feats)
X_test_feats = scaler.transform(X_test_feats)

# Step 3: Apply PCA
print("📊 Applying PCA...")
pca_subset_size = min(2000, len(X_train_feats))
pca_indices = np.random.choice(len(X_train_feats), pca_subset_size, replace=False)

pca_temp = PCA(n_components=PCA_VARIANCE, random_state=RANDOM_SEED)
pca_temp.fit(X_train_feats[pca_indices])
n_components_95 = pca_temp.n_components_

pca = PCA(n_components=n_components_95, random_state=RANDOM_SEED)
pca.fit(X_train_feats[pca_indices])

X_train_pca = pca.transform(X_train_feats)
X_val_pca = pca.transform(X_val_feats)
X_test_pca = pca.transform(X_test_feats)

n_features = X_train_pca.shape[1]
print(f"✅ PCA completed: {n_features} features (from {X_train_feats.shape[1]})")

# Save PCA and scaler
pca_file = os.path.join(CONFIG['features_dir'], "pca_scaler.pkl")
with open(pca_file, 'wb') as f:
    pickle.dump({'pca': pca, 'scaler': scaler}, f)
print("💾 Saved PCA and Scaler")

print("✅ Data preprocessing completed!")

🚀 Starting Unified GA-ANN-SSN Pipeline
🚀 Starting feature extraction pipeline...
📂 Loading LC25000 dataset...
✅ Loaded 2499 images from ./lc25000-datasets/lc25000/lung_colon_image_set/Test Set
✅ Loaded 12500 images from ./lc25000-datasets/lc25000/lung_colon_image_set/Train and Validation Set
🎯 Classes found: 5
📊 Dataset sizes - Train: 10000, Val: 2500, Test: 2499
🔧 Extracting features for train...
    Processed 10/79 batches
    Processed 20/79 batches
    Processed 30/79 batches
    Processed 40/79 batches
    Processed 50/79 batches
    Processed 60/79 batches
    Processed 70/79 batches
    Processed 79/79 batches
💾 Saved features for train: (10000, 512)
🔧 Extracting features for val...
    Processed 10/20 batches
    Processed 20/20 batches
💾 Saved features for val: (2500, 512)
🔧 Extracting features for test...
    Processed 10/20 batches
    Processed 20/20 batches
💾 Saved features for test: (2499, 512)
✅ Feature extraction completed!
🔧 Standardizing features...
📊 Applying PCA...


## Cell 9: EFFICIENT GA-ANN-SSN Feature Selection

In [9]:
# Cell 9: EFFICIENT GA-ANN-SSN Feature Selection
# ==============================================
print("🚀 Starting EFFICIENT GA-ANN-SSN Feature Selection...")
print("=" * 60)

# Efficient parameters for faster execution
EFFICIENT_POPULATION_SIZE = 8
EFFICIENT_MAX_GENERATION = 10
EFFICIENT_EVAL_EPOCHS = 3
EVAL_EPOCHS = EFFICIENT_EVAL_EPOCHS  # Override for efficiency

print(f"⚡ RUNNING IN EFFICIENT MODE ⚡")
print(f"   • Population: {EFFICIENT_POPULATION_SIZE}")
print(f"   • Generations: {EFFICIENT_MAX_GENERATION}")
print(f"   • Evaluation Epochs: {EVAL_EPOCHS}")
print("-" * 60)

ga_selector = GA_ANN_SSN_FeatureSelector(
    population_size=EFFICIENT_POPULATION_SIZE,
    max_generations=EFFICIENT_MAX_GENERATION,
    crossover_rate=GA_CROSSOVER_RATE,
    mutation_rate=GA_MUTATION_RATE
)

try:
    best_mask, best_fitness, best_accuracy, selected_indices = ga_selector.optimize(
        X_train_pca, y_train, X_val_pca, y_val, n_classes, n_features
    )
    print("✅ GA-ANN-SSN feature selection completed!")

except UnboundLocalError as e:
    print(f"❌ Error in GA optimization: {e}")
    print("🔄 Creating fallback solution...")
    from sklearn.feature_selection import VarianceThreshold
    selector = VarianceThreshold()
    selector.fit(X_train_pca)
    variances = selector.variances_
    n_select = max(1, n_features // 2)
    selected_indices = np.argsort(variances)[-n_select:]
    best_mask = np.zeros(n_features)
    best_mask[selected_indices] = 1
    best_fitness, best_accuracy = 0.5, 0.5
    print(f"🔄 Using fallback: {len(selected_indices)} features selected")

except Exception as e:
    print(f"❌ Unexpected error: {e}")
    print("🔄 Creating random feature selection as fallback...")
    n_select = max(1, n_features // 2)
    selected_indices = np.random.choice(n_features, n_select, replace=False)
    best_mask = np.zeros(n_features)
    best_mask[selected_indices] = 1
    best_fitness, best_accuracy = 0.5, 0.5
    print(f"🔄 Using random selection: {len(selected_indices)} features selected")

print(f"🎯 Final selection: {len(selected_indices)} features out of {n_features}")

# Restore original value
EVAL_EPOCHS = 10
print(f"\nℹ️ Restored EVAL_EPOCHS to original value ({EVAL_EPOCHS})")

🚀 Starting EFFICIENT GA-ANN-SSN Feature Selection...
⚡ RUNNING IN EFFICIENT MODE ⚡
   • Population: 8
   • Generations: 10
   • Evaluation Epochs: 3
------------------------------------------------------------
🚀 Starting GA-ANN-SSN Feature Selection

🔄 Generation 1/10
   ✅ New best! Fitness: 0.1993, Accuracy: 0.9460, Features: 175
   ✅ New best! Fitness: 0.1838, Accuracy: 0.9484, Features: 160
   ✅ New best! Fitness: 0.1804, Accuracy: 0.9480, Features: 156
   📊 Stats - Best: 0.1804, Avg: 0.1995, Diversity: 0.4659

🔄 Generation 2/10
   ✅ New best! Fitness: 0.1787, Accuracy: 0.9596, Features: 163
   ✅ New best! Fitness: 0.1752, Accuracy: 0.9700, Features: 167
   ✅ New best! Fitness: 0.1637, Accuracy: 0.9640, Features: 150
   📊 Stats - Best: 0.1637, Avg: 0.1884, Diversity: 0.4428

🔄 Generation 3/10
   ✅ New best! Fitness: 0.1629, Accuracy: 0.9624, Features: 148
   📊 Stats - Best: 0.1629, Avg: 0.1950, Diversity: 0.4481

🔄 Generation 4/10
   📊 Stats - Best: 0.1705, Avg: 0.2005, Diversity: 0

## Cell 10: Final Model Training

In [10]:
# Cell 10: Final Model Training
# =============================
print("\n🎯 Training Final Model with Selected Features...")
print("=" * 60)

# Prepare data with selected features
X_train_final = np.vstack([X_train_pca, X_val_pca])[:, selected_indices]
y_train_final = np.vstack([y_train, y_val])
X_test_final = X_test_pca[:, selected_indices]

print(f"📊 Final training set: {X_train_final.shape}")
print(f"📊 Test set: {X_test_final.shape}")
print(f"🔧 Selected features: {X_train_final.shape[1]}")

# Build and train final model
final_model = build_ssn_perceptron(X_train_final.shape[1], n_classes, hidden_units=512, dropout_rate=0.5)

print("🔧 Training final model on combined train+val data...")
history = final_model.fit(
    X_train_final, y_train_final,
    epochs=FINAL_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    validation_split=0.0  # No validation during final training
)

print("✅ Final model training completed!")


🎯 Training Final Model with Selected Features...
📊 Final training set: (12500, 148)
📊 Test set: (2499, 148)
🔧 Selected features: 148
🔧 Training final model on combined train+val data...
Epoch 1/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.7806 - loss: 1.7036
Epoch 2/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9341 - loss: 1.1103
Epoch 3/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9496 - loss: 0.9753
Epoch 4/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9608 - loss: 0.8603
Epoch 5/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9652 - loss: 0.7573
Epoch 6/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9685 - loss: 0.6630
Epoch 7/50
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accu

## Cell 11: Evaluation and Results Summary

In [None]:
# Cell 11: Evaluation and Results Summary
# =======================================
print("\n📈 Evaluating on completely held-out TEST set...")
print("=" * 60)

# Validation performance for reference
X_val_selected = X_val_pca[:, selected_indices]
val_preds = np.argmax(final_model.predict(X_val_selected, verbose=0), axis=1)
val_accuracy = accuracy_score(y_val.flatten(), val_preds)

# Test performance (true performance)
test_preds = np.argmax(final_model.predict(X_test_final, verbose=0), axis=1)
test_accuracy = accuracy_score(y_test.flatten(), test_preds)

print(f"🏆 VALIDATION Accuracy (for reference): {val_accuracy:.4f}")
print(f"🎯 TEST Accuracy (true performance): {test_accuracy:.4f}")

# Final results summary
print("\n" + "=" * 60)
print(f"🎯 FINAL GA-ANN-SSN RESULTS - {DATASET_CHOICE.upper()}")
print("=" * 60)

print(f"📊 Dataset Usage:")
print(f"   • Train: {X_train_pca.shape[0]} samples (feature selection)")
print(f"   • Validation: {X_val_pca.shape[0]} samples (fitness evaluation)")
print(f"   • Test: {X_test_pca.shape[0]} samples (FINAL evaluation)")
print(f"   • Final Training: {X_train_final.shape[0]} samples (train+val)")

print(f"\n🎯 Performance Metrics:")
print(f"   • Best Fitness: {best_fitness:.4f}")
print(f"   • Validation Accuracy: {val_accuracy:.4f}")
print(f"   • 🏆 TEST Accuracy: {test_accuracy:.4f} ← TRUE PERFORMANCE")

print(f"\n🔧 Feature Selection:")
print(f"   • Selected Features: {len(selected_indices)}/{n_features}")
print(f"   • Feature Reduction: {((n_features - len(selected_indices)) / n_features * 100):.1f}%")

# Save results
results = {
    'dataset': DATASET_CHOICE,
    'test_accuracy': test_accuracy,
    'validation_accuracy': val_accuracy,
    'ga_best_fitness': best_fitness,
    'selected_features_count': len(selected_indices),
    'total_features': n_features,
    'selected_indices': selected_indices,
    'dataset_sizes': {
        'train': X_train_pca.shape[0],
        'validation': X_val_pca.shape[0],
        'test': X_test_pca.shape[0],
        'final_training': X_train_final.shape[0]
    },
    'classes': mapping
}

results_file = os.path.join(CONFIG['features_dir'], "ga_ann_ssn_results.pkl")
with open(results_file, 'wb') as f:
    pickle.dump(results, f)

print(f"\n💾 Results saved to: {results_file}")
print("✅ Unified pipeline completed successfully!")

In [12]:
# Cell 12: Results Consolidation
# ==============================
print("\n" + "=" * 60)
print("📊 RESULTS CONSOLIDATION")
print("=" * 60)

def save_consolidated_results(test_accuracy, val_accuracy, best_fitness, selected_indices,
                             n_features, n_classes, mapping, X_train_pca, X_test_pca):
    """Save all results to consolidated files"""

    # Create results entry for current dataset
    results_entry = {
        'dataset': DATASET_CHOICE,
        'performance_metrics': {
            'test_accuracy': float(test_accuracy),
            'validation_accuracy': float(val_accuracy),
            'best_fitness': float(best_fitness),
            'performance_gap': float(test_accuracy - val_accuracy)
        },
        'feature_selection': {
            'selected_features_count': len(selected_indices),
            'total_features': n_features,
            'feature_reduction_percentage': float((n_features - len(selected_indices)) / n_features * 100),
            'selected_features_ratio': float(len(selected_indices) / n_features)
        },
        'dataset_info': {
            'num_classes': n_classes,
            'class_names': list(mapping.values()),
            'training_samples': int(X_train_pca.shape[0]),
            'test_samples': int(X_test_pca.shape[0])
        },
        'timestamp': str(pd.Timestamp.now()) if 'pd' in globals() else "Not available"
    }

    # 1. Append to main JSON results file
    main_json_file = "ga_ann_ssn_datasets_results.json"

    # Load existing results or create new
    if os.path.exists(main_json_file):
        with open(main_json_file, 'r') as f:
            all_results = json.load(f)
    else:
        all_results = {}

    # Add/update current dataset results
    all_results[DATASET_CHOICE] = results_entry

    # Save updated results
    with open(main_json_file, 'w') as f:
        json.dump(all_results, f, indent=4)

    print(f"💾 Results appended to: {main_json_file}")

    # 2. Update summary text file
    summary_file = "ga_ann_ssn_datasets_summary.txt"

    # Create or update summary
    if os.path.exists(summary_file):
        with open(summary_file, 'r') as f:
            existing_content = f.read()
    else:
        existing_content = ""

    with open(summary_file, 'w') as f:
        f.write("=" * 70 + "\n")
        f.write("GA-ANN-SSN ALL DATASETS SUMMARY\n")
        f.write("=" * 70 + "\n\n")

        # Write header
        f.write(f"{'Dataset':<12} {'Test Acc':<10} {'Val Acc':<10} {'Features':<12} {'Reduction':<12} {'Classes':<8}\n")
        f.write("-" * 70 + "\n")

        # Write all dataset entries
        for dataset_name, dataset_results in all_results.items():
            perf = dataset_results['performance_metrics']
            feat = dataset_results['feature_selection']
            info = dataset_results['dataset_info']

            f.write(f"{dataset_name:<12} {perf['test_accuracy']:<10.4f} {perf['validation_accuracy']:<10.4f} "
                   f"{feat['selected_features_count']}/{feat['total_features']:<12} "
                   f"{feat['feature_reduction_percentage']:<12.1f}% "
                   f"{info['num_classes']:<8}\n")

        f.write("\n" + "=" * 70 + "\n")
        f.write("LATEST RUN DETAILS:\n")
        f.write("=" * 70 + "\n\n")

        # Add details for current run
        f.write(f"Dataset: {DATASET_CHOICE}\n")
        f.write(f"Test Accuracy: {test_accuracy:.4f}\n")
        f.write(f"Validation Accuracy: {val_accuracy:.4f}\n")
        f.write(f"Selected Features: {len(selected_indices)}/{n_features}\n")
        f.write(f"Feature Reduction: {((n_features - len(selected_indices)) / n_features * 100):.1f}%\n")
        f.write(f"Number of Classes: {n_classes}\n")
        f.write(f"Best Fitness: {best_fitness:.4f}\n")
        f.write(f"Classes: {', '.join(list(mapping.values()))}\n")
        f.write(f"Timestamp: {results_entry['timestamp']}\n")

    print(f"📋 Summary updated in: {summary_file}")

    # Print current results to console
    print(f"\n📊 {DATASET_CHOICE.upper()} RESULTS:")
    print("-" * 50)
    print(f"Test Accuracy:       {test_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Selected Features:   {len(selected_indices)}/{n_features}")
    print(f"Feature Reduction:   {((n_features - len(selected_indices)) / n_features * 100):.1f}%")
    print(f"Best Fitness:        {best_fitness:.4f}")

    return results_entry

# Execute results consolidation
try:
    consolidated_results = save_consolidated_results(
        test_accuracy, val_accuracy, best_fitness, selected_indices,
        n_features, n_classes, mapping, X_train_pca, X_test_pca
    )
    print("\n✅ All results consolidated successfully!")

except Exception as e:
    print(f"⚠️ Error in consolidating results: {e}")
    print("Continuing with existing results...")

print("=" * 60)


📊 RESULTS CONSOLIDATION
💾 Results appended to: ga_ann_ssn_datasets_results.json
📋 Summary updated in: ga_ann_ssn_datasets_summary.txt

📊 LC25000 RESULTS:
--------------------------------------------------
Test Accuracy:       0.9756
Validation Accuracy: 0.9916
Selected Features:   148/325
Feature Reduction:   54.5%
Best Fitness:        0.1629

✅ All results consolidated successfully!


In [13]:
import json
import os

# --- Configuration (Ensure these match your saved filenames) ---
SUMMARY_FILE = "ga_ann_ssn_datasets_summary.txt"
JSON_FILE = "ga_ann_ssn_datasets_results.json"
# ----------------------------------------------------------------

print("="*60)
print("📄 CONSOLIDATED RESULTS SUMMARY (TXT)")
print("="*60)

if os.path.exists(SUMMARY_FILE):
    with open(SUMMARY_FILE, 'r') as f:
        summary_content = f.read()
    print(summary_content)
else:
    print(f"❌ Error: Summary file '{SUMMARY_FILE}' not found.")
    print("Please ensure the file was created in the correct directory.")

print("\n" + "="*60)
print("💡 FULL JSON RESULTS (DETAIL)")
print("="*60)

if os.path.exists(JSON_FILE):
    with open(JSON_FILE, 'r') as f:
        json_results = json.load(f)

    # Use json.dumps for neat, formatted printing
    print(json.dumps(json_results, indent=4))
else:
    print(f"❌ Error: JSON file '{JSON_FILE}' not found.")
    print("Please ensure the file was created in the correct directory.")

print("="*60)

📄 CONSOLIDATED RESULTS SUMMARY (TXT)
GA-ANN-SSN ALL DATASETS SUMMARY

Dataset      Test Acc   Val Acc    Features     Reduction    Classes 
----------------------------------------------------------------------
plants       0.9246     0.9743     171/331          48.3        % 30      
aid          0.7600     1.0000     160/327          51.1        % 30      
lc25000      0.9756     0.9916     148/325          54.5        % 5       

LATEST RUN DETAILS:

Dataset: lc25000
Test Accuracy: 0.9756
Validation Accuracy: 0.9916
Selected Features: 148/325
Feature Reduction: 54.5%
Number of Classes: 5
Best Fitness: 0.1629
Classes: colon_aca, colon_n, lung_aca, lung_n, lung_scc
Timestamp: Not available


💡 FULL JSON RESULTS (DETAIL)
{
    "plants": {
        "dataset": "plants",
        "performance_metrics": {
            "test_accuracy": 0.9246164109406271,
            "validation_accuracy": 0.9742574257425742,
            "best_fitness": 0.2903644322136141,
            "performance_gap": -0.049