In [1]:
# Import future annotations for type hints compatibility
from __future__ import annotations


# Standard library
import inspect
import time
import urllib.error
import warnings
from pathlib import Path

# Third-party
import ipywidgets as widgets
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import seaborn as sns
from IPython.display import display
from numpy.linalg import pinv, svd
from scipy.signal import savgol_filter, find_peaks
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Set plotting style using Seaborn: white-grid axes style and the "notebook"
# context with fonts scaled by 1.2. Applies to figures drawn after this cell runs.
sns.set_style('whitegrid')
sns.set_context("notebook", font_scale=1.2)

In [2]:
# Memory-efficient CSV loader for local paths or URLs
def read_large_csv(path: str | Path) -> np.ndarray:
    """Load a headerless, all-numeric CSV from a local path or URL and validate it.

    The last column is treated as the label column. It is only checked here;
    the returned array still contains every column.

    Args:
        path (str | Path): File path or URL handed to ``pandas.read_csv``.

    Returns:
        np.ndarray: The whole table as a 2-D ``float64`` array.

    Raises:
        ValueError: If the table has no rows, has fewer than two columns,
            contains values that cannot be converted to ``float64``, has NaN
            labels, or if the URL request fails.
        FileNotFoundError: If the local file does not exist.
        RuntimeError: For any other error while reading/parsing the file.
    """
    try:
        frame = pd.read_csv(path, header=None)
    except urllib.error.URLError as e:
        raise ValueError(f"Failed to load CSV from {path}: {e}") from e
    except FileNotFoundError as e:
        raise FileNotFoundError(f"File not found at {path}") from e
    except Exception as e:
        raise RuntimeError(f"Error loading CSV from {path}: {e}") from e

    # Validation is done outside the try block above on purpose: raising
    # ValueError inside it would be caught by `except Exception` and surface
    # as RuntimeError, hiding the real cause from callers.
    try:
        data = frame.to_numpy(dtype=np.float64)
    except (TypeError, ValueError) as e:
        raise ValueError(f"Dataset {path} contains non-numeric values: {e}") from e

    if data.shape[1] < 2:  # Need at least one feature column plus the label column
        raise ValueError(f"Dataset {path} has insufficient columns: {data.shape[1]}")
    if data.shape[0] == 0:  # No rows means no labels at all
        raise ValueError(f"Dataset {path} has no valid labels")
    if np.any(np.isnan(data[:, -1])):  # Missing label cells are read as NaN
        raise ValueError(f"Dataset {path} contains non-numeric or NaN labels")
    return data

# Outlier detection
def is_outlier_mean(x: np.ndarray, thresh: float = 3.0) -> np.ndarray:
    """Flag entries that lie further than ``thresh`` standard deviations from the mean.

    Mean and standard deviation are computed with the NaN-ignoring NumPy
    variants, so NaN entries do not affect the statistics.

    Args:
        x (np.ndarray): Values to test.
        thresh (float, optional): Multiplier applied to the standard deviation. Defaults to 3.0.

    Returns:
        np.ndarray: Boolean mask, ``True`` where the entry is an outlier.
    """
    centre = np.nanmean(x)
    spread = np.nanstd(x)
    cutoff = thresh * spread
    distance = np.abs(x - centre)
    return distance > cutoff

# Custom scorer for AUC-ROC with multiclass support (one-vs-rest).
# scikit-learn 1.4 added `response_method` and deprecated `needs_proba`, which
# was removed in 1.6. On releases without `needs_proba` the old keyword would be
# forwarded to roc_auc_score and fail at scoring time, so pick the keyword the
# installed version actually understands.
if "response_method" in inspect.signature(make_scorer).parameters:
    auc_scorer = make_scorer(roc_auc_score, response_method="predict_proba", multi_class='ovr')
else:
    auc_scorer = make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr')

# Classification AUC-ROC
def classification_auc(
    data: np.ndarray,
    feature_idx: list[int],
    clf_name: str = "svm",
    folds: int = 5,
    seed: int = 42
) -> float:
    """Calculate the cross-validated AUC-ROC score for a given classifier.

    Features are standardised inside each cross-validation fold (the scaler is
    part of the fitted pipeline), so held-out rows never influence the scaling
    statistics used for training.

    Args:
        data (np.ndarray): The full dataset; the last column holds the labels.
        feature_idx (list[int]): Column indices of the features to use.
        clf_name (str, optional): One of 'svm', 'rf', 'lr', 'dt', 'knn'. Defaults to "svm".
        folds (int, optional): Number of cross-validation folds. Defaults to 5.
        seed (int, optional): Random seed for reproducibility. Defaults to 42.

    Returns:
        float: Mean AUC-ROC score as a percentage, or NaN if scoring raised
        a ``ValueError`` (the error is printed).

    Raises:
        ValueError: If ``clf_name`` is not a known classifier name.
    """
    X = data[:, feature_idx]
    y = data[:, -1]

    if clf_name == "svm":
        # probability=True fits an internal, randomised calibration step;
        # without random_state the scores differ from call to call.
        clf = SVC(kernel="rbf", gamma="scale", C=1.0, probability=True, random_state=seed)
    elif clf_name == "rf":
        clf = RandomForestClassifier(n_estimators=30, random_state=seed)
    elif clf_name == "lr":
        clf = LogisticRegression(random_state=seed, max_iter=1000)
    elif clf_name == "dt":
        clf = DecisionTreeClassifier(random_state=seed)
    elif clf_name == "knn":
        clf = KNeighborsClassifier(n_neighbors=5)
    else:
        raise ValueError(f"Unknown classifier: {clf_name}. Use 'svm', 'rf', 'lr', 'dt', or 'knn'.")

    # Scaling lives in the pipeline so it is re-fitted on each training fold only.
    model = make_pipeline(StandardScaler(), clf)
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    try:
        auc = cross_val_score(model, X, y, cv=cv, scoring=auc_scorer).mean() * 100.0
    except ValueError as e:
        # Best effort: report the problem and let the caller see NaN instead of crashing.
        print(f"Error in AUC calculation for {clf_name}: {e}")
        print(f"X shape: {X.shape}, y unique values: {np.unique(y)}")
        auc = np.nan
    return auc

# Classification Accuracy
def classification_accuracy(
    data: np.ndarray,
    feature_idx: list[int],
    clf_name: str = "svm",
    folds: int = 5,
    seed: int = 42
) -> float:
    """Calculate the cross-validated accuracy for a given classifier.

    Features are standardised inside each cross-validation fold (the scaler is
    part of the fitted pipeline), so held-out rows never influence the scaling
    statistics used for training.

    Args:
        data (np.ndarray): The full dataset; the last column holds the labels.
        feature_idx (list[int]): Column indices of the features to use.
        clf_name (str, optional): One of 'svm', 'rf', 'lr', 'dt', 'knn'. Defaults to "svm".
        folds (int, optional): Number of cross-validation folds. Defaults to 5.
        seed (int, optional): Random seed for reproducibility. Defaults to 42.

    Returns:
        float: Mean accuracy as a percentage, or NaN if scoring raised
        a ``ValueError`` (the error is printed).

    Raises:
        ValueError: If ``clf_name`` is not a known classifier name.
    """
    X = data[:, feature_idx]
    y = data[:, -1]

    if clf_name == "svm":
        clf = SVC(kernel="rbf", gamma="scale", C=1.0)
    elif clf_name == "rf":
        clf = RandomForestClassifier(n_estimators=30, random_state=seed)
    elif clf_name == "lr":
        clf = LogisticRegression(random_state=seed, max_iter=1000)
    elif clf_name == "dt":
        clf = DecisionTreeClassifier(random_state=seed)
    elif clf_name == "knn":
        clf = KNeighborsClassifier(n_neighbors=5)
    else:
        raise ValueError(f"Unknown classifier: {clf_name}. Use 'svm', 'rf', 'lr', 'dt', or 'knn'.")

    # Scaling lives in the pipeline so it is re-fitted on each training fold only.
    model = make_pipeline(StandardScaler(), clf)
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    try:
        acc = cross_val_score(model, X, y, cv=cv, scoring="accuracy").mean() * 100.0
    except ValueError as e:
        # Best effort: report the problem and let the caller see NaN instead of crashing.
        print(f"Error in accuracy calculation for {clf_name}: {e}")
        print(f"X shape: {X.shape}, y unique values: {np.unique(y)}")
        acc = np.nan
    return acc



In [3]:
# Column-wise normalization
def normc(A: np.ndarray) -> np.ndarray:
    """Normalize each column of a 2-D array independently.

    Thin wrapper around ``sklearn.preprocessing.normalize`` called with
    ``axis=0``; the norm is whatever that function uses by default
    (L2 according to the scikit-learn documentation — confirm for the
    installed version).

    Args:
        A (np.ndarray): The input array to normalize.

    Returns:
        np.ndarray: The column-normalized array.
    """
    return normalize(A, axis=0)

# Placeholder for Frobenius-penalized DNN
def frobenius_dnn(X, y):
    """Stand-in for Frobenius-penalized DNN feature selection (Li, 2023).

    The autoencoder-based method is not implemented yet; ``y`` is ignored and
    the leading columns of ``X`` are returned as the "selection".

    Args:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Target vector (unused).

    Returns:
        list: Indices ``0 .. n-1`` where ``n`` is the smaller of 50 and the
        number of columns in ``X``.
    """
    n_columns = X.shape[1]
    n_keep = 50 if n_columns > 50 else n_columns  # Dummy: cap at the first 50 features
    return [idx for idx in range(n_keep)]

# Placeholder for FREEFORM
def freeform_llm(X, y):
    """Stand-in for FREEFORM, the LLM-based feature selection (Lee et al., 2024).

    The real method is not implemented yet; ``y`` is ignored and the leading
    columns of ``X`` are returned as the "selection".

    Args:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Target vector (unused).

    Returns:
        list: Indices ``0 .. n-1`` where ``n`` is the smaller of 50 and the
        number of columns in ``X``.
    """
    _, total_features = X.shape[0], X.shape[1]
    limit = min(total_features, 50)  # Dummy: select first 50 features
    chosen = list(range(limit))
    return chosen

# DRPT feature selection (void function)
def run_feature_selection(
    dataset: str,
    clf_name: str,
    base_dir: str = "https://raw.githubusercontent.com/def-abraham/Projects/refs/heads/main/Project%203",
    clusters: int = 50,
    run_iter: int = 10,
    t_perturb: int = 50,
    seed: int = 0,
    return_features: bool = False,
    display_results: bool = True
) -> list[int] | None:
    """Perform DRPT feature selection and evaluate the ranking with a classifier.

    For each of ``run_iter`` runs the function draws a stratified 70% subsample,
    drops features whose pseudo-inverse solution coefficient is small, measures
    how much each remaining coefficient moves when the design matrix is randomly
    perturbed (Δx), groups features by rounded smoothed Δx and rounded entropy,
    ranks one representative per group, and scores growing prefixes of that
    ranking with ``classification_auc`` and ``classification_accuracy``.

    Note:
        * The remaining 30% of each split is not used; classifier scores are
          cross-validated on the full (permuted) dataset.
        * ``RuntimeWarning`` is silenced process-wide via ``warnings.filterwarnings``.
        * The Δx plots show the quantities of the last run only.

    Args:
        dataset (str): Name of the dataset file.
        clf_name (str): Name of the classifier to use.
        base_dir (str, optional): Base directory or URL for dataset files. Defaults to GitHub raw URL.
        clusters (int, optional): Maximum number of ranked features evaluated per run. Defaults to 50.
        run_iter (int, optional): Number of iterations/runs. Defaults to 10.
        t_perturb (int, optional): Number of perturbations. Defaults to 50.
        seed (int, optional): Random seed for reproducibility. Defaults to 0.
        return_features (bool, optional): Whether to return selected features. Defaults to False.
        display_results (bool, optional): Whether to display per-run results and plots. Defaults to True.

    Returns:
        list[int] | None: Feature indices picked in the run with the highest
        AUC-ROC if ``return_features`` is True, otherwise None.

    Raises:
        ValueError: If ``run_iter`` or ``t_perturb`` is smaller than 1.
    """
    if run_iter < 1:
        raise ValueError(f"run_iter must be at least 1, got {run_iter}")
    if t_perturb < 1:
        raise ValueError(f"t_perturb must be at least 1, got {t_perturb}")

    warnings.filterwarnings("ignore", category=RuntimeWarning)
    rng = np.random.default_rng(seed=seed)

    # Track time and memory
    start_time = time.time()
    start_memory = psutil.Process().memory_info().rss / 1024**3  # GB

    # Construct URL for GitHub-hosted dataset
    csv_url = f"{base_dir}/{dataset}"
    print(f"\nLoading {dataset} from {csv_url} ...")
    data = read_large_csv(csv_url)
    data = rng.permutation(data)
    org_data = data.copy()

    # Pre-allocate result arrays.
    # -1 marks an empty slot: 0 is a legitimate feature index and must not
    # double as the "nothing stored here" sentinel.
    features_picked = np.full((clusters, run_iter), -1, dtype=int)
    max_auc = np.zeros((run_iter, 2))
    max_acc = np.zeros((run_iter, 2))
    auc_per_k = [[] for _ in range(run_iter)]  # Store AUC for each k
    acc_per_k = [[] for _ in range(run_iter)]  # Store accuracy for each k

    # Runs
    for run in range(run_iter):
        # Stratified 70% subsample (per label) used to fit the selection statistics
        labels = np.unique(data[:, -1])
        train_idx = []
        for lbl in labels:
            idx_lbl = np.where(data[:, -1] == lbl)[0]
            rng.shuffle(idx_lbl)
            k = int(np.floor(0.7 * len(idx_lbl)))
            train_idx.extend(idx_lbl[:k])
        rng.shuffle(train_idx)
        data_train = data[train_idx, :]

        # Variables
        A = data_train[:, :-1]
        B = data_train[:, -1]
        c_all = A.shape[1]

        # Normalise columns
        A = normc(A)
        C = A.copy()

        # Irrelevant-feature removal: coefficients of the pseudo-inverse solution
        X = pinv(A) @ B
        cleaned_F = np.arange(c_all)

        # Iterative outlier elimination (at most 4 refinement passes)
        outliers_len = c_all
        list_X = np.zeros((5, c_all))
        list_X[0, :] = X
        ii = 0
        while outliers_len > (c_all * 0.021):
            mask = is_outlier_mean(np.abs(list_X[ii, :outliers_len]))
            tmp = np.where(mask)[0]
            outliers_len = len(tmp)
            if outliers_len == 0 or ii == 4:
                break
            ii += 1
            list_X[ii, :outliers_len] = list_X[ii - 1, tmp]
        if outliers_len < 10:
            outliers_len = is_outlier_mean(np.abs(X)).sum()

        # Threshold based on local maxima
        peaks, _ = find_peaks(np.abs(X))
        threshold = np.mean(np.abs(X)[peaks]) if len(peaks) > 0 else np.mean(np.abs(X))

        # Iteratively raise the threshold until few enough features remain.
        # Guards: a non-positive (or NaN) threshold can never remove anything and
        # would loop forever; an empty feature set would break the SVD below.
        target_size = outliers_len * (2 / max(ii, 1))
        while threshold > 0 and len(cleaned_F) > target_size:
            irr_F = np.where(np.abs(X) < threshold)[0]
            threshold *= 1.03
            remaining = np.setxor1d(np.arange(c_all), irr_F, assume_unique=True)
            if len(remaining) == 0:
                break
            cleaned_F = remaining

        A = A[:, cleaned_F]
        C = C[:, cleaned_F]
        c_all = len(cleaned_F)

        # Perturbation matrix: noise scaled relative to the smallest singular value
        singular_vals = svd(A, compute_uv=False)
        smallest_A = singular_vals.min()
        X = pinv(A) @ B
        m_error = 10 ** -3 * smallest_A

        # Each task gets its own seed drawn from the parent generator. Sharing
        # `rng` through the closure would hand every worker a copy of the same
        # generator state, i.e. the same "random" perturbation over and over.
        task_seeds = rng.integers(0, 2**32, size=t_perturb)

        def _single_perturb(task_seed):
            local_rng = np.random.default_rng(int(task_seed))
            per_val = m_error * local_rng.random(A.shape)
            pA = A + per_val
            DX = np.abs(pinv(pA) @ B - X)
            return DX

        px = joblib.Parallel(n_jobs=-1)(
            joblib.delayed(_single_perturb)(task_seed) for task_seed in task_seeds
        )
        pX = np.mean(px, axis=0)
        ent = -np.nansum(C * np.log(C + 1e-12), axis=0).real

        # Smooth Δx. The Savitzky-Golay window must be odd, larger than the
        # polynomial order and no longer than the signal, so shrink it when
        # only a few features survived and skip smoothing when it cannot fit.
        window = min(11, len(pX))
        if window % 2 == 0:
            window -= 1
        if window > 3:
            pX_smoothed = savgol_filter(pX, window, polyorder=3)
        else:
            pX_smoothed = pX.copy()

        # Coarsen the smoothed Δx until at most 50 distinct values remain
        rounded_pX = pX_smoothed.copy()
        for round_metric in range(20, -1, -1):
            rounded_pX = np.round(pX_smoothed, round_metric)
            if len(np.unique(rounded_pX)) <= 50:
                break

        # One representative per (Δx group, entropy group): the largest |x_i|
        selected = []
        for key in np.unique(rounded_pX):
            idx_px = np.where(rounded_pX == key)[0]
            filtered_ent = ent[idx_px]
            rounded_ent = filtered_ent.copy()
            for round_metric in range(5, -1, -1):
                rounded_ent = np.round(filtered_ent, round_metric)
                if len(np.unique(rounded_ent)) <= 20:
                    break
            for ent_val in np.unique(rounded_ent):
                idx_ent = idx_px[np.where(rounded_ent == ent_val)[0]]
                idx_sorted = idx_ent[np.argsort(np.abs(X[idx_ent]))[::-1]]
                selected.append(idx_sorted[0])
        selected = np.asarray(selected, dtype=int)

        # Final ranking with entropy and |x_i|
        ent_normalized = (ent - ent.min()) / (ent.max() - ent.min() + 1e-12)
        x_abs_normalized = (np.abs(X) - np.abs(X).min()) / (np.abs(X).max() - np.abs(X).min() + 1e-12)
        scores = 0.5 * ent_normalized + 0.5 * x_abs_normalized
        # argsort yields positions inside `selected`; map them back through
        # `selected` before translating to original column indices.
        order = np.argsort(scores[selected])[::-1]
        ranked_features = cleaned_F[selected[order]]
        upper_band = min(clusters, len(ranked_features))

        # Classify with incremental subsets
        best_auc, best_k_auc = 0.0, 0
        best_acc, best_k_acc = 0.0, 0
        for k in range(1, upper_band + 1):
            centres = ranked_features[:k]
            auc = classification_auc(org_data, centres.tolist(), clf_name)
            acc = classification_accuracy(org_data, centres.tolist(), clf_name)
            auc_per_k[run].append((k, auc))
            acc_per_k[run].append((k, acc))
            if auc > best_auc:
                best_auc, best_k_auc = auc, k
                features_picked[:, run] = -1
                features_picked[:k, run] = centres
            if acc > best_acc:
                best_acc, best_k_acc = acc, k
            if best_auc == 100.0 and best_acc == 100.0:
                break

        max_auc[run, :] = (best_auc, best_k_auc)
        max_acc[run, :] = (best_acc, best_k_acc)
        if display_results:
            print(f"Run: {run+1}, Selected Features = {best_k_auc:2d}, AUC-ROC = {best_auc:5.2f}%, Accuracy = {best_acc:5.2f}%")

    # Summary (calculated once, outside per-run display)
    ave_auc = max_auc[:, 0].mean()
    ave_f_auc = max_auc[:, 1].mean()
    ave_acc = max_acc[:, 0].mean()
    ave_f_acc = max_acc[:, 1].mean()
    best_run = int(np.argmax(max_auc[:, 0]))
    best_subset = features_picked[:, best_run]
    best_subset = best_subset[best_subset >= 0]
    full_auc = classification_auc(org_data, list(range(org_data.shape[1]-1)), clf_name)
    full_acc = classification_accuracy(org_data, list(range(org_data.shape[1]-1)), clf_name)
    print("\n------------------------------------------------------------------")
    print(f"\nSelected Features (mean) = {ave_f_auc:.2f}")
    print(f"\nAUC-ROC (mean) = {ave_auc:.2f}")
    print(f"\nAUC-ROC (original) = {full_auc:.2f}")
    print(f"\nSelected Features (mean, Accuracy) = {ave_f_acc:.2f}")
    print(f"\nAccuracy (mean) = {ave_acc:.2f}")
    print(f"\nAccuracy (original) = {full_acc:.2f}")
    print(f"\nStandard Deviation of Selected Features (AUC) = {np.std(max_auc[:,1]):.2f}")
    print(f"\nStandard Deviation of AUC-ROC = {np.std(max_auc[:,0]):.2f}")
    print(f"\nStandard Deviation of Selected Features (Accuracy) = {np.std(max_acc[:,1]):.2f}")
    print(f"\nStandard Deviation of Accuracy = {np.std(max_acc[:,0]):.2f}")
    print("\nOptimal subset =", best_subset)

    # Performance metrics
    end_time = time.time()
    end_memory = psutil.Process().memory_info().rss / 1024**3
    running_time = end_time - start_time
    memory_usage = end_memory - start_memory
    print(f"\nRunning Time: {running_time:.2f} seconds")
    print(f"Memory Usage: {memory_usage:.2f} GB")

    if display_results:
        # Plot 1: AUC-ROC vs. Number of Features
        plt.figure(figsize=(10, 6))
        max_k = max(len(run_auc) for run_auc in auc_per_k)
        mean_auc = np.full(max_k, np.nan)
        std_auc = np.full(max_k, np.nan)
        for k in range(max_k):
            # Runs may stop early, so only average the runs that reached this k
            aucs = [run_auc[k][1] for run_auc in auc_per_k if k < len(run_auc)]
            if aucs:
                mean_auc[k] = np.mean(aucs)
                std_auc[k] = np.std(aucs)
        k_values = range(1, max_k + 1)
        plt.plot(k_values, mean_auc, label='Mean AUC-ROC', color='blue', marker='o')
        plt.fill_between(k_values, mean_auc - std_auc, mean_auc + std_auc, alpha=0.2, color='blue', label='±1 Std')
        plt.axhline(y=full_auc, color='red', linestyle='--', label='All Features AUC-ROC')
        plt.xlabel('Number of Features')
        plt.ylabel('AUC-ROC (%)')
        plt.title(f'AUC-ROC vs. Number of Features (DRPT on {dataset})')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        # Plot 2: Accuracy vs. Number of Features
        plt.figure(figsize=(10, 6))
        mean_acc = np.full(max_k, np.nan)
        std_acc = np.full(max_k, np.nan)
        for k in range(max_k):
            accs = [run_acc[k][1] for run_acc in acc_per_k if k < len(run_acc)]
            if accs:
                mean_acc[k] = np.mean(accs)
                std_acc[k] = np.std(accs)
        plt.plot(k_values, mean_acc, label='Mean Accuracy', color='green', marker='o')
        plt.fill_between(k_values, mean_acc - std_acc, mean_acc + std_acc, alpha=0.2, color='green', label='±1 Std')
        plt.axhline(y=full_acc, color='red', linestyle='--', label='All Features Accuracy')
        plt.xlabel('Number of Features')
        plt.ylabel('Accuracy (%)')
        plt.title(f'Accuracy vs. Number of Features (DRPT on {dataset})')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        # Plot 3: Feature Selection Frequency
        all_features = features_picked[features_picked >= 0]
        if len(all_features) > 0:
            plt.figure(figsize=(12, 6))
            unique, counts = np.unique(all_features, return_counts=True)
            sns.barplot(x=unique, y=counts, color='skyblue')
            plt.xlabel('Feature Index')
            plt.ylabel('Selection Frequency')
            plt.title(f'Feature Selection Frequency Across {run_iter} Runs')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

        # Plot 4: AUC-ROC Distribution Across Runs
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=max_auc[:, 0], color='lightgreen', width=0.4)
        plt.axhline(y=full_auc, color='red', linestyle='--', label='All Features AUC-ROC')
        plt.ylabel('AUC-ROC (%)')
        plt.title('AUC-ROC Distribution Across Runs (DRPT)')
        plt.legend()
        plt.tight_layout()
        plt.show()

        # Plot 5: Accuracy Distribution Across Runs
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=max_acc[:, 0], color='lightblue', width=0.4)
        plt.axhline(y=full_acc, color='red', linestyle='--', label='All Features Accuracy')
        plt.ylabel('Accuracy (%)')
        plt.title('Accuracy Distribution Across Runs (DRPT)')
        plt.legend()
        plt.tight_layout()
        plt.show()

        # Plot 6: Feature Importance Stability Scatter Plot
        plt.figure(figsize=(12, 8))
        feature_presence = np.zeros((run_iter, org_data.shape[1] - 1))
        for run_no in range(run_iter):
            picked = features_picked[:, run_no][features_picked[:, run_no] >= 0]
            feature_presence[run_no, picked] = 1
        # Find coordinates of selected features
        selected_features_x = []
        selected_features_y = []
        for run_no in range(run_iter):
            for feature in range(org_data.shape[1] - 1):
                if feature_presence[run_no, feature] == 1:
                    selected_features_x.append(feature)
                    selected_features_y.append(run_no)
        plt.scatter(selected_features_x, selected_features_y, color='blue', s=50, alpha=0.6)
        plt.xlabel('Feature Index')
        plt.ylabel('Run')
        plt.title(f'Feature Selection Stability Across {run_iter} Runs')
        plt.tight_layout()
        plt.show()

        # Plot 7: Δx vs. Smoothed Δx (values from the last run)
        plt.figure(figsize=(10, 6))
        plt.plot(pX, label='Δx', color='orange')
        plt.plot(pX_smoothed, label='Smoothed Δx', color='blue')
        plt.xlabel('Feature Index')
        plt.ylabel('Δx Value')
        plt.title(f'Δx vs. Smoothed Δx (DRPT on {dataset})')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        # Plot 8: Clustering of Smoothed Δx (values from the last run)
        plt.figure(figsize=(10, 6))
        plt.step(range(len(np.unique(rounded_pX))), np.unique(rounded_pX), label='Sorted Smoothed Δx', color='blue')
        plt.xlabel('Cluster Index')
        plt.ylabel('Smoothed Δx Value')
        plt.title(f'Clustering of Smoothed Δx (DRPT on {dataset})')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    # Return features only if requested
    if return_features:
        return best_subset.tolist()
    return None


In [None]:
# Benchmarking wrapper function
def benchmark_methods(
    dataset: str,
    methods: list[str],
    clf_name: str,
    base_dir: str = "https://raw.githubusercontent.com/def-abraham/Projects/refs/heads/main/Project%203",
) -> None:
    """Benchmark multiple feature selection methods on a dataset with a single classifier.

    For every method the selected features are evaluated on growing prefixes
    (1 feature, 2 features, ... up to 50) and the mean/std of the resulting
    AUC-ROC and accuracy values are printed. Elapsed time and the change in
    resident memory are printed at the end.

    Args:
        dataset (str): Name of the dataset file.
        methods (list[str]): Methods to benchmark: 'drpt', 'dnn' or 'freeform'.
        clf_name (str): Name of the classifier to use.
        base_dir (str, optional): Base directory or URL for dataset files.
            Defaults to the GitHub raw URL also used by ``run_feature_selection``.

    Raises:
        ValueError: If an entry of ``methods`` is not a known method name.
    """
    start_time = time.time()
    start_memory = psutil.Process().memory_info().rss / 1024**3
    print(f"\nBenchmarking on {dataset} using classifier {clf_name.upper()}")

    # Load and preprocess data
    csv_url = f"{base_dir}/{dataset}"
    try:
        data = read_large_csv(csv_url)
        data = np.random.default_rng(seed=0).permutation(data)
        X_full = data[:, :-1]
        y = data[:, -1]
    except Exception as e:
        # Best effort for interactive use: report and give up on this dataset.
        print(f"Failed to load dataset {dataset}: {e}")
        return

    for method in methods:
        print(f"\n--- Running {method.upper()} ---")
        if method == "drpt":
            # DRPT loads the dataset itself, so hand it the same location.
            selected_features = run_feature_selection(
                dataset, clf_name, base_dir=base_dir, return_features=True, display_results=True
            )
        elif method == "dnn":
            selected_features = frobenius_dnn(X_full, y)
        elif method == "freeform":
            selected_features = freeform_llm(X_full, y)
        else:
            raise ValueError(f"Unknown method: {method}")

        # Nothing to score; averaging an empty list would only print NaN.
        if not selected_features:
            print(f"{method.upper()} selected no features; skipping evaluation")
            continue

        # Evaluate with selected classifier on growing prefixes of the selection
        auc_scores = []
        acc_scores = []
        top_k = min(50, len(selected_features))
        for k in range(1, top_k + 1):
            subset = selected_features[:k]
            auc_scores.append(classification_auc(data, subset, clf_name))
            acc_scores.append(classification_accuracy(data, subset, clf_name))
        print(f"{method.upper()} with {clf_name.upper()}: Mean AUC-ROC = {np.mean(auc_scores):.2f}, Std AUC = {np.std(auc_scores):.2f}")
        print(f"{method.upper()} with {clf_name.upper()}: Mean Accuracy = {np.mean(acc_scores):.2f}, Std Acc = {np.std(acc_scores):.2f}")

    end_time = time.time()
    end_memory = psutil.Process().memory_info().rss / 1024**3
    print(f"\nBenchmark Time: {end_time - start_time:.2f} seconds")
    print(f"Memory Usage: {end_memory - start_memory:.2f} GB")

# Interactive widgets for Jupyter
# Choices offered by the three dropdowns; the first entry of each list is preselected.
classifiers = ["svm", "rf", "dt", "knn", "lr"]
methods = ["drpt", "dnn", "freeform"]
datasets = ["GDS1615_full_NoFeature.csv", "GDS968_full_NoFeature.csv", "GDS531_full_NoFeature.csv"]

classifier_dropdown = widgets.Dropdown(description='Classifier:', options=classifiers, value=classifiers[0])
method_dropdown = widgets.Dropdown(description='Method:', options=methods, value=methods[0])
dataset_dropdown = widgets.Dropdown(description='Dataset:', options=datasets, value=datasets[0])

# Button that starts the analysis, and the area that collects its printed output and plots
run_button = widgets.Button(description='Run Analysis', tooltip='Click to run the analysis', button_style='success')
output = widgets.Output()

def on_run_button_clicked(b):
    """Run the benchmark for the current dropdown selection.

    Everything printed or plotted is captured in the ``output`` widget, which
    is cleared first so results of earlier clicks do not pile up.

    Args:
        b: The button widget object (unused).
    """
    with output:
        output.clear_output()
        chosen_clf, chosen_method, chosen_dataset = (
            classifier_dropdown.value,
            method_dropdown.value,
            dataset_dropdown.value,
        )
        print(f"Running {chosen_method.upper()} on {chosen_dataset} using classifier {chosen_clf.upper()}")
        benchmark_methods(chosen_dataset, [chosen_method], chosen_clf)

# Hook up the callback, then render the controls top to bottom with the output area last
run_button.on_click(on_run_button_clicked)
for control in (classifier_dropdown, method_dropdown, dataset_dropdown, run_button, output):
    display(control)

In [8]:
# Load GDS1615 without treating the first row as a header and preview the first rows.
# NOTE(review): execution counts in this part of the notebook are out of order and the
# saved preview does not match the saved shape — re-run with Restart & Run All.
df = pd.read_csv('https://raw.githubusercontent.com/def-abraham/Projects/refs/heads/main/Project%203/GDS1615_full_NoFeature.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13641,13642,13643,13644,13645,13646,13647,13648,13649,13650
0,61.6284,22.4863,209.3067,47.622909,14.80811,293.0835,23.72019,5.944785,514.936,11.636235,...,32.7881,7.78844,43.7247,54.2718,22.6804,127.123,27.4609,34.7591,164.612,0
1,41.70175,30.45485,97.6285,41.967561,9.394665,174.339,18.258022,3.146002,975.5595,5.993805,...,56.8356,5.948395,31.9576,47.7542,12.1321,266.599,5.5491,32.3943,106.759,0
2,49.224875,35.74455,116.7763,46.319378,8.7895,140.313,30.99578,8.696365,509.2745,7.61626,...,49.2216,6.71187,45.9693,68.3708,3.00079,238.479,18.3581,41.5517,67.774,0
3,77.15215,45.8717,233.641,37.81053,10.53233,371.605,22.76824,3.83912,955.617,12.17111,...,62.3203,10.39958,61.4744,62.9321,17.6784,152.241,40.0221,35.523,134.461,0
4,52.1305,36.89575,159.46585,51.533716,16.3215,169.5985,21.497245,9.210775,1389.495,7.926882,...,15.9247,7.63658,44.8789,56.3066,20.3715,141.079,21.8371,28.0897,35.4106,2


In [15]:
# (rows, columns) of the GDS1615 table loaded above
df.shape

(172, 9392)

In [13]:
# Load GDS968 without treating the first row as a header and preview the first rows.
df2 = pd.read_csv('https://raw.githubusercontent.com/def-abraham/Projects/refs/heads/main/Project%203/GDS968_full_NoFeature.csv', header=None)
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9108,9109,9110,9111,9112,9113,9114,9115,9116,9117
0,447.52,21.16,40.375,68.11,6.035,11.91,116.93,942.465,4422.49,476.19,...,946.78,41.5375,600.35,31.44,55.82,245.79,34.46,21.23,7.59,1
1,230.86,81.73,40.375,68.11,6.035,11.91,360.68,998.215,4647.71,534.175,...,614.75,41.5375,377.13,10.86,75.41,245.79,23.51,42.93,13.59,2
2,439.64,75.99,52.04,68.11,6.035,119.88,221.745,1099.47,2851.32,536.565,...,974.87,41.5375,680.93,46.26,112.69,245.79,36.72,38.84,21.46,3
3,396.83,21.52,40.375,68.11,6.035,11.91,101.765,764.195,4002.5,284.855,...,804.35,41.5375,568.92,39.27,84.33,245.79,38.25,2.25,24.26,1
4,140.9,32.54,40.375,68.11,6.035,11.91,109.77,931.075,3608.33,312.575,...,577.47,41.5375,512.46,22.33,50.07,245.79,58.53,2.91,10.66,2


In [16]:
# (rows, columns) of the GDS968 table loaded above
df2.shape

(171, 9118)

In [17]:
# Load GDS531 without treating the first row as a header and preview the first rows.
df3 = pd.read_csv('https://raw.githubusercontent.com/def-abraham/Projects/refs/heads/main/Project%203/GDS531_full_NoFeature.csv', header=None)
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9382,9383,9384,9385,9386,9387,9388,9389,9390,9391
0,2578.4,207.0,272.95,402.733333,30795.8,124.3,1275.75,11151.45,19844.7,1735.65,...,1058.8,38.95,1093.8,40.9,62.9,67.0,104.6,113.7,135.1,1
1,3629.8,96.7,535.0,305.366667,25368.1,48.2,508.4,10110.4,26675.0,2193.25,...,1053.2,44.7,2751.6,10.9,242.1,133.7,113.9,132.7,124.7,1
2,3051.3,354.9,1622.55,502.4,15712.5,78.0,1107.0,10633.65,13376.5,1878.05,...,1663.9,84.25,2660.8,93.3,294.2,159.2,133.6,225.1,46.1,1
3,3997.1,58.9,528.8,380.366667,47804.1,43.5,605.6,9091.05,13327.1,2438.8,...,888.0,46.95,2894.3,27.6,353.4,89.6,109.3,92.9,99.3,1
4,5150.2,95.4,293.75,481.633333,2168.3,22.7,1271.35,10493.65,27011.3,2266.75,...,733.4,88.65,2052.9,18.2,272.0,71.6,44.0,251.8,97.0,1


In [18]:
# (rows, columns) of the GDS531 table loaded above
df3.shape

(173, 9392)

# **Visualization Results and Analysis Example**


![Data](https://github.com/def-abraham/Projects/blob/main/Project%203/images-results/GDS1615-drpt-svm/Screenshot%202025-06-25%20at%2021.30.12.png?raw=true)




![Data](https://github.com/def-abraham/Projects/blob/main/Project%203/images-results/GDS1615-drpt-svm/Screenshot%202025-06-25%20at%2021.30.57.png?raw=true)

![Data](https://github.com/def-abraham/Projects/blob/main/Project%203/images-results/GDS1615-drpt-svm/Screenshot%202025-06-25%20at%2021.31.17.png?raw=true)

![Data](https://github.com/def-abraham/Projects/blob/main/Project%203/images-results/GDS1615-drpt-svm/Screenshot%202025-06-25%20at%2021.31.36.png?raw=true)

![Data](https://github.com/def-abraham/Projects/blob/main/Project%203/images-results/GDS1615-drpt-svm/Screenshot%202025-06-25%20at%2022.23.54.png?raw=true)

![Data](https://github.com/def-abraham/Projects/blob/main/Project%203/images-results/GDS1615-drpt-svm/Screenshot%202025-06-25%20at%2022.24.09.png?raw=true)

NOTE: If the code does not run in your Colab environment, install the scikit-learn version this notebook was developed with by pasting the command below into a code cell and running it.

%pip install scikit-learn==1.5.0