# Computation Notebook 

This notebook is meant to show and compare the computational costs with the grid-based approach (searching all possible gridpoints) versus the gridless approach (binary search) to find decision boundary points. 

We show the times that are associated with each of these.

In [6]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
import numba
from numba import cuda
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import interp1d
import pandas as pd
from sklearn import svm
from scipy.linalg import norm
import numpy as np 

import pandas as pd
from sklearn import svm
import matplotlib.pyplot as plt
import numpy as np
import warnings
import random 
from scipy.interpolate import RBFInterpolator
from scipy.spatial import KDTree
import numba 

random.seed(0)
warnings.filterwarnings('ignore', category=UserWarning)

In [None]:
def closest_point(point, contour):
    """
    Finds the closest point on a contour to a given reference point.

    Parameters:
    -----------
    point : array-like or tuple
        A single point (e.g., [x, y]) for which the nearest contour point is to be found.
    
    contour : array-like of shape (n_points, n_dimensions)
        A list or array of points representing the contour. Each point should have the same dimensionality as `point`.

    Returns:
    --------
    closest_point : array-like
        The point on the contour that is closest to the input `point`.
    """
    
    # Build a KD-tree for fast nearest neighbor search over the contour points
    tree = KDTree(contour)

    # Find the index of the contour point closest to the input point
    closest_index = tree.query(point)[1]

    # If the result is an array (e.g., due to batch input), extract the scalar index
    if not isinstance(closest_index, np.int64): 
        closest_index = closest_index[0]

    # Retrieve the actual closest point using the index
    closest_point = contour[closest_index]

    return closest_point

def closest_border_point(border_points, contour): 
    """
    Finds the point in `border_points` that is closest to any point in the given `contour`.

    Parameters:
    -----------
    border_points : array-like of shape (n_points, n_dimensions)
        A list or array of candidate points (e.g., border or edge points).
    
    contour : array-like of shape (m_points, n_dimensions)
        A list or array of contour points to which the closest distance is measured.

    Returns:
    --------
    min_point : array-like
        The point from `border_points` that is closest to any point in the `contour`.
    """
    
    # Build a KDTree for efficient nearest neighbor queries on contour points
    tree = KDTree(contour)

    # Initialize variables to track the closest border point and the smallest distance found
    min_point = None         # Will hold the closest point from `border_points`
    total_min = float('inf') # Initialize the minimum distance as infinity

    # Iterate through each candidate border point
    for border_point in border_points: 
        # Query the KDTree to find the distance to the closest contour point
        dist, _ = tree.query(border_point)

        # If this distance is the smallest encountered so far, update tracking variables
        if dist < total_min: 
            total_min = dist 
            min_point = border_point 
    
    # Return the border point with the minimum distance to the contour
    return min_point

def euclidean_distance(point1, point2):
    """
    Computes the Euclidean distance between two points.

    Parameters:
    -----------
    point1 : array-like
        The first point (e.g., [x1, y1] or [x1, y1, z1]).
    
    point2 : array-like
        The second point (e.g., [x2, y2] or [x2, y2, z2]).

    Returns:
    --------
    float
        The Euclidean distance between `point1` and `point2`.
    """

    # Convert both points to NumPy arrays, subtract them element-wise,
    # and compute the L2 norm (i.e., Euclidean distance) of the result
    return np.linalg.norm(np.array(point1) - np.array(point2))

In [None]:
def move_from_A_to_B_with_x1_displacement(A, B, deltas, epsilon=1e-3):
    """
    Move from point A to point B in n-dimensional space with a desired movement in the x1 dimension.
    
    Parameters:
    - A: list or np.array, coordinates of the starting point A
    - B: list or np.array, coordinates of the target point B
    - delta_x1: float, the desired displacement in the x1 dimension
    
    Returns:
    - P: np.array, coordinates of the new point after moving delta_x1 along x1-axis
    """
    A = np.array(A)
    B = np.array(B)
    
    # Calculate direction vector from A to B
    D = B - A
    
    # Calculate the scaling factor t for the desired movement in x1
    t = deltas / (D + epsilon)   # D[0] is the x1 component of the direction vector
    
    # Calculate the new point P based on t
    P = A + t * D

    print(t) 
    print(D)
    
    return P

# Example usage
A = [1, 2, 3]  # Starting point in 3D space
B = [4, 5, 6]  # Target point in 3D space
delta_x1 = 1.5  # Desired movement in x1 dimension

P = move_from_A_to_B_with_x1_displacement(A, B, delta_x1)
print("New point P with desired x1 movement:", P)

[0.49983339 0.49983339 0.49983339]
[3 3 3]
New point P with desired x1 movement: [2.49950017 3.49950017 4.49950017]


In [None]:
@numba.njit
def prediction(Z, grid, epsilon): 
    """
    Identify points near the decision boundary by finding close pairs of grid points
    with different predicted classes and computing their midpoints.

    Parameters
    ----------
    Z : numpy.ndarray
        Array of predicted class labels for each point in the grid (shape: (n_grid_points,)).
    
    grid : numpy.ndarray
        Array of grid points in the feature space (shape: (n_grid_points, n_features)).
    
    epsilon : float
        Maximum distance threshold for considering two points as neighbors. Points
        closer than this with different classes are used to compute boundary midpoints.

    Returns
    -------
    list of numpy.ndarray
        List of midpoint arrays representing approximate boundary points.

    Raises
    ------
    None explicitly, but may raise ValueError if shapes of Z and grid mismatch,
    or if numba compilation fails (if @njit is used without numba installed).

    Notes
    -----
    - This function is JIT-compiled with numba for performance, but can run without it.
      Remove @njit if numba is unavailable.
    - Computational complexity is O(n_grid_points^2), which is inefficient for large grids.
      Suitable only for small grids (e.g., low resolution or few features).
    - Uses Euclidean norm (np.linalg.norm) for distance.
    - Usage: Called internally by boundary computation functions to extract transitions.
    """
    boundary_points = []  # List to collect midpoints
    for i in range(len(grid) - 1):  # Outer loop over grid points
            for j in range(i + 1, len(grid)):  # Inner loop to avoid duplicates/self
                # Check if points are close and have different predictions
                if np.linalg.norm(grid[i] - grid[j]) < epsilon and Z[i] != Z[j]:
                    # Append midpoint as boundary approximation
                    boundary_points.append((grid[i] + grid[j]) / 2)  
    return boundary_points

def compute_decision_boundary_points_all_features(model, X, resolution=100, epsilon=0.01):
    """
    Compute decision boundary points in the high-dimensional feature space by
    generating a dense grid, predicting classes, and finding transitions via midpoints.

    Parameters
    ----------
    model : object
        Trained binary or multi-class classifier with a `predict` method that takes
        an array of input points and returns predictions as an array.
        Example: sklearn.linear_model.LogisticRegression instance.
    
    X : pandas.DataFrame
        Input feature dataset (shape: (n_samples, n_features)). Used to determine
        min/max ranges for numeric features and categories for categorical ones.
    
    resolution : int, optional
        Number of points to sample along each feature axis for the grid. Higher values
        increase density but exponentially increase memory/computation. Default is 100.
    
    epsilon : float, optional
        Distance threshold for detecting class changes between grid points. Should be
        tuned based on feature scales (e.g., small for normalized data). Default is 0.01.

    Returns
    -------
    pandas.DataFrame
        DataFrame of unique approximate boundary points, with the same columns as X.

    Raises
    ------
    ValueError
        If grid shapes mismatch during generation, or if model.predict fails.
    
    MemoryError or OverflowError
        Likely for high resolution or many features due to grid size (resolution ** n_features).

    Notes
    -----
    - Generates a full Cartesian grid over all features, flattened to 1D array.
      For n_features = f, grid size = resolution^f, which is feasible only for small f
      (e.g., f<=3) and low resolution (e.g., <=20). For higher dimensions, consider
      sampling or dimensionality reduction instead.
    - For numeric features: Samples evenly from (min-1, max+1).
    - For categorical features: Maps to integer indices (0 to len(categories)-1).
      Note: The code assumes len(categories) == resolution; otherwise, it may produce
      an incorrect grid size (length mismatch). Consider adjusting resolution to match
      max categories or handling categoricals separately (e.g., one-hot encode beforehand).
    - Predictions are made on the entire grid, then boundary midpoints are found.
    - Unique points are taken to remove duplicates.
    - Usage: Visualize high-D boundaries by projecting (e.g., PCA) or for analysis.
      Best for low-D or with small resolution.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = pd.DataFrame({'feat1': [0, 1, 2], 'feat2': [0, 1, 0]})
    >>> model = LogisticRegression().fit(X, [0, 1, 0])
    >>> boundary = compute_decision_boundary_points_all_features(model, X, resolution=10, epsilon=0.1)
    >>> print(boundary.shape)  # e.g., (number_of_boundary_points, 2)
    (15, 2)
    """
    n_features = X.shape[1]  # Number of features
    # A grid that contains resolution^f samples from the f dimensional space where f is the number of features
    grid = np.zeros((resolution ** n_features, n_features))  # Initialize grid array

    # Generates a grid that contains resolution^f samples based on whether the column contains numeric or categorical values 
    # If the column contains numeric types, then the grid generates a column based on subdividing the numeric columns evenly
    for i in range(n_features):
        # Checks if the column is not a column consisting of categorical values
        # If it is not categorical, then the column must be numeric. 
        if not isinstance(X[X.columns[i]].dtype, pd.CategoricalDtype):
            # Sample evenly spaced points, slightly extended beyond data range
            grid[:, i] = np.tile(np.linspace(X.iloc[:, i].min() - 1, X.iloc[:, i].max() + 1, resolution).repeat(resolution ** (n_features - i - 1)), resolution ** i)
        else:
            # For categorical: Get unique categories and map to integers
            cat_array = X.iloc[:, i].astype('category').cat.categories
            cat_array = np.arange(len(cat_array))  # e.g., [0, 1, 2] for 3 categories
            repeats_per_cat = resolution ** (n_features - i - 1)
            tiles = resolution ** i
            col_values = np.tile(np.repeat(cat_array, repeats_per_cat), tiles)
            grid[:, i] = col_values  # Assign to grid; note: length must match resolution^n_features
            
    # Predict the class for each point in the grid
    Z = model.predict(grid)
    # Find points near the decision boundary
    boundary_points = prediction(Z, grid, epsilon)
 
    return pd.DataFrame(np.unique(boundary_points,axis=0), columns=X.columns)  # Unique points as DataFrame

In [None]:

def get_multi_dim_border_points(center, extents, step=0.1):
    """
    Generate points on the boundaries of an n-dimensional hyperrectangle.
    
    Parameters
    ----------
    center : list or numpy.ndarray
        The center of the hyperrectangle, a list or array of length n (number of dimensions).
    
    extents : list or numpy.ndarray
        The full widths (diameters) in each dimension, a list or array of length n.
        Note: The code uses half-widths internally (extents / 2).
    
    step : float, optional
        Step size for sampling points along each dimension's grid. Smaller values
        increase density but computation time. Default is 0.1.

    Returns
    -------
    list of tuples
        Each tuple represents a point on the boundary of the hyperrectangle.

    Raises
    ------
    None explicitly, but may raise ValueError if center and extents have mismatched lengths,
    or TypeError if inputs are not array-like.

    Notes
    -----
    - Uses a set to avoid duplicate points, which can occur at corners/edges.
    - For each dimension, fixes the boundary (min/max) and grids over others.
    - Handles 1D case specially.
    - Suitable for generating boundary samples in constrained optimization or
      visualization of feasible regions in n-D space.
    - Output as list of tuples for easy conversion to arrays if needed.

    Examples
    --------
    >>> center = [0, 0]
    >>> extents = [2, 2]  # Rectangle from (-1,-1) to (1,1)
    >>> points = get_multi_dim_border_points(center, extents, step=0.5)
    >>> print(len(points))  # e.g., number of sampled boundary points
    16
    """
    center = np.array(center)  # Convert center to NumPy array
    extents = np.array(extents)  # Convert extents to NumPy array
    n = len(center)  # Number of dimensions
    points = set()   # Use set to avoid duplicates
    
    # Define min and max bounds for each dimension (using half-extents)
    bounds = [(c - e / 2, c + e / 2) for c, e in zip(center, extents)]
    
    # For each dimension, generate points on the lower and upper boundaries
    for dim in range(n):
        # For lower and upper boundary in this dimension
        for bound_val in [bounds[dim][0], bounds[dim][1]]:
            # Generate grid points for all other dimensions
            other_dims = [i for i in range(n) if i != dim]
            ranges = [np.arange(bounds[i][0], bounds[i][1] + step, step) for i in other_dims]
            if not ranges:  # Handle 1D case
                points.add(tuple([bound_val] if dim == 0 else []))
                continue
            # Create meshgrid for other dimensions
            grids = np.meshgrid(*ranges, indexing='ij')
            coords = [grid.ravel() for grid in grids]
            
            # Construct points
            for coord in zip(*coords):
                point = [0] * n
                # Set the current dimension to the boundary value
                point[dim] = bound_val
                # Set other dimensions to the grid values
                for i, val in zip(other_dims, coord):
                    point[i] = val
                points.add(tuple(point))  # Add as tuple to set
    
    return list(points)

def det_constraints(datapt, deltas): 
    """
    Determine the effective constraints based on deltas, scaling them relative to the data point.

    Parameters
    ----------
    datapt : list or numpy.ndarray
        The data point (feature vector) to scale constraints against.
    
    deltas : list
        List of delta values for each feature. If float/int, it's treated as a percentage
        (e.g., 10 means 10% of datapt[i]); otherwise ignored.

    Returns
    -------
    tuple
        (constraints: list of scaled delta values or -1 if not applicable,
         len_constr: int count of active constraints)

    Raises
    ------
    None explicitly, but may raise TypeError if datapt/deltas are incompatible.

    Notes
    -----
    - Initializes constraints as [-1] * len(deltas), updating only for numeric deltas.
    - Scaling: constraint[i] = (deltas[i] / 100) * datapt[i], assuming percentages.
    - Used to count and quantify constraints for bounded regions.
    - Usage: Pre-process deltas before applying bounds in optimization or counterfactuals.

    Examples
    --------
    >>> datapt = [100, 200]
    >>> deltas = [10, 'none']  # 10% for first, ignore second
    >>> constraints, len_constr = det_constraints(datapt, deltas)
    >>> print(constraints, len_constr)  # [10.0, -1], 1
    [10.0, -1] 1
    """
    constraints = [-1] * len(deltas)  # Initialize with -1 (inactive)
    len_constr = 0  # Counter for active constraints
    for i in range(len(deltas)): 
        if type(deltas[i]) == float or type(deltas[i]) == int:  # Check if numeric
            constraints[i] = (deltas[i]/100)*datapt[i]  # Scale as percentage of datapt
            len_constr+=1  # Increment counter
    return constraints, len_constr

def constraint_bounds(contours, datapt, constraints): 
    """
    Filter contour points to those within specified bounds based on constraints.

    Parameters
    ----------
    contours : numpy.ndarray
        Array of contour points (shape: (n_points, n_features)).
    
    datapt : numpy.ndarray
        The reference data point (shape: (1, n_features)) to center bounds around.
    
    constraints : list
        List of delta values (bounds widths) for each feature; >0 activates filtering.

    Returns
    -------
    numpy.ndarray
        Filtered contour points within the bounds.

    Raises
    ------
    Exception
        If no constraints are assigned (all <=0).

    Notes
    -----
    - For each active constraint, computes [x - delta/2, x + delta/2] and filters.
    - Sequentially applies filters, potentially reducing points cumulatively.
    - Includes plotting of bounds (vertical/horizontal lines for dims 0/1).
    - Assumes 2D for plotting; extend for higher dims if needed.
    - Usage: Constrain boundary points in optimization, e.g., feasible counterfactuals.

    Examples
    --------
    >>> contours = np.array([[0,0], [1,1], [2,2], [3,3]])
    >>> datapt = np.array([[1,1]])
    >>> constraints = [2, 2]  # Bounds width 2 for each
    >>> bounded = constraint_bounds(contours, datapt, constraints)
    >>> print(bounded)  # e.g., array([[0,0], [1,1], [2,2]])
    """
    if len(constraints) == 0: 
        raise Exception("No constraints were assigned.")
    bounded_contour = contours.copy()  # Copy to avoid modifying original
    for i in range(len(constraints)): 
        if constraints[i] > 0:  # Active if >0
            x = datapt[0][i]  # Reference value for dimension i
            # This should just be the delta
            delta_x = constraints[i]
            # Generate a lower and upper bounds on each constraint
            lowb_x, highb_x = x - (delta_x / 2), x + (delta_x / 2)
            contour_arr = bounded_contour[:, i]  # Extract column i
            # Choose the correct indices for the multi-dimensional df
            indices = np.where((contour_arr >= lowb_x) & (contour_arr <= highb_x))
            bounded_contour_pts = bounded_contour[indices]  # Filter rows
            bounded_contour = bounded_contour_pts  # Update
            if i == 0:  # Plot vertical lines for dim 0
                plt.axvline(x=highb_x, color='b', linestyle='-', label='High Bound x')
                plt.axvline(x=lowb_x, color='b', linestyle='-', label='Low bound x')
            else:   # Plot horizontal for dim 1 (assumes 2D)
                plt.axhline(y=highb_x, color='b', linestyle='-', label='High Bound y')
                plt.axhline(y=lowb_x, color='b', linestyle='-', label='Low bound y')
    return bounded_contour

def real_world_constraints(points, undesired_coords, constraints): 
    """
    Filter points based on real-world constraints relative to undesired_coords.

    Parameters
    ----------
    points : pandas.DataFrame
        DataFrame of points to filter, with feature columns.
    
    undesired_coords : list or array
        The reference ("undesired") point coordinates, matching points' features.
    
    constraints : list of lists, optional
        Each sublist: [feature_name (str), operator ('equal', 'greater', or other for <)].
        Empty list returns points unchanged.

    Returns
    -------
    pandas.DataFrame
        Filtered points satisfying all constraints.

    Raises
    ------
    None explicitly, but may raise KeyError if feature_name not in points.columns,
    or IndexError if constraints malformed.

    Notes
    -----
    - Sequentially applies filters: == for 'equal', > for 'greater', < otherwise.
    - Uses column index from get_loc for comparison value.
    - Useful for imposing domain-specific rules, e.g., in counterfactual explanations.
    - If constraints empty, returns original points.
    - Usage: Post-process boundary points to respect real-world feasibility.

    Examples
    --------
    >>> points = pd.DataFrame({'pt1': [1,2,3], 'pt2': [4,5,6], 'pt3': [10,11,12]}, columns=['feat1','feat2','feat3'])
    >>> undesired_coords = [1,2,5]
    >>> constraints = [['feat1', 'greater']]
    >>> filtered = real_world_constraints(points, undesired_coords, constraints)
    >>> print(filtered)  # pd.DataFrame(data=[[4,5,6], [10,11,12]]) # Since feat1 = 1 for the undesired point, we select all points that have feat1 > 1
    >>> constraints = [['feat1', 'greater'], ['feat2', 'less']]. 
    >>> filtered = real_world_constraints(points, undesired_coords, constraints)
    >>> print(filtered)  # pd.DataFrame(data=[]) # Since feat2 = 2 for the undesired point, we select all points that have feat2 < 2. 
    """
    if len(constraints) == 0: 
        return points 
    
    for constraint in constraints: 
        select_pts = None
        if constraint[1] == "equal":  # Filter equal to undesired value
            select_pts = points.loc[points[constraint[0]] == undesired_coords[points.columns.get_loc(constraint[0])], :]
        elif constraint[1] == "greater":  # Filter greater than
            select_pts = points.loc[points[constraint[0]] > undesired_coords[points.columns.get_loc(constraint[0])], :] 
        else:  # Default: less than
            select_pts = points.loc[points[constraint[0]] < undesired_coords[points.columns.get_loc(constraint[0])], :]

        points = select_pts  # Update with filtered
    
    return points

In [None]:
def optimal_point(dataset, model, desired_class, original_class, undesired_coords, resolution=100, point_epsilon=0.1, epsilon=0.01, constraints=[], deltas=[]): 
    """
    Finds the closest point to the decision boundary from an undesired point using a grid-based
    approximation of the boundary, optionally constrained by real-world conditions. This generates
    a counterfactual explanation by minimizing the distance to the boundary while satisfying class
    change requirements and constraints.

    Parameters
    ----------
    dataset : pd.DataFrame
        Full dataset containing features and a final column with class labels.
    
    model : sklearn-like classifier
        A binary classification model with a `.fit()` and `.predict()` method.
    
    desired_class : int or label
        The target class we want the corrected point to belong to.
    
    original_class : int or label
        The actual class label of the undesired point.
    
    undesired_coords : list or array
        The coordinates of the original ("unhealthy") point.
    
    resolution : int, optional
        Number of points to sample along each feature axis for the grid in boundary computation.
        Higher values improve accuracy but increase computation exponentially. Default is 100.
    
    point_epsilon : float, optional
        Distance threshold for detecting class changes in the grid-based boundary search.
        Default is 0.1.
    
    epsilon : float, optional
        Step size used when displacing a point toward the decision boundary (for overshooting).
        Default is 0.01.
    
    constraints : list, optional
        A list of real-world constraints on the features (e.g., ranges, logic constraints).
        Default is [].
    
    deltas : list, optional
        Tolerances or maximum displacements for each feature. Default is [].

    Returns
    -------
    np.ndarray
        A corrected point (optimal_datapt) that satisfies the class change and real-world constraints.

    Raises
    ------
    Exception
        If the number of constraints exceeds the number of features.

    Notes
    -----
    - This variant uses a grid-based approach (`compute_decision_boundary_points_all_features`)
      to approximate the decision boundary, which is more exhaustive but computationally intensive
      for high dimensions or resolutions. Suitable for low-dimensional spaces (e.g., 2-3 features).
    - Trains the model on the dataset, generates boundary points via grid predictions and midpoint
      detection, applies constraints, and finds the closest optimal point.
    - Assumes binary classification and relies on external functions like `real_world_constraints`,
      `closest_point`, `move_from_A_to_B_with_x1_displacement`, `det_constraints`,
      `get_multi_dim_border_points`, `constraint_bounds`, and `closest_border_point`,
      which must be defined elsewhere.
    - Includes plotting for visualization (e.g., contours, points, lines), requiring matplotlib.
      Plots assume 2D for simplicity (e.g., contours[:,0] and [:,1]).
    - Print statements provide progress tracking.
    - If `desired_class != original_class`, overshoots the boundary slightly for class flip.
      Otherwise, handles bounded constraints differently (full grid or partial filtering).
    - Usage: Generate counterfactuals for explainable AI, optimization, or model interpretation.

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.svm import SVC
    >>> dataset = pd.DataFrame({'feat1': [0, 1, 2], 'feat2': [0, 1, 0], 'label': [0, 1, 0]})
    >>> model = SVC(kernel='linear')
    >>> undesired_coords = [2, 0]  # Example point from class 0
    >>> optimal = optimal_point(dataset, model, desired_class=1, original_class=0, undesired_coords=undesired_coords, resolution=20)
    >>> print(optimal)  # e.g., array([[1.5, 0.5]])
    """
    X_train, y_train = dataset.iloc[:, 0:dataset.shape[1]-1], dataset.iloc[:, -1]  # Extract features and labels
    n_features = X_train.shape[1]  # Get number of features

    print("fitting model...")
    model.fit(X_train, y_train)  # Train the model
    print("model finished.")

    print("boundary points started generation...")
    # Use grid-based method to approximate boundary points
    boundary_points = compute_decision_boundary_points_all_features(model, X_train, resolution=resolution, epsilon=point_epsilon)
    print("boundary points finished.")

    # Fitting the boundary points to the constraints provided by the real world
    contours = real_world_constraints(points=boundary_points, undesired_coords=undesired_coords, constraints=constraints)
    contours = contours.to_numpy()  # Convert to NumPy for further processing

    # contours = boundary_points  # (Commented: Alternative to use raw boundary)
    undesired_datapt = np.reshape(np.array(list(undesired_coords)), (1, -1))  # Reshape undesired point to 2D array
    # Find the closest point from the undesired point to the contour line
    print("Finding the closest point from the contour line to the point...")
    optimal_datapt = closest_point(undesired_datapt, contour=contours)
    print("Finding the closest point from the contour line to the point.")  # Note: Duplicate print, possibly a typo
    plt.plot(contours[:,0], contours[:,1], lw=0.5, color='red')  # Plot contours (assumes 2D)

    if desired_class != original_class: 
        D = optimal_datapt - undesired_datapt  # Direction vector to boundary
        deltas = D * (1+epsilon)  # Scale to overshoot slightly
        optimal_datapt = move_from_A_to_B_with_x1_displacement(undesired_datapt, optimal_datapt, deltas=deltas)  # Move point
    else: 
        closest_boundedpt = None
        deltas, len_constr = det_constraints(datapt=undesired_datapt[0], changes=deltas)  # Determine constraints (note: param 'changes' may be a typo for 'deltas')
        bounded_contour_pts = None

        if len_constr > n_features: 
            raise Exception("There cannot be more constraints than features")
        elif len_constr == n_features:
            # Generate border points for fully constrained case
            bounded_contour_pts = get_multi_dim_border_points(center=undesired_datapt[0], extents=deltas, step=0.05)
            np_bounded_contour = np.array(bounded_contour_pts)  # To NumPy
            x_values, y_values = np_bounded_contour[:,0], np_bounded_contour[:, 1]  # Extract for plotting (assumes 2D)
            plt.scatter(x_values, y_values, marker='o')  # Plot bounded points
            closest_boundedpt = closest_border_point(bounded_contour_pts, contour=contours)  # Find closest on border (constraints in all dimensions)
        else: 
            # Generate bounded contour points for partial constraints 
            bounded_contour_pts = constraint_bounds(contours, undesired_datapt, deltas)
            closest_boundedpt = closest_point(point=undesired_datapt, contour=bounded_contour_pts)  # Find closest point based on partial constraints

        D = closest_boundedpt - undesired_datapt  # Direction vector
        optimal_datapt = move_from_A_to_B_with_x1_displacement(undesired_datapt, closest_boundedpt, deltas=D)  # Move point
    plt.scatter(undesired_datapt[0][0], undesired_datapt[0][1], c = 'r')  # Plot undesired point
    plt.text(undesired_datapt[0][0]+0.002, undesired_datapt[0][1]+0.002, 'NH')  # Label 'NH' (e.g., Non-Healthy)
    plt.scatter(optimal_datapt[0][0], optimal_datapt[0][1], c = 'r')  # Plot optimal point
    plt.text(optimal_datapt[0][0]+0.002, optimal_datapt[0][1]+0.002, 'NH')  # Label 'NH' (note: duplicate label, perhaps typo for 'H')
    plt.plot([undesired_datapt[0][0], optimal_datapt[0][0]], [undesired_datapt[0][1],optimal_datapt[0][1]], linestyle='--')  # Dashed line between points
    return optimal_datapt

# Experiments of the Computation Costs for Grid-based Approach (Logistic Regression with Numba)

$\textbf{ Logistic Regression with Grid-based Method }$ -- 50 features

Resolution: $R = 15$ ($15^{50}$ points), Memory Error (Maximum allowed dimension exceeded), 0 boundary points found

Resolution: $R = 10$ ($10^{50}$ points), Memory Error (Maximum allowed dimension exceeded), 0 boundary points found

$\textbf{ Logistic Regression with Grid-based Method }$ -- 10 features

Resolution: $R = 15$ ($15^{15}$ points), 42 Terabyte Memory Error, 0 boundary points found

Resolution: $R = 10$ ($10^{10}$ points), 745 Gigabytes Memory Error, 0 boundary points found


$\textbf{ Logistic Regression with Grid-based Method }$ -- 2 features

Resolution: $R = 150$ ($150^2 = 22,500$ points searched), 27.2 second runtime, 454 boundary points found

Resolution: $R = 100$ ($100^2 = 10,000$ points searched) 5.4 seconds runtime, 104 boundary points found

In [None]:
X, y = make_classification(n_samples=2000, n_features=2, n_informative=2, n_redundant=0, random_state=42, n_classes=2)
model = LogisticRegression()
y = y.reshape(-1,1)
df1 = pd.DataFrame(data=np.hstack((X,y)))

In [20]:
df1.head(n=10)

Unnamed: 0,0,1,2
0,0.800062,-0.957489,1.0
1,1.187099,1.159787,1.0
2,0.154512,1.21752,0.0
3,0.179014,-0.852832,1.0
4,-0.735827,-0.245366,0.0
5,0.039487,1.320957,1.0
6,-1.482199,0.419738,0.0
7,-0.622829,-0.803223,0.0
8,0.965721,-1.068587,1.0
9,0.798459,-1.022348,1.0


In [None]:
optimal_point(df1, model=model, desired_class=0, original_class=1, resolution=150, undesired_coords=df1.iloc[1,:df1.shape[1]-1], point_epsilon=0.1, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...
boundary points finished.
Finding the closest point from the contour line to the point...
Finding the closest point from the contour line to the point.
[[1.07092306 0.87624128]]
[[-1.16018181  0.00452233]]


array([[-0.05536678,  1.16374982]])

In [21]:
optimal_point(df1, model=model, desired_class=0, original_class=1, resolution=100, undesired_coords=df1.iloc[1,:df1.shape[1]-1], point_epsilon=0.1, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...
boundary points finished.
Finding the closest point from the contour line to the point...
Finding the closest point from the contour line to the point.
[[1.07092411 1.09801295]]
[[-1.15887253 -0.03919662]]


array([[-0.05396586,  1.11674877]])

In [22]:
X, y = make_classification(n_samples=2000, n_features=10, n_informative=10, n_redundant=0, random_state=42, n_classes=2)
model = LogisticRegression()
y = y.reshape(-1,1)
df2 = pd.DataFrame(data=np.hstack((X,y)))

In [23]:
df2.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.99062,-2.921959,-5.689956,3.173745,-4.393239,0.538239,-0.96265,-3.024594,-0.837913,-1.944185,1.0
1,1.342356,0.942957,3.057782,0.059667,-0.611317,-0.985521,0.323851,0.853719,1.741036,2.069323,1.0
2,-1.063987,2.185873,-2.230002,1.095676,-0.240747,-1.776678,-4.16509,-0.122159,0.688403,1.401783,1.0
3,-4.852159,1.29738,3.466596,4.114327,-4.004815,-0.591707,2.035825,3.118117,-0.996438,1.46557,1.0
4,0.033975,-0.181,0.438378,-1.50681,-0.503984,2.442108,-0.40801,1.722336,2.563095,-3.228898,1.0
5,2.77068,-0.667671,-2.726522,1.778277,3.03332,1.052492,1.810689,-3.73043,-2.032935,2.37108,0.0
6,-1.375939,0.889744,-0.487178,0.394568,-1.553368,2.302102,-0.134183,1.3401,0.741324,0.554025,1.0
7,0.060123,-2.850018,-1.808885,1.64985,0.827631,-0.681723,3.049334,-1.560887,-0.029848,0.037102,0.0
8,1.735121,4.266722,0.767749,-2.012636,2.538945,-4.960979,2.908603,-6.451668,2.723697,0.785976,0.0
9,0.436313,1.826231,-0.232626,-0.57004,1.101895,0.658344,-0.079847,-2.15896,0.985347,0.426746,0.0


In [24]:
optimal_point(df2, model=model, desired_class=0, original_class=1, resolution=15, undesired_coords=df2.iloc[1,:df2.shape[1]-1], point_epsilon=0.1, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...


MemoryError: Unable to allocate 42.0 TiB for an array with shape (576650390625, 10) and data type float64

In [25]:
optimal_point(df2, model=model, desired_class=0, original_class=1, resolution=10, undesired_coords=df2.iloc[1,:df2.shape[1]-1], point_epsilon=0.1, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...


MemoryError: Unable to allocate 745. GiB for an array with shape (10000000000, 10) and data type float64

In [26]:
X, y = make_classification(n_samples=2000, n_features=50, n_informative=50, n_redundant=0, random_state=42, n_classes=2)
model = LogisticRegression()
y = y.reshape(-1,1)
df3 = pd.DataFrame(data=np.hstack((X,y)))

In [27]:
df3.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,1.49896,1.740244,2.852685,0.48325,2.629995,2.421617,-0.605894,3.277559,-4.254546,0.657476,...,1.655511,-5.087583,-1.446621,5.061929,-4.813585,-8.186449,3.429404,-10.006712,3.805926,1.0
1,11.043912,1.305697,-4.162067,7.062152,10.188181,-5.216963,0.869844,6.395281,-14.499055,-2.626266,...,-1.37164,-2.518497,-1.723985,-2.125003,-2.793473,-0.928793,-7.307003,-9.042577,-1.534964,1.0
2,1.699169,-8.6435,5.242382,4.438892,-0.622756,0.935984,5.435414,3.986439,5.502292,2.903077,...,-2.215406,5.446412,4.758631,1.948837,-2.243126,-10.913266,2.709645,-5.33916,-5.383915,1.0
3,2.454609,4.730429,-1.116898,-5.399067,-1.189977,-1.699942,3.976442,4.936787,8.205961,-0.676943,...,1.208745,-1.076917,-1.119831,-0.596468,3.882105,4.146733,5.821566,5.414165,1.990073,0.0
4,0.400635,-2.403447,0.837233,1.377394,4.496939,-3.341007,0.073394,-1.53637,-2.858345,1.481029,...,-11.217268,4.594756,4.380212,3.271706,-4.442203,-5.418086,0.023346,-2.009643,-0.778441,1.0
5,2.545557,10.026681,2.393387,-2.369646,-0.269603,0.979513,1.661028,0.86173,-1.251917,6.446719,...,-2.476918,-1.510854,-1.485889,2.787948,-1.637217,4.217302,2.820862,3.438934,0.154469,0.0
6,-1.593692,3.824784,-0.908609,2.942972,3.894036,3.413266,1.553961,-7.739468,2.197464,0.299972,...,-0.561935,4.208008,7.395545,4.576173,0.815791,-3.038333,-1.339261,7.664257,-2.959643,0.0
7,3.681587,-4.058194,2.212674,-0.258745,-1.462812,-3.909477,3.147509,-4.719408,-6.281705,3.619122,...,-7.608988,0.617678,3.293293,2.675376,-6.577331,-4.715912,-3.124655,-1.824645,-3.132426,1.0
8,3.159427,-4.605655,3.807224,0.908897,10.764171,-3.639272,-3.447677,4.983319,5.15923,1.414902,...,3.107829,4.186121,0.839391,-1.92586,-2.886895,-3.350357,-6.439152,-1.476345,-2.515509,1.0
9,1.633173,3.666814,-0.403462,-3.688655,6.947526,-3.817231,-3.978926,0.316689,5.10209,-3.981173,...,-10.370874,3.227933,-9.640087,5.534013,-3.588567,3.703978,3.262374,3.831146,3.089794,0.0


In [28]:
optimal_point(df3, model=model, desired_class=0, original_class=1, resolution=15, undesired_coords=df3.iloc[1,:df3.shape[1]-1], point_epsilon=0.1, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...


ValueError: Maximum allowed dimension exceeded

In [29]:
optimal_point(df3, model=model, desired_class=0, original_class=1, resolution=10, undesired_coords=df3.iloc[1,:df3.shape[1]-1], point_epsilon=0.1, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...


ValueError: Maximum allowed dimension exceeded

In [None]:
def alpha_binary_search(model, point, opp_point, point_target, opp_target, epsilon=0.01):
    """
    Perform a binary search along the line segment between two points to find the
    approximate alpha value where the model's prediction changes from one target
    label to another. This is useful for approximating decision boundaries in
    binary classification by finding the transition point along a segment connecting
    points from opposite classes.

    Parameters
    ----------
    model : object
        A trained machine learning model with a `predict` method that takes a list
        or array of input points and returns predictions as an array. The model
        should be a binary classifier (e.g., from scikit-learn, PyTorch, etc.).
        Example: sklearn.linear_model.LogisticRegression instance.
    
    point : numpy.ndarray
        A 1D array representing the starting point (feature vector) in the feature
        space, typically from one class. Must have the same shape as `opp_point`.
    
    opp_point : numpy.ndarray
        A 1D array representing the opposing point (feature vector) in the feature
        space, typically from the opposite class. Must have the same shape as `point`.
    
    point_target : int or str
        The expected prediction label for the `point`. This is used to initialize
        the search and compare against the model's prediction at interpolated points.
        Should match the model's output format (e.g., 0 or 1 for binary classes).
    
    opp_target : int or str
        The expected prediction label for the `opp_point`. This should be different
        from `point_target` and is used to detect when the prediction flips.
    
    epsilon : float, optional
        The tolerance for convergence in the binary search. The loop stops when the
        difference between the search bounds is less than this value. Default is 0.01.
        Smaller values yield more precise alphas but may increase computation time.

    Returns
    -------
    float
        The approximate alpha value (between 0 and 1) where the model's prediction
        transitions. A value closer to 0 means the boundary is nearer to `point`,
        while closer to 1 means nearer to `opp_point`.

    Raises
    ------
    None explicitly, but may raise exceptions from `model.predict` if the input
    shapes are incompatible or if the model is not properly trained.

    Notes
    -----
    - This function assumes the decision boundary is crossed exactly once along the
      line segment; multiple crossings (e.g., in non-linear models) may lead to
      approximate or incorrect results.
    - The binary search updates bounds based on prediction matches, but if no flip
      occurs (e.g., both points predicted the same), it will converge to a midpoint
      without a true boundary.
    - Usage: Typically called within a loop over pairs of points from different
      classes to sample multiple boundary points.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> model = LogisticRegression().fit(np.array([[0], [1]]), [0, 1])
    >>> point = np.array([0.0])
    >>> opp_point = np.array([1.0])
    >>> alpha = alpha_binary_search(model, point, opp_point, 0, 1, epsilon=0.001)
    >>> print(alpha)  # Approximately 0.5 for a linear boundary at 0.5
    0.5
    """
    start, end = 0, 1  # Initialize search bounds: 0 at 'point', 1 at 'opp_point'
    while abs(end - start) >= epsilon:  # Loop until convergence within epsilon
        alpha = (start + end) / 2  # Midpoint alpha (float division ensured)
        # Interpolate: weighted average between point and opp_point
        temp_candidate = (1 - alpha) * point + alpha * opp_point
        # Predict on the interpolated point (assumes model.predict returns array)
        temp_target = model.predict([temp_candidate])[0]
        if temp_target == point_target: 
            start = alpha  # Move start bound if prediction matches point's target
            point_target = temp_target  # Update target (though often redundant)
        elif temp_target == opp_target: 
            end = alpha  # Move end bound if prediction matches opp's target
            opp_target = temp_target  # Update target (though often redundant)
    return (start + end) / 2  # Return midpoint as approximate transition alpha


def find_decision_boundary(model, X, y, epsilon=1e-3, threshold=10000):
    """
    Approximate the decision boundary of a binary classification model by sampling
    points along line segments between correctly classified points from opposite
    classes. Uses binary search to find transition points and collects them into
    a DataFrame. Handles categorical features by rounding them to integers.

    Parameters
    ----------
    model : object
        A trained binary classification model with a `predict` method that takes
        a list or array of input points and returns predictions as an array.
        Example: sklearn.svm.SVC instance.
    
    X : pandas.DataFrame
        The feature dataset, where rows are samples and columns are features.
        Supports mixed types, including integer categoricals.
    
    y : pandas.Series or numpy.ndarray
        The target labels corresponding to X. Must contain exactly two unique
        binary labels (e.g., 0 and 1).
    
    epsilon : float, optional
        The precision for the binary search in `alpha_binary_search`. Smaller
        values increase accuracy but computation time. Default is 1e-3.
    
    threshold : int, optional
        The maximum number of boundary points to generate. Stops early if reached
        to prevent excessive computation on large datasets. Default is 10000.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the approximated boundary points, with the same
        columns as X. Categorical columns (detected as int types) are converted
        to integers.

    Raises
    ------
    ValueError
        If y does not contain exactly two unique labels (non-binary classification).

    Notes
    -----
    - This function clusters points by true labels (y), then filters pairs where
      the model correctly predicts them (to ensure opposite sides of the boundary).
    - It may miss boundaries if the model has high error rates (few correct pairs).
    - Computational complexity is O(n*m) where n and m are cluster sizes, capped
      by threshold. For large datasets, reduce threshold or sample clusters.
    - A `bool_vec` is created but unused; it may be a remnant for future masking
      (e.g., to ignore categoricals in interpolation).
    - Categorical features are auto-detected as int columns and rounded to int
      in the output for interpretability.
    - Usage: Call after training a model to visualize or analyze its boundary,
      e.g., plot the points in 2D or use for explanations.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sklearn.svm import SVC
    >>> X = pd.DataFrame({'feat1': [0, 1, 2], 'feat2': [0, 1, 0]})
    >>> y = np.array([0, 1, 0])
    >>> model = SVC(kernel='linear').fit(X, y)
    >>> boundary = find_decision_boundary(model, X, y, epsilon=0.001, threshold=5)
    >>> print(boundary.shape)  # e.g., (number_of_points, 2)
    (2, 2)
    """
    # Detect categorical features (assumed as int columns)
    categorical_features = X.select_dtypes(include=int).columns.tolist()
    cat_indices = [X.columns.get_loc(col) for col in categorical_features]

    # Create a boolean vector (1 for continuous, 0 for categorical; currently unused)
    bool_vec = [1] * (len(X.columns)) 
    for i in range(len(cat_indices)): 
        bool_vec[cat_indices[i]] = 0 

    X_np = X.to_numpy()  # Convert features to NumPy for efficient ops
    y_np = y.to_numpy() if not isinstance(y, np.ndarray) else y  # Ensure y is NumPy
    boundary_points = []  # List to collect boundary point arrays
    unique_labels = np.unique(y_np)  # Get unique class labels
    if len(unique_labels) != 2:
        raise ValueError("Only supports binary classification.")
    
    label_a, label_b = unique_labels[0], unique_labels[1]  # Assign labels

    # Cluster points by true labels
    cluster_a = X_np[y_np == label_a]
    cluster_b = X_np[y_np == label_b]

    total_N = 0  # Counter for generated points
    for i in range(cluster_a.shape[0]):
        point = cluster_a[i]
        pt_pred = model.predict([point])  # Predict on point from cluster A

        for j in range(cluster_b.shape[0]): 
            match_point = cluster_b[j]
            match_pt_pred = model.predict([match_point])  # Predict on point from B
            # Check if model correctly classifies both (ensures opposite sides)
            if pt_pred.item() == label_a and match_pt_pred.item() == label_b: 
                # Find alpha where prediction flips
                alpha = alpha_binary_search(model, point, match_point, label_a, label_b, epsilon=epsilon)
                # Compute boundary point via interpolation
                boundary = (1 - alpha) * point + alpha * match_point
                boundary_points.append(boundary)

                total_N += 1
                if total_N >= threshold:  # Early stop inner loop
                    break
        if total_N >= threshold:  # Early stop outer loop
            break
    
    # Convert list to DataFrame with original columns
    boundary_pts = pd.DataFrame(data=boundary_points, columns=X.columns)

    # Round categoricals to int for discrete values
    for col in categorical_features: 
        boundary_pts[col] = boundary_pts[col].astype(int)

    return boundary_pts

In [None]:
def optimal_point(dataset, model, desired_class, original_class, undesired_coords, threshold=10000, point_epsilon=0.1, epsilon=0.01, constraints=[], deltas=[]): 
    """
    Finds the closest point to the decision boundary from an undesired point,
    optionally constrained by real-world conditions.
    This essentially finds the counterfactual explanation for a given point by minimizing the distance to the given boundary.
    This method is important because it addresses a key problem with the original optimal_point() function where we generated an R^n dimensional grid that we would then have to iterate over. 
    The problem with iterating over such a grid is eventually that we will hit a memory error for high-dimensional features such as 20, 30 or 40 features. This will cause the function to crash. 
    Additionally, due to the exponential increase of the number of features to search, the grid will become infeasible to search (curse of dimensionality). 

    Parameters
    ----------
    dataset : pd.DataFrame
        Full dataset containing features and a final column with class labels.
    
    model : sklearn-like classifier
        A binary classification model with a `.fit()` and `.predict()` method.
    
    desired_class : int or label
        The target class we want the corrected point to belong to.
    
    original_class : int or label
        The actual class label of the undesired point.
    
    undesired_coords : list or array
        The coordinates of the original ("unhealthy") point.
    
    threshold : int, optional
        Max number of decision boundary points to sample. Default is 10000.
    
    point_epsilon : float, optional
        Precision used to estimate decision boundary points. Default is 0.1.
    
    epsilon : float, optional
        Step size used when displacing a point toward the decision boundary. Default is 0.01.
    
    constraints : list, optional
        A list of real-world constraints on the features (e.g., ranges, logic constraints). Default is [].
    
    deltas : list, optional
        Tolerances or maximum displacements for each feature. Default is [].

    Returns
    -------
    np.ndarray
        A corrected point that satisfies the class change and real-world constraints.

    Raises
    ------
    Exception
        If the number of constraints exceeds the number of features.

    Notes
    -----
    - This function trains the model on the provided dataset, generates boundary points using
      `find_decision_boundary`, applies constraints, and finds the closest optimal point.
    - Assumes binary classification and relies on external functions like `real_world_constraints`,
      `closest_point`, `move_from_A_to_B_with_x1_displacement`, etc., which must be defined elsewhere.
    - Includes plotting for visualization (e.g., boundary contours, points), which requires matplotlib.
    - The function blends boundary approximation with counterfactual generation, useful for explainable AI.
    - Print statements are for progress tracking; plotting is partially commented out but can be enabled.
    - Usage: Call with a dataset and model to generate counterfactuals, e.g., for model interpretation or optimization.

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.linear_model import LogisticRegression
    >>> dataset = pd.DataFrame({'feat1': [0, 1, 2], 'feat2': [0, 1, 0], 'label': [0, 1, 0]})
    >>> model = LogisticRegression()
    >>> undesired_coords = [2, 0]  # Example point from class 0
    >>> optimal = optimal_point(dataset, model, desired_class=1, original_class=0, undesired_coords=undesired_coords)
    >>> print(optimal)  # e.g., array([[1.5, 0.5]])
    """
    # -------------------------------
    # STEP 1: Train the model
    # -------------------------------
    X_train = dataset.iloc[:, 0:-1]  # Extract features from dataset
    y_train = dataset.iloc[:, -1]  # Extract labels from dataset
    n_features = X_train.shape[1]  # Get number of features

    print("fitting model...")
    model.fit(X_train, y_train)  # Train the model on the dataset
    print("model finished.")

    # -------------------------------
    # STEP 2: Find decision boundary
    # -------------------------------
    print("boundary points started generation...")

    # This step uses binary interpolation to get points close to the decision boundary
    boundary_points = find_decision_boundary(model, X_train, y_train,
                                             threshold=threshold, epsilon=point_epsilon)
    print("boundary points finished.")
    print(boundary_points.shape)

    # -------------------------------
    # STEP 3: Apply real-world constraints (optional)
    # -------------------------------
    # Reduce boundary points based on external rules (e.g., cost limits, physics constraints)
    contours = real_world_constraints(points=boundary_points,
                                      undesired_coords=undesired_coords,
                                      constraints=constraints)
    contours = np.unique(contours.to_numpy(), axis=0)  # Remove duplicates from constrained points
    undesired_datapt = np.reshape(np.array(list(undesired_coords)), (1, -1))  # Reshape undesired point to 2D array

    # -------------------------------
    # STEP 4: Find closest point on constrained boundary
    # -------------------------------
    print("Finding the closest point from the contour line to the point...")
    optimal_datapt = closest_point(undesired_datapt, contour=contours)
    print("Finding the closest point from the contour line to the point.")  # Note: Duplicate print, possibly a typo
    #plt.plot(contours[:,0], contours[:,1], lw=0.5, color='red')  # Commented: Plot contours for visualization


    # -------------------------------
    # STEP 5: Post-process based on class flip requirement
    # -------------------------------

    # If we want to *flip* the class of the point...
    if desired_class != original_class: 
         # Move in the direction of the boundary, slightly overshooting
        D = optimal_datapt - undesired_datapt  # Compute direction vector
        deltas = D * (1+epsilon)  # Scale by (1 + epsilon) to overshoot
        optimal_datapt = move_from_A_to_B_with_x1_displacement(undesired_datapt, optimal_datapt, deltas=deltas)
    else:
        # If we want to *stay within* the same class (more constrained)
        closest_boundedpt = None
        deltas, len_constr = det_constraints(datapt=undesired_datapt[0], deltas=deltas)  # Determine constraints

        if len_constr > X_train.shape[1]:
            raise Exception("There cannot be more constraints than features")

        elif len_constr == X_train.shape[1]:
            # All n dimensions are constrained, so generate an exact grid of boundary candidates
            bounded_contour_pts = get_multi_dim_border_points(center=undesired_datapt[0],
                                                              extents=deltas,
                                                              step=0.05)
            np_bounded_contour = np.array(bounded_contour_pts)  # Convert to NumPy array
            x_values, y_values = np_bounded_contour[:, 0], np_bounded_contour[:, 1]  # Extract x/y for plotting
            plt.scatter(x_values, y_values, color='blue', marker='o')  # Plot bounded points
            closest_boundedpt = closest_border_point(bounded_contour_pts, contour=contours)  # Find closest on border

        else:
            # Partially constrained - less than n dimensions are constrained
            bounded_contour_pts = constraint_bounds(contours, undesired_datapt, deltas)  # Apply partial bounds
            closest_boundedpt = closest_point(point=undesired_datapt, contour=bounded_contour_pts)  # Find closest
        
        D = closest_boundedpt - undesired_datapt  # Compute direction
        optimal_datapt = move_from_A_to_B_with_x1_displacement(undesired_datapt, closest_boundedpt, deltas=D)  # Move point
    
    # Plot original and optimal points with connecting line
    plt.scatter(undesired_datapt[0][0], undesired_datapt[0][1], c = 'r')  # Plot undesired point
    plt.text(undesired_datapt[0][0]+0.002, undesired_datapt[0][1]+0.002, 'NH')  # Label 'NH' (e.g., Non-Healthy)
    plt.scatter(optimal_datapt[0][0], optimal_datapt[0][1], c = 'g')  # Plot optimal point (changed to green for distinction)
    plt.text(optimal_datapt[0][0]+0.002, optimal_datapt[0][1]+0.002, 'H')  # Label 'H' (e.g., Healthy; adjusted from duplicate 'NH')
    plt.plot([undesired_datapt[0][0], optimal_datapt[0][0]], [undesired_datapt[0][1],optimal_datapt[0][1]], linestyle='--')  # Dashed line between points
    return optimal_datapt

# Experiments of the Computation Costs for Gridless Approach (Logistic Regression)

$\textbf{ Logistic Regression with Gridless Method }$ -- 50 features

Search: $S = 1,000,000$, 29.7 second runtime, $50,000$ boundary points found

Search: $S = 200,000$, 6.1 second runtime, $10,000$ boundary points found

$\textbf{ Logistic Regression with Gridless Method }$ -- 10 features

Search: $S = 2,000,000$, 58.6 second runtime, $100,000$ boundary points found

Search: $S = 200,000$, 5.9 second runtime, $10,000$ boundary points found


$\textbf{ Logistic Regression with Gridless Method }$ -- 2 features

Search: $S = 30,000$, 0.9 seconds runtime, $1,500$ boundary points found

Search: $S = 10,000$, 0.3 seconds runtime, $500$ boundary points found

In [32]:
optimal_point(df1, model=model, desired_class=0, original_class=1, threshold=1500, undesired_coords=df1.iloc[1,:df1.shape[1]-1], point_epsilon=1e-6, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...
boundary points finished.
(1500, 2)
Finding the closest point from the contour line to the point...
Finding the closest point from the contour line to the point.
[[1.07091351 1.09432123]]
[[-1.1723115  -0.04499449]]


array([[-0.06834554,  1.11054874]])

In [33]:
optimal_point(df1, model=model, desired_class=0, original_class=1, threshold=500, undesired_coords=df1.iloc[1,:df1.shape[1]-1], point_epsilon=1e-6, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...
boundary points finished.
(500, 2)
Finding the closest point from the contour line to the point...
Finding the closest point from the contour line to the point.
[[1.07091407 1.08740554]]
[[-1.17158501 -0.06247468]]


array([[-0.06756819,  1.09185185]])

In [34]:
optimal_point(df2, model=model, desired_class=0, original_class=1, threshold=10000, undesired_coords=df2.iloc[1,:df2.shape[1]-1], point_epsilon=1e-6, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...
boundary points finished.
(10000, 10)
Finding the closest point from the contour line to the point...
Finding the closest point from the contour line to the point.
[[1.08086181 1.08009778 1.05649038 1.34857541 1.06644951 1.08088977
  1.0904509  1.07676375 1.07914465 1.07358859]]
[[-0.09951026 -0.10696393  0.07820282 -0.00484097  0.30036689 -0.09925739
  -0.05332044 -0.15919617 -0.11800827 -0.29916742]]


array([[ 1.23479959,  0.82742501,  3.14040285,  0.0531383 , -0.29099069,
        -1.09280704,  0.26570737,  0.68230259,  1.61368784,  1.74813994]])

In [35]:
optimal_point(df2, model=model, desired_class=0, original_class=1, threshold=100000, undesired_coords=df2.iloc[1,:df2.shape[1]-1], point_epsilon=1e-6, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...
boundary points finished.
(100000, 10)
Finding the closest point from the contour line to the point...
Finding the closest point from the contour line to the point.
[[1.04479475 1.07618195 1.10204599 1.07415725 1.06247779 1.0765085
  1.37979831 1.07430465 1.06180044 1.07748248]]
[[ 0.04145148 -0.17408467 -0.03438951 -0.25838176  0.14124536 -0.16540036
  -0.00445386 -0.24956845  0.12949476 -0.14400074]]


array([[ 1.38566472,  0.75560974,  3.0198835 , -0.21787592, -0.46124675,
        -1.16357564,  0.31770526,  0.5856067 ,  1.87853343,  1.91416439]])

In [36]:
optimal_point(df3, model=model, desired_class=0, original_class=1, threshold=10000, undesired_coords=df3.iloc[1,:df3.shape[1]-1], point_epsilon=1e-6, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...
boundary points finished.
(10000, 50)
Finding the closest point from the contour line to the point...
Finding the closest point from the contour line to the point.
[[1.07023637 1.06976976 1.06969373 1.07021297 1.07019208 1.06967599
  1.06746763 1.07036306 1.06984842 1.06977869 1.07058837 1.0702381
  1.06975027 1.06972009 1.06973857 1.0712423  1.07144123 1.07081446
  1.06865035 1.07067915 1.06942924 1.07035981 1.06899849 1.07111809
  1.06024232 1.07163432 1.06724779 1.06810594 1.0702466  1.06864233
  1.07050402 1.07028733 1.07047349 1.07057354 1.06968933 1.07808458
  1.07140577 1.06982413 1.07138016 1.07078645 1.07023026 1.07182012
  1.06801061 1.06163103 1.06959137 1.06826589 1.06960988 1.06980174
  1.06983912 1.06881256]]
[[-4.52777119  4.64638377  3.49262824 -5.02509248 -5.57172006  3.30137171
   0.42152892 -2.94817682  7.05783795  4.83392349 -1.81956897 -4.49499512
   4.28371073  3.82166172  4.09181003 -0.8623047

array([[ 6.19812621,  6.27625802, -0.42602433,  1.68423239,  4.22537044,
        -1.6855653 ,  1.31981248,  3.23966109, -6.94823857,  2.54496262,
        -0.56307918, -2.99339257,  2.10512098,  3.38857828,  4.87270043,
        -0.82116314,  0.38646645,  1.00855226, -1.63925939, -3.04571639,
        -2.1559183 ,  5.84787037, -0.72097048, -0.38259003, -1.73338932,
         1.02140087,  3.84590092,  2.38359545,  2.76937251, -5.68655905,
         1.02765196, -0.44046882, -6.08452679, -1.51411165,  3.39464693,
         2.52157268,  4.76625411,  2.07175185,  4.04757458, -0.70950234,
         1.51880869, -2.00280518, -1.94513123, -1.58931375,  0.67468693,
        -2.13538688,  2.00380672, -1.534412  , -1.92822669, -0.57292597]])

In [37]:
optimal_point(df3, model=model, desired_class=0, original_class=1, threshold=50000, undesired_coords=df3.iloc[1,:df3.shape[1]-1], point_epsilon=1e-6, epsilon=0.07)

fitting model...
model finished.
boundary points started generation...
boundary points finished.
(50000, 50)
Finding the closest point from the contour line to the point...
Finding the closest point from the contour line to the point.
[[1.07023637 1.06976976 1.06969373 1.07021297 1.07019208 1.06967599
  1.06746763 1.07036306 1.06984842 1.06977869 1.07058837 1.0702381
  1.06975027 1.06972009 1.06973857 1.0712423  1.07144123 1.07081446
  1.06865035 1.07067915 1.06942924 1.07035981 1.06899849 1.07111809
  1.06024232 1.07163432 1.06724779 1.06810594 1.0702466  1.06864233
  1.07050402 1.07028733 1.07047349 1.07057354 1.06968933 1.07808458
  1.07140577 1.06982413 1.07138016 1.07078645 1.07023026 1.07182012
  1.06801061 1.06163103 1.06959137 1.06826589 1.06960988 1.06980174
  1.06983912 1.06881256]]
[[-4.52777119  4.64638377  3.49262824 -5.02509248 -5.57172006  3.30137171
   0.42152892 -2.94817682  7.05783795  4.83392349 -1.81956897 -4.49499512
   4.28371073  3.82166172  4.09181003 -0.8623047

array([[ 6.19812621,  6.27625802, -0.42602433,  1.68423239,  4.22537044,
        -1.6855653 ,  1.31981248,  3.23966109, -6.94823857,  2.54496262,
        -0.56307918, -2.99339257,  2.10512098,  3.38857828,  4.87270043,
        -0.82116314,  0.38646645,  1.00855226, -1.63925939, -3.04571639,
        -2.1559183 ,  5.84787037, -0.72097048, -0.38259003, -1.73338932,
         1.02140087,  3.84590092,  2.38359545,  2.76937251, -5.68655905,
         1.02765196, -0.44046882, -6.08452679, -1.51411165,  3.39464693,
         2.52157268,  4.76625411,  2.07175185,  4.04757458, -0.70950234,
         1.51880869, -2.00280518, -1.94513123, -1.58931375,  0.67468693,
        -2.13538688,  2.00380672, -1.534412  , -1.92822669, -0.57292597]])