# Ablation Study: Increasing $n$ 

Below, we show that our SSBA method robustly finds decision boundary points regardless of the dimensionality of the feature space, from low- to high-dimensional: we test $\mathbb{R}^2$, $\mathbb{R}^{10}$, $\mathbb{R}^{50}$, $\mathbb{R}^{100}$, $\mathbb{R}^{1{,}000}$, and $\mathbb{R}^{10{,}000}$.

In [5]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

import numpy as np
import pandas as pd
import warnings
import random

# Seed the global random number generators so repeated runs start from the
# same state. Previously only Python's `random` module was seeded; NumPy's
# global generator is now seeded too, in case the project helpers draw from
# it (their source is not visible here -- TODO confirm which RNG they use).
random.seed(0)
np.random.seed(0)

# Silence UserWarning messages for the remainder of the notebook.
warnings.filterwarnings('ignore', category=UserWarning)

import os
import sys

# Put the parent of the current working directory on the import path so that
# project-local packages located there can be imported by later cells.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [6]:
import os
import sys

# Make sure the parent of the working directory is on sys.path before the
# project-local imports that follow (appended once; no-op if already present).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

from files.binary_search_optimal_point import optimal_point
from files.binary_search_optimal_point import multi_decision_boundary
from files.common_functions import convert_columns

In [7]:
# Load the cuml.accel IPython extension.
# NOTE(review): the cuml.accel documentation asks for the extension to be loaded
# before importing the libraries it accelerates; scikit-learn is already imported
# in an earlier cell, so confirm acceleration is actually in effect (consider
# moving this line to the very first code cell).
%load_ext cuml.accel

The cuml.accel extension is already loaded. To reload it, use:
  %reload_ext cuml.accel


# 2-dimensions (Make a dataset of 2 features with 10,000 samples)

We generate this dataset with sklearn's `make_classification` function, fit a logistic regression model to it, and then generate the decision boundary points.

In [19]:
# Synthetic binary classification data: 10,000 samples, 2 features (all informative).
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)

# Append the labels to the features as a final column, wrap everything in one
# DataFrame, then split it back into the training inputs and the target.
y = y.reshape(-1, 1)
df1 = pd.DataFrame(data=np.hstack((X, y)))
x_train, y_train = df1.iloc[:, :-1], df1.iloc[:, -1]

# Fit the classifier whose decision boundary is searched below.
model = LogisticRegression()
model.fit(x_train, y_train)

# threshold and epsilon are passed straight to the project helper; their exact
# meaning is defined there (not visible in this notebook).
boundary_points = multi_decision_boundary(
    model, x_train, y_train, threshold=100000, epsilon=1e-4
)

# Print the decision boundary points and their shape
print("Decision Boundary Points (All Features):")
print(boundary_points, boundary_points.shape, sep="\n")

Decision Boundary Points (All Features):
              0         1
0      0.040599  0.020677
1      0.080198  0.009600
2     -2.035127  0.601792
3     -0.542671  0.183966
4      0.818561 -0.197125
...         ...       ...
99995  0.806588 -0.193759
99996  0.815623 -0.196318
99997  0.392988 -0.077962
99998  1.044134 -0.260215
99999  0.116609 -0.000680

[100000 rows x 2 columns]
(100000, 2)


In [20]:
# Sanity check: a point on the decision boundary should get probability ~0.5
# for each of the two classes.
preds = model.predict_proba(boundary_points)
print(np.mean(preds, axis=0))
# The column means alone can hide off-boundary points (deviations on opposite
# sides of the boundary cancel out), so also report the worst single-point
# deviation from 0.5.
print(np.abs(np.asarray(preds) - 0.5).max())

[0.5000003 0.4999997]


# 10-dimensions (Make a dataset of 10 features with 10,000 samples)

We generate this dataset with sklearn's `make_classification` function, fit a logistic regression model to it, and then generate the decision boundary points.

In [10]:
# Synthetic binary classification data: 10,000 samples, 10 features (all informative).
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=10,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)

# Append the labels to the features as a final column, wrap everything in one
# DataFrame, then split it back into the training inputs and the target.
y = y.reshape(-1, 1)
df1 = pd.DataFrame(data=np.hstack((X, y)))
x_train, y_train = df1.iloc[:, :-1], df1.iloc[:, -1]

# Fit the classifier whose decision boundary is searched below.
model = LogisticRegression()
model.fit(x_train, y_train)

# threshold and epsilon are passed straight to the project helper; their exact
# meaning is defined there (not visible in this notebook).
boundary_points = multi_decision_boundary(
    model, x_train, y_train, threshold=100000, epsilon=1e-4
)

# Print the decision boundary points and their shape
print("Decision Boundary Points (All Features):")
print(boundary_points, boundary_points.shape, sep="\n")

Decision Boundary Points (All Features):
              0         1         2         3         4         5         6  \
0      0.344977 -0.711479  0.827653 -0.333810  0.215280 -1.388243  0.720845   
1      1.291301  1.315044  0.612367  1.792299  1.012659  0.811537  1.028102   
2      0.273924 -1.467676  0.610951  0.182350 -0.990745 -2.413791  0.042187   
3     -0.515922  1.090350  0.072823 -1.858131 -1.046706 -0.118363  1.419709   
4      0.304791 -0.197972 -1.243321  0.299243 -0.247227 -0.597046  0.357661   
...         ...       ...       ...       ...       ...       ...       ...   
99995  0.917459 -1.713634  0.040953 -2.214056  0.279853  2.615245 -2.301024   
99996  0.455650 -2.575891  1.017164 -1.178451  0.087640  1.437389 -0.052359   
99997 -1.833092 -2.070737 -1.158018  1.948476 -0.457385 -0.180142 -0.171904   
99998 -4.054517 -0.741327 -1.441348  4.185225  2.688837 -2.881441  2.663434   
99999  1.658367 -2.429500  0.428732 -0.691912  0.012256  2.741878 -2.212955   

          

In [11]:
# Sanity check: a point on the decision boundary should get probability ~0.5
# for each of the two classes.
preds = model.predict_proba(boundary_points)
print(np.mean(preds, axis=0))
# The column means alone can hide off-boundary points (deviations on opposite
# sides of the boundary cancel out), so also report the worst single-point
# deviation from 0.5.
print(np.abs(np.asarray(preds) - 0.5).max())

[0.50000001 0.49999999]


# 50-dimensions (Make a dataset of 50 features with 10,000 samples)

We generate this dataset with sklearn's `make_classification` function, fit a logistic regression model to it, and then generate the decision boundary points.

In [12]:
# Synthetic binary classification data: 10,000 samples, 50 features (all informative).
X, y = make_classification(
    n_samples=10000,
    n_features=50,
    n_informative=50,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)

# Append the labels to the features as a final column, wrap everything in one
# DataFrame, then split it back into the training inputs and the target.
y = y.reshape(-1, 1)
df1 = pd.DataFrame(data=np.hstack((X, y)))
x_train, y_train = df1.iloc[:, :-1], df1.iloc[:, -1]

# Fit the classifier whose decision boundary is searched below.
model = LogisticRegression()
model.fit(x_train, y_train)

# threshold and epsilon are passed straight to the project helper; their exact
# meaning is defined there (not visible in this notebook).
boundary_points = multi_decision_boundary(
    model, x_train, y_train, threshold=100000, epsilon=1e-4
)

# Print the decision boundary points and their shape
print("Decision Boundary Points (All Features):")
print(boundary_points, boundary_points.shape, sep="\n")

Decision Boundary Points (All Features):
             0         1         2         3         4         5         6   \
0      0.218235  7.693868  1.665100  1.277351  1.724377  0.126018 -3.840929   
1      5.819518 -3.426723  0.231446 -1.772583  3.016519  1.969557 -8.112472   
2     -1.308817 -0.710177  1.145592 -5.076736 -5.043330  3.143188  0.761503   
3      0.953608  1.783508  2.607021  8.360754  3.528969 -2.856033 -0.230467   
4      1.016778  7.020466 -0.769971  4.953099  4.608775  1.805922 -1.066066   
...         ...       ...       ...       ...       ...       ...       ...   
99995 -1.224525 -1.856533 -6.802889  1.669026 -0.323434  4.354281 -8.008714   
99996  0.181690 -1.104154  0.308954  1.008115  3.319840 -3.337906 -3.596922   
99997 -2.510536  0.081679 -1.114450  1.254121 -0.613860  5.408797  1.239509   
99998  6.047208  1.822928  0.821667  1.336501 -0.414399  2.476832 -3.575018   
99999 -3.696475  0.473278 -2.413074 -0.834047  5.846010 -3.142128  3.627409   

          

In [13]:
# Sanity check: a point on the decision boundary should get probability ~0.5
# for each of the two classes.
preds = model.predict_proba(boundary_points)
print(np.mean(preds, axis=0))
# The column means alone can hide off-boundary points (deviations on opposite
# sides of the boundary cancel out), so also report the worst single-point
# deviation from 0.5.
print(np.abs(np.asarray(preds) - 0.5).max())

[0.49999991 0.50000009]


# 100-dimensions (Make a dataset of 100 features with 10,000 samples)

We generate this dataset with sklearn's `make_classification` function, fit a logistic regression model to it, and then generate the decision boundary points.

In [14]:
# Synthetic binary classification data: 10,000 samples, 100 features (all informative).
X, y = make_classification(
    n_samples=10000,
    n_features=100,
    n_informative=100,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)

# Append the labels to the features as a final column, wrap everything in one
# DataFrame, then split it back into the training inputs and the target.
y = y.reshape(-1, 1)
df1 = pd.DataFrame(data=np.hstack((X, y)))
x_train, y_train = df1.iloc[:, :-1], df1.iloc[:, -1]

# Fit the classifier whose decision boundary is searched below.
model = LogisticRegression()
model.fit(x_train, y_train)

# threshold and epsilon are passed straight to the project helper; their exact
# meaning is defined there (not visible in this notebook).
boundary_points = multi_decision_boundary(
    model, x_train, y_train, threshold=100000, epsilon=1e-4
)

# Print the decision boundary points and their shape
print("Decision Boundary Points (All Features):")
print(boundary_points, boundary_points.shape, sep="\n")

Decision Boundary Points (All Features):
              0         1         2         3         4         5         6   \
0      -2.123425 -3.573830  4.059642  1.231292 -0.922513  6.103835 -4.131007   
1       2.898384  4.559818 -7.956736 -0.979956 -2.113784  5.851543 -2.546975   
2       4.721329 -0.110252 -4.304770 -0.402017 -6.764269  4.817211 -0.389918   
3       4.799491 -0.556895  4.761027 -3.441070 -8.988543 -2.150269  7.718331   
4      10.248486  4.021094 -0.673259  4.370290  0.825422 -4.537100  5.021605   
...          ...       ...       ...       ...       ...       ...       ...   
99995  -2.641115 -5.706880  1.641024 -3.685080  4.990607  2.595660 -6.330964   
99996  -4.430069  5.638364 -2.842166  3.295114  7.099627 -2.887428 -4.161907   
99997  -4.870791 -2.612838 -8.129071 -0.954027  6.604742  1.556592 -1.862930   
99998   4.049109  3.838568 -5.182991 -5.563124  3.632545 -3.617184 -6.762722   
99999  -8.501031 -3.290897 -3.478756 -8.968627 -0.952077 -1.141752  6.249120   

In [15]:
# Sanity check: a point on the decision boundary should get probability ~0.5
# for each of the two classes.
preds = model.predict_proba(boundary_points)
print(np.mean(preds, axis=0))
# The column means alone can hide off-boundary points (deviations on opposite
# sides of the boundary cancel out), so also report the worst single-point
# deviation from 0.5.
print(np.abs(np.asarray(preds) - 0.5).max())

[0.5 0.5]


# 1,000-dimensions (Make a dataset of 1,000 features with 10,000 samples)

We generate this dataset with sklearn's `make_classification` function, fit a logistic regression model to it, and then generate the decision boundary points.

In [16]:
# Synthetic binary classification data: 10,000 samples, 1,000 features (all informative).
X, y = make_classification(
    n_samples=10000,
    n_features=1000,
    n_informative=1000,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)

# Append the labels to the features as a final column, wrap everything in one
# DataFrame, then split it back into the training inputs and the target.
y = y.reshape(-1, 1)
df1 = pd.DataFrame(data=np.hstack((X, y)))
x_train, y_train = df1.iloc[:, :-1], df1.iloc[:, -1]

# Fit the classifier whose decision boundary is searched below.
model = LogisticRegression()
model.fit(x_train, y_train)

# threshold and epsilon are passed straight to the project helper; their exact
# meaning is defined there (not visible in this notebook).
boundary_points = multi_decision_boundary(
    model, x_train, y_train, threshold=100000, epsilon=1e-4
)

# Print the decision boundary points and their shape
print("Decision Boundary Points (All Features):")
print(boundary_points, boundary_points.shape, sep="\n")

Decision Boundary Points (All Features):
             0          1          2          3          4          5    \
0     -13.045026   8.243744  14.586948 -20.198750  -2.438791  11.864213   
1      21.337388  -5.768303  15.879739 -31.699827  38.359384  -4.350691   
2      -5.094464  -7.429413   4.198130  -6.938273  22.462336  -2.772532   
3      -5.115606 -22.910134  -8.727914 -21.588171   3.792181  20.597647   
4     -43.272824 -27.929853  10.235323  -5.115085  14.113220  10.691403   
...          ...        ...        ...        ...        ...        ...   
99995   4.107162   9.464778   5.374426  -8.303174   0.941675   4.191754   
99996  -6.844027  -4.858039  -0.431992 -13.887410  -7.091641   2.337031   
99997  -8.762266 -19.647824   7.689708  -1.916639   3.655792   5.927405   
99998  -2.480628   8.506003 -21.134646 -11.966840 -23.134685  40.350068   
99999  22.695474 -17.619865 -19.211069   4.610801  15.226567  -0.138220   

             6          7          8          9    ...    

In [17]:
# Sanity check: a point on the decision boundary should get probability ~0.5
# for each of the two classes.
preds = model.predict_proba(boundary_points)
print(np.mean(preds, axis=0))
# The column means alone can hide off-boundary points (deviations on opposite
# sides of the boundary cancel out), so also report the worst single-point
# deviation from 0.5.
print(np.abs(np.asarray(preds) - 0.5).max())

[0.49999995 0.50000005]


# 10,000-dimensions (Make a dataset of 10,000 features with 10,000 samples)

We generate this dataset with sklearn's `make_classification` function, fit a logistic regression model to it, and then generate the decision boundary points.

In [None]:
# Synthetic binary classification data: 10,000 samples, 10,000 features (all informative).
# NOTE(review): this cell has no recorded execution in the notebook. A dense
# 10,000 x 10,000 float matrix plus the hstack copy below is large, and the
# helper is asked for threshold=100000 -- confirm this fits in memory before running.
X, y = make_classification(
    n_samples=10000,
    n_features=10000,
    n_informative=10000,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)

# Append the labels to the features as a final column, wrap everything in one
# DataFrame, then split it back into the training inputs and the target.
y = y.reshape(-1, 1)
df1 = pd.DataFrame(data=np.hstack((X, y)))
x_train, y_train = df1.iloc[:, :-1], df1.iloc[:, -1]

# Fit the classifier whose decision boundary is searched below.
model = LogisticRegression()
model.fit(x_train, y_train)

# threshold and epsilon are passed straight to the project helper; their exact
# meaning is defined there (not visible in this notebook).
boundary_points = multi_decision_boundary(
    model, x_train, y_train, threshold=100000, epsilon=1e-4
)

# Print the decision boundary points and their shape
print("Decision Boundary Points (All Features):")
print(boundary_points, boundary_points.shape, sep="\n")

In [None]:
# Sanity check: a point on the decision boundary should get probability ~0.5
# for each of the two classes.
preds = model.predict_proba(boundary_points)
print(np.mean(preds, axis=0))
# The column means alone can hide off-boundary points (deviations on opposite
# sides of the boundary cancel out), so also report the worst single-point
# deviation from 0.5.
print(np.abs(np.asarray(preds) - 0.5).max())