This tutorial measures the runtime of Global Effect plots, which is mainly influenced by the following factors:

- $t_f$: The runtime of the underlying black-box function, i.e., how fast it is to execute $f$ on a batch of data (or, possibly, on the whole dataset).
- $N$: The number of instances.
- $D$: The number of features.
- $T$: The number of points used for centering the feature effect plot.

In this tutorial, we'll focus on $t_f$, $N$, and $D$ and measure how each one influences the runtime of the global effect methods (ALE, RHALE, PDP and, in the first experiment, d-PDP).

In [None]:
import effector
import numpy as np
import timeit
import time
import matplotlib.pyplot as plt
np.random.seed(21)

In [None]:
def return_predict(t):
    """Build a prediction function with an artificial latency of ``t`` seconds.

    Args:
        t: Seconds to sleep on every call, emulating the cost of one
            evaluation of the black-box function.

    Returns:
        A callable ``predict(x)`` that sleeps ``t`` seconds and then returns
        the prediction of a newly created
        ``effector.models.DoubleConditionalInteraction`` on ``x``.
    """
    def predict(x):
        # Emulate the evaluation cost of the black-box model.
        time.sleep(t)
        return effector.models.DoubleConditionalInteraction().predict(x)

    return predict

def return_jacobian(t):
    """Build a Jacobian function with an artificial latency of ``t`` seconds.

    Args:
        t: Seconds to sleep on every call, emulating the cost of one
            evaluation of the black-box Jacobian.

    Returns:
        A callable ``jacobian(x)`` that sleeps ``t`` seconds and then returns
        the Jacobian of a newly created
        ``effector.models.DoubleConditionalInteraction`` at ``x``.
    """
    def jacobian(x):
        # Emulate the evaluation cost of the black-box Jacobian.
        time.sleep(t)
        return effector.models.DoubleConditionalInteraction().jacobian(x)

    return jacobian

In [None]:
def measure_time(method_name, repetitions):
    """Return the mean wall-clock time (sec) of one global-effect computation.

    Every repetition draws a new dataset ``X`` uniformly from [-1, 1] with
    shape ``(N, D)`` and then times three steps together: constructing the
    explainer, fitting feature 0, and evaluating it on ``T`` equally spaced
    points in [-1, 1]. Data generation is not part of the measurement.

    ``N``, ``D``, ``T``, ``model`` and ``model_jac`` are read from the
    notebook's global namespace, so they must be defined before the call.

    Args:
        method_name: One of ``"pdp_vectorized"``, ``"d_pdp_vectorized"``,
            ``"ale"`` or ``"rhale"``.
        repetitions: Number of timed runs to average over; should be >= 1.

    Returns:
        The mean elapsed time in seconds over all repetitions.

    Raises:
        ValueError: If ``method_name`` is not one of the supported names.
    """
    supported = ("pdp_vectorized", "d_pdp_vectorized", "ale", "rhale")
    if method_name not in supported:
        # An unknown name used to skip every branch and silently report a
        # near-zero runtime; fail loudly instead.
        raise ValueError(
            f"Unknown method_name {method_name!r}; expected one of {supported}"
        )

    time_list = []
    for _ in range(repetitions):
        X = np.random.uniform(-1, 1, (N, D))
        xx = np.linspace(-1, 1, T)

        # Every feature lives in [-1, 1]: row 0 holds the lower limits,
        # row 1 the upper limits.
        axis_limits = np.zeros((2, D))
        axis_limits[0, :] = -1
        axis_limits[1, :] = 1

        # perf_counter is monotonic and high-resolution, which makes it the
        # appropriate clock for measuring intervals.
        start_time = time.perf_counter()
        if method_name == "pdp_vectorized":
            tic = time.perf_counter()
            pdp = effector.PDP(data=X, model=model, axis_limits=axis_limits)
            toc = time.perf_counter()
            print(f"Time for init: {toc - tic}")
            tic = time.perf_counter()
            pdp.fit(0, centering=True, points_for_centering=T, use_vectorized=True)
            toc = time.perf_counter()
            print(f"Time for fit: {toc - tic}")
            tic = time.perf_counter()
            pdp.eval(feature=0, xs=xx, centering=True, heterogeneity=True)
            toc = time.perf_counter()
            print(f"Time for eval: {toc - tic}")
        elif method_name == "d_pdp_vectorized":
            d_pdp = effector.DerPDP(data=X, model=model, model_jac=model_jac, axis_limits=axis_limits)
            d_pdp.fit(0, centering=True, points_for_centering=T, use_vectorized=True)
            d_pdp.eval(feature=0, xs=xx, centering=True, heterogeneity=True)
        elif method_name == "ale":
            tic = time.perf_counter()
            ale = effector.ALE(data=X, model=model, axis_limits=axis_limits)
            toc = time.perf_counter()
            print(f"Time for init: {toc - tic}")
            tic = time.perf_counter()
            ale.fit(0, centering=True, points_for_centering=T)
            toc = time.perf_counter()
            print(f"Time for fit: {toc - tic}")
            ale.eval(feature=0, xs=xx, centering=True, heterogeneity=True)
        else:  # "rhale" (guaranteed by the validation above)
            rhale = effector.RHALE(data=X, model=model, model_jac=model_jac, axis_limits=axis_limits)
            binning_method = effector.binning_methods.Fixed(nof_bins=20)
            rhale.fit(0, centering=True, points_for_centering=T, binning_method=binning_method)
            rhale.eval(feature=0, xs=xx, centering=True, heterogeneity=True)
        stop_time = time.perf_counter()

        time_list.append(stop_time - start_time)
    return np.mean(time_list)

## Global effect time (sec) vs $t_f$--time for a single evaluation of f (sec)

In [None]:
# Fixed settings for the t_f experiment.
N = 100_000  # number of instances
D = 30  # number of features
T = 100  # number of centering points (also the number of evaluation points)
t = 0.001  # placeholder only: the sweep in the next cell rebinds t each iteration

In [None]:
# Sweep the simulated latency of f (in seconds); N, D and T stay fixed.
vec = [0.1, 0.5]
time_dict = {name: [] for name in ("ale", "rhale", "pdp", "d_pdp")}
for t in vec:
    print("t:", t)
    # measure_time reads `model` and `model_jac` from the global namespace.
    model = return_predict(t)
    model_jac = return_jacobian(t)
    # (result key, method name passed to measure_time), in execution order.
    for key, method in (
        ("ale", "ale"),
        ("rhale", "rhale"),
        ("pdp", "pdp_vectorized"),
        ("d_pdp", "d_pdp_vectorized"),
    ):
        time_dict[key].append(measure_time(method, 3))

In [None]:
import matplotlib.pyplot as plt

# One curve per method: mean runtime against the simulated latency of f.
plt.figure()
for key, legend_label in (
    ("ale", "ALE"),
    ("rhale", "RHALE"),
    ("pdp", "PDP"),
    ("d_pdp", "d-PDP"),
):
    plt.plot(vec, time_dict[key], "x--", label=legend_label)
plt.title("Execution time: FE method vs. f execution")
plt.xlabel("f(x) execution: (sec)")
plt.ylabel("Global Effect method execution (sec)")
plt.legend()
plt.show()

## Global effect time vs N (nof instances)

In [None]:
# Fixed settings for the N experiment.
t = 0.001  # simulated latency (sec) of one call to f
D = 10  # number of features
T = 100  # number of centering points (also the number of evaluation points)
N = 10_000  # placeholder only: the sweep in the next cell rebinds N each iteration

In [None]:
# Sweep the number of instances; t, D and T stay fixed.
vec = [1_000, 10_000, 50_000, 100_000, 1_000_000]
time_dict = {name: [] for name in ("ale", "rhale", "pdp_vectorized")}
for N in vec:
    # measure_time reads `N`, `model` and `model_jac` from the global namespace.
    model = return_predict(t)
    model_jac = return_jacobian(t)
    # (progress banner, method name), in execution order.
    for banner, method in (
        ("ALE", "ale"),
        ("RHALE", "rhale"),
        ("PDP", "pdp_vectorized"),
    ):
        print(banner)
        time_dict[method].append(measure_time(method, 3))

In [None]:
# One curve per method: mean runtime against the number of instances.
plt.figure()
for key, legend_label in (
    ("ale", "ALE"),
    ("rhale", "RHALE"),
    ("pdp_vectorized", "pdp (vectorized)"),
):
    plt.plot(vec, time_dict[key], "x--", label=legend_label)
plt.title("Execution time vs nof instances (N)")
plt.xlabel("number of instances (N)")
plt.ylabel("time (s) for FE execution")
# The swept values span several orders of magnitude, hence the log axis.
plt.xscale("log")
plt.legend()
plt.show()

## Global effect time vs D (nof features)

In [None]:
# Fixed settings for the D experiment; D itself is not set here because the
# sweep in the next cell binds it on every iteration.
t = 0.001  # simulated latency (sec) of one call to f
T = 100  # number of centering points (also the number of evaluation points)
N = 10_000  # number of instances

In [None]:
# Sweep the number of features; t, N and T stay fixed.
vec = [5, 10, 50, 100, 200, 300]
time_dict = {name: [] for name in ("ale", "rhale", "pdp_vectorized")}
for D in vec:
    # measure_time reads `D`, `model` and `model_jac` from the global namespace.
    model = return_predict(t)
    model_jac = return_jacobian(t)
    # (progress banner, method name), in execution order.
    for banner, method in (
        ("ALE", "ale"),
        ("RHALE", "rhale"),
        ("PDP", "pdp_vectorized"),
    ):
        print(banner)
        time_dict[method].append(measure_time(method, 3))

In [None]:
# One curve per method: mean runtime against the number of features.
plt.figure()
for key, legend_label in (
    ("ale", "ALE"),
    ("rhale", "RHALE"),
    ("pdp_vectorized", "pdp (vectorized)"),
):
    plt.plot(vec, time_dict[key], "x--", label=legend_label)
plt.title("Execution time vs nof features (D)")
plt.xlabel("number of features (D)")
plt.ylabel("time (s) for FE execution")
# The x-axis stays linear for this sweep.
plt.legend()
plt.show()

## Conclusion

In practice, the vectorized version outperforms the non-vectorized version in all cases:

- The runtime of the black-box model is the key factor. As the model size increases, the non-vectorized implementation shows a linear increase in runtime, while the vectorized implementation remains unaffected.
- The number of instances does not significantly impact the runtime for either version, as long as $y$ can be obtained in a single pass of $f(x)$.
- Both versions scale linearly with the number of features, but the vectorized version is consistently faster by a constant margin.