## Feature effect with piecewise-linear parts of different size

#### Goal
* Show that if the feature effect contains both high- and low-resolution parts, fixed-bin-size approaches fail.
  - If we choose a big K (= number of bins) => small bin width, then the bin effect is very noisy (interpolation is erroneous)
  - If we choose a small K => large bin width, then we lose all high-resolution artifacts
* Show that the variable-bin-size approach can create the bins correctly



In [1]:
import sys
import os
sys.path.insert(0, os.path.abspath('..'))
import matplotlib.pyplot as plt
import ipywidgets as widgets
plt.rcParams["figure.figsize"] = (10,6)

In [2]:
import numpy as np
import examples.example_utils as utils
import feature_effect as fe

In [3]:
def create_model_params():
    """Return the parameters of the three-segment piecewise-linear model.

    Each segment is a dict with keys ``"b"``, ``"from"``, ``"to"`` and ``"a"``.
    The first segment gets ``a = -25.``; every following segment gets the
    previous segment's ``a`` plus ``(to - from) * b`` of the previous segment.
    Presumably ``b`` is the slope and ``a`` the value at the segment start, so
    the chaining keeps the function continuous -- confirm against
    ``utils.create_model``.
    """
    segments = [
        {"b": 10, "from": 0., "to": 5.},
        {"b": -10, "from": 5., "to": 10.},
        {"b": 0., "from": 10., "to": 100},
    ]

    # Carry the running offset forward from one segment to the next.
    offset = -25.
    for segment in segments:
        segment["a"] = offset
        offset = offset + (segment["to"] - segment["from"]) * segment["b"]

    return segments


def generate_samples(N):
    """Draw 1-D samples covering the three segments of the model.

    ``int(N / 3)`` points are drawn uniformly from each of [0, 5), [5, 10)
    and [10, 99) using NumPy's global RNG, and the fixed points ``0.0`` and
    ``100 - 1e-05`` are added at the two ends.

    Returns an array of shape ``(3 * int(N / 3) + 2, 1)``.
    """
    eps = 1e-05
    per_segment = int(N / 3)
    intervals = ((0., 5), (5, 10), (10, 99))

    # Draw in interval order so the global RNG stream is consumed consistently.
    draws = [np.random.uniform(low, high, size=per_segment) for low, high in intervals]
    pieces = [np.array([0.0])] + draws + [np.array([100 - eps])]
    return np.concatenate(pieces)[:, np.newaxis]


In [4]:
# Experiment configuration.
# N is passed to generate_samples(); int(N / 3) points are drawn per segment.
N = 5000
# Passed to utils.create_noisy_jacobian() as the noise level.
noise_level = 3.
# Passed as first argument to utils.count_loss_mse() for the fixed-size run
# (presumably the largest number of bins tried -- confirm in example_utils).
K_max_fixed = 50
# Same, for the variable-size run.
K_max_var = 50
# Passed to utils.count_loss_mse() for both runs.
min_points_per_bin = 10

In [5]:
# Seed NumPy's global RNG before any sampling so the data is reproducible.
seed = 4834545
np.random.seed(seed)

# Build the model and its noisy Jacobian from the piecewise-linear parameters.
# NOTE(review): the same seed is also handed to create_noisy_jacobian --
# presumably it seeds the Jacobian noise; confirm in example_utils.
model_params = create_model_params()
model = utils.create_model(model_params)
model_jac = utils.create_noisy_jacobian(model_params, noise_level, seed)

# Generate the input samples, then evaluate the model and the noisy Jacobian on them.
data = generate_samples(N=N)
y = model(data)
data_effect = model_jac(data)

# DALE fixed-size interactive

In [6]:
# DALE object for the generated data; it is refit on every slider change.
dale = fe.DALE(data=data, model=model, model_jac=model_jac)
def plot(bins):
    """Refit DALE on feature 0 with ``bins`` bins and plot the result.

    ``bins`` is forwarded to ``dale.fit`` as ``alg_params["nof_bins"]``; the
    model is handed to ``dale.plot`` as ``gt``.
    """
    dale.fit(features=[0], alg_params={"nof_bins": bins})
    dale.plot(s=0, block=False, gt=model)
    plt.show()

# Integer slider for `bins` from 1 to 100 in steps of 1, wired to plot().
dale_plot = widgets.interactive(plot, bins=(1, 100, 1))
# The last child of the interactive container is its Output widget;
# give it a fixed height of 400px.
dale_output = dale_plot.children[-1]
dale_output.layout.height = '400px'
dale_plot

interactive(children=(IntSlider(value=50, description='bins', min=1), Output(layout=Layout(height='400px'))), …

# DALE variable-size interactive

In [7]:
# DALE object for the generated data; it is refit on every slider change.
dale = fe.DALE(data=data, model=model, model_jac=model_jac)
# Reference bins built from the model parameters (keys "limits" and "height" are read below).
gt_bins = utils.create_gt_bins(model_params)


def _plot_bin_bars(limits, heights, edgecolor, label):
    """Draw one bar per bin, spanning ``limits[i]`` to ``limits[i + 1]``."""
    edges = list(zip(limits[:-1], limits[1:]))
    centers = [low + (high - low) / 2 for low, high in edges]
    widths = [high - low for low, high in edges]
    plt.bar(x=centers, height=heights, width=widths,
            color=(0.1, 0.1, 0.1, 0.1), edgecolor=edgecolor, label=label)


def plot(bins):
    """Refit DALE with variable-size bins and compare it to the reference bins.

    ``bins`` is forwarded to ``dale.fit`` as ``alg_params["max_nof_bins"]``.
    Two figures are produced: the estimated bin effects overlaid on the
    reference bins, followed by the regular ``dale.plot`` output.
    """
    dale.fit(features=[0], alg_params={"max_nof_bins": bins}, method="variable-size")

    estimate = dale.dale_params["feature_0"]
    _plot_bin_bars(estimate["limits"], estimate["bin_effect"], "blue", "bin estimation")
    _plot_bin_bars(gt_bins["limits"], gt_bins["height"], "red", "bin gt")

    plt.title("Bin effect: estimation vs gt")
    plt.xlabel("feature 0")
    plt.ylabel("bin effect")
    plt.legend()
    plt.show()

    dale.plot(s=0, block=False, gt=model)
    # Show explicitly, as the fixed-size cell does, so the figure is flushed
    # into the widget output on every slider change.
    plt.show()


# Integer slider for `bins` from 1 to 66 in steps of 1, wired to plot().
dale_plot = widgets.interactive(plot, bins=(1, 66, 1))
# The last child of the interactive container is its Output widget;
# give it a fixed height large enough for both figures.
dale_output = dale_plot.children[-1]
dale_output.layout.height = '1200px'
dale_plot

interactive(children=(IntSlider(value=33, description='bins', max=66, min=1), Output(layout=Layout(height='120…

In [8]:
# Run the loss/MSE helper once per binning method. Each call is unpacked into
# the list of K values, the MSE values, the loss values and the DALE objects
# (as named by the targets; presumably one entry per K -- confirm in example_utils).
# NOTE(review): the recorded output of this cell is
#   AttributeError: module 'examples.example_utils' has no attribute 'count_loss_mse'
# so the helper is missing or renamed in example_utils, and every later cell
# depends on the names assigned here.
# NOTE(review): `model` is passed as both the 2nd and the 4th positional
# argument -- confirm against the helper's signature.
k_list_fixed, mse_fixed, loss_fixed, dale_fixed = utils.count_loss_mse(K_max_fixed, model, data, model, model_jac,
                                                                       min_points_per_bin, method="fixed-size")
k_list_var, mse_var, loss_var, dale_var = utils.count_loss_mse(K_max_var, model, data, model, model_jac,
                                                               min_points_per_bin, method="variable-size")

AttributeError: module 'examples.example_utils' has no attribute 'count_loss_mse'

In [None]:
# Plot the loss against K, one figure per binning method.
for _ks, _losses, _fmt, _label in (
    (k_list_fixed, loss_fixed, "bo-", "fixed size"),
    (k_list_var, loss_var, "ro-", "variable size"),
):
    plt.figure()
    plt.title("Loss vs K")
    plt.plot(_ks, _losses, _fmt, label=_label)
    plt.legend()
    plt.show(block=False)


In [None]:
# Plot the fixed-size solution with the lowest loss (nanargmin skips NaN losses).
best_fixed = np.nanargmin(loss_fixed)
# The reference bins are built from model_params, the parameter list defined
# by create_model_params(); the previously used name `f_params` is not
# defined anywhere in the notebook and raised NameError.
dale_fixed[best_fixed].plot(s=0,
                            gt=model,
                            gt_bins=utils.create_gt_bins(model_params),
                            block=False)


In [None]:
# Plot the variable-size solution with the lowest loss (nanargmin skips NaN losses).
best_var = np.nanargmin(loss_var)
# The reference bins are built from model_params, the parameter list defined
# by create_model_params(); the previously used name `f_params` is not
# defined anywhere in the notebook and raised NameError.
dale_var[best_var].plot(s=0,
                        gt=model,
                        gt_bins=utils.create_gt_bins(model_params),
                        block=False)


# DALE fixed-size: best solution based on auto algorithm

In [None]:
# Report the fixed-size solution with the lowest loss (nanargmin skips NaN losses).
best_fixed = np.nanargmin(loss_fixed)
best_fixed_params = dale_fixed[best_fixed].dale_params["feature_0"]

print("===========================================")
# Read K from k_list_fixed rather than assuming K == index + 1; the loop
# below pairs k_list_fixed[i] with loss_fixed[i] in the same way.
print("Best K is: %2d, with loss=%.3f" % (k_list_fixed[best_fixed], loss_fixed[best_fixed]))

print("===========================================")
print("Dx is: %.3f" %(best_fixed_params["dx"]))

print("===========================================")
print("Effect per bin is:")
print(best_fixed_params["bin_effect"])

print("===========================================")
print("Loss per K:")
for k, loss in zip(k_list_fixed, loss_fixed):
    print("Fixed-size k= %d, loss=%.3f" % (k, loss))

# DALE variable-size: best solution based on auto algorithm

In [None]:
# Report the variable-size solution with the lowest loss (nanargmin skips NaN losses).
best_var = np.nanargmin(loss_var)
best_var_params = dale_var[best_var].dale_params["feature_0"]

print("===========================================")
# Read K from k_list_var rather than assuming K == index + 1; the loop
# below pairs k_list_var[i] with loss_var[i] in the same way.
print("Best K is: %2d, with loss=%.3f" % (k_list_var[best_var], loss_var[best_var]))

print("===========================================")
print("Bin limits are:")
print(best_var_params["limits"])

print("===========================================")
print("Effect per bin is:")
print(best_var_params["bin_effect"])

print("===========================================")
print("Loss per K:")
for k, loss in zip(k_list_var, loss_var):
    # These are variable-size results; the label previously said "Fixed-size".
    print("Variable-size k= %d, loss=%.3f" % (k, loss))